OK, so I tried a number of things:
- I tried rewriting the loop with CartesianIndices so that there wouldn’t be any broadcasting:
function calculation1!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    #FFT (forward, in-place plan)
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in CartesianIndices(Qa)
        m = l[1];
        n = l[2];   # n is not actually used below
        for j in 1:length(wj)
            # sum the contribution of each frequency mode j into this element
            Qa[l] = Qa[l] + 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*abs2(OP[l]);
            Qb[l] = Qb[l] + 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*conj(OP[l])*OP[l];
            Qc[l] = Qc[l] + 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*conj(OP[l])*OP[l];
        end
    end
    # inverse FFT
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
Benchmarking gave me:
@btime calculation1!(plan_thread, Q0, Q1, Q2, Operator, fr, wj,dwj, gj );
3.122 s (609 allocations: 71.98 KiB)
So this was actually worse than the broadcast version.
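For reference, the kind of broadcast update this loop replaces (between the forward and inverse FFTs) looks something like the sketch below; gain is just an illustrative name, and this is a rough sketch rather than the exact code behind the earlier timings:
# sketch of a broadcast-style update, one fused broadcast per frequency mode j
for j in eachindex(wj)
    gain = @. 0.5*wj[j]*dwj[j]*gj[j]/(-f^2 - 1.0im*f*dwj[j] + wj[j]^2);   # length-size(Qa,1) vector
    @. Qa += gain*abs2(OP);
    @. Qb += gain*conj(OP)*OP;
    @. Qc += gain*conj(OP)*OP;
end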
- Next I realized that the frequency factor (built from wj, f, dwj, and gj) is the same for all three matrices, so it can be computed once per element and mode rather than once for each matrix:
function calculation2!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    #FFT
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in CartesianIndices(Qa)
        m = l[1];
        n = l[2];
        for j in 1:length(wj)
            gainop = 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2);
            Qa[l] = Qa[l] + gainop*abs2(OP[l]);
            Qb[l] = Qb[l] + gainop*conj(OP[l])*OP[l];
            Qc[l] = Qc[l] + gainop*conj(OP[l])*OP[l];
        end
    end
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
With benchmarking of:
@btime calculation2!(plan_thread, Q0, Q1, Q2, Operator, fr, wj,dwj, gj );
2.023 s (204 allocations: 19.98 KiB)
This definitely improved things, and the code looks much cleaner than the broadcast version.
- Next, rather than use CartesianIndices (which I read somewhere a long while ago is slow?), I used the loop configuration that @stevengj recommended:
function calculation3!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    #FFT
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in 1:size(Qa,2)
        for i in 1:size(Qa,1), j in 1:length(wj)
            gainop = 0.5*wj[j]*dwj[j]*gj[j]/(-f[i]^2 - 1.0im*f[i]*dwj[j] + wj[j]^2);
            Qa[i,l] = Qa[i,l] + gainop*abs2(OP[i,l]);
            Qb[i,l] = Qb[i,l] + gainop*conj(OP[i,l])*OP[i,l];
            Qc[i,l] = Qc[i,l] + gainop*conj(OP[i,l])*OP[i,l];
        end
    end
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
With benchmarking:
@btime calculation3!(plan_thread, Q0, Q1, Q2, Operator, fr, wj,dwj, gj );
1.995 s (206 allocations: 20.05 KiB)
This improved it just a tad. It makes me wonder what the difference is between CartesianIndices and writing out the loop indices explicitly.
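To separate that question from the FFTs, the three iteration styles can be timed on a dummy elementwise update (a minimal sketch; the walk_* names and the 2048x2048 test arrays are made up for illustration):
using BenchmarkTools

# the same elementwise update written three ways; all three walk the arrays in column-major order
function walk_cartesian!(A, B)
    for l in CartesianIndices(A)
        A[l] += conj(B[l])*B[l];
    end
end
function walk_eachindex!(A, B)
    for l in eachindex(A, B)
        A[l] += conj(B[l])*B[l];
    end
end
function walk_explicit!(A, B)
    for n in 1:size(A,2), m in 1:size(A,1)
        A[m,n] += conj(B[m,n])*B[m,n];
    end
end

A = rand(ComplexF64, 2048, 2048); B = rand(ComplexF64, 2048, 2048);
@btime walk_cartesian!($A, $B);
@btime walk_eachindex!($A, $B);
@btime walk_explicit!($A, $B);
In simple cases like this the three usually come out very close, since for a Matrix all of them reduce to the same column-major traversal.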
- Next, because the frequency operator (which I call gOperator) was constant for all three matrices and throughout the whole computation, I realized I could precompute it once as a matrix operator:
wj = rand(10);
dwj = rand(10);
gj = rand(10);
gop = zeros(ComplexF64, size(Q0));
function goperator!(gOP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    @threads for l in 1:size(gOP,2)
        for m in 1:size(gOP,1), j in 1:length(wj)
            # accumulate the contribution of every mode j (gOP starts from zeros);
            # with plain `=` only the last mode would survive
            gOP[m,l] += 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2);
        end
    end
end
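The call that actually fills gop is not shown above; assuming fr is the same frequency vector passed to the earlier functions, it would be something like:
goperator!(gop, fr, wj, dwj, gj);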
With this I made a new calculation function:
function calculation4!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    #FFT
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in eachindex(Qa)
        Qa[l] = Qa[l] + gOP[l]*abs2(In[l]);
        Qb[l] = Qb[l] + gOP[l]*conj(In[l])*In[l];
        Qc[l] = Qc[l] + gOP[l]*conj(In[l])*In[l];
    end
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
With benchmark:
@btime calculation4!(plan_thread, Intensity, Q0, Q1, Q2, gop);
1.268 s (214 allocations: 19.23 KiB)
This made it significantly faster. If things were actually changing in the gOperator between calls this would not be possible, but for my problem it works.
NOTE: here I used the eachindex() function to loop.
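As a quick consistency check (a sketch: the Qa3/Qa4/... names are throwaway copies, Operator is reused as the In argument, and the in-place plan is assumed to be applicable to the copied arrays), calculation4! with the precomputed gop should match calculation3!, which rebuilds the factor on the fly:
Qa3, Qb3, Qc3 = copy(Q0), copy(Q1), copy(Q2);
Qa4, Qb4, Qc4 = copy(Q0), copy(Q1), copy(Q2);
calculation3!(plan_thread, Qa3, Qb3, Qc3, Operator, fr, wj, dwj, gj);
calculation4!(plan_thread, Operator, Qa4, Qb4, Qc4, gop);
@assert Qa3 ≈ Qa4 && Qb3 ≈ Qb4 && Qc3 ≈ Qc4   # gop is just the j-sum pulled out of the loop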
- Next, rather than using eachindex() to loop, I used an explicit for loop (note that this version already operates on the 3D array Q3D and the plan plan_thread3D defined in the next step):
function calculation5!(plan, In::Matrix{ComplexF64}, Q::Array{ComplexF64, 3}, gOP::Matrix{ComplexF64})
    #FFT
    plan*Q;
    @threads for l in 1:size(Q,2)
        for m in 1:size(Q,1)
            Q[m,l,1] = Q[m,l,1] + gOP[m,l]*abs2(In[m,l]);
            Q[m,l,2] = Q[m,l,2] + gOP[m,l]*conj(In[m,l])*In[m,l];
            Q[m,l,3] = Q[m,l,3] + gOP[m,l]*conj(In[m,l])*In[m,l];
        end
    end
    plan\Q;
end
@btime calculation5!(plan_thread3D, Intensity,Q3D, gop);
1.147 s (680 allocations: 56.45 KiB)
This was again faster than using CartesianIndices or eachindex.
- Next, I combined my Q matrices into a 3D array of 2D matrices (sz,sz,3).
NOTE: For 6 and 7 I had to return to using OpenBLAS because MKL did not want to do non-square FFT
Q3D = rand(ComplexF64, (sz,sz,3));
and formed an FFT plan along the 1st dimension:
plan_thread3D = FFTW.plan_fft!(Q3D,1,flags=FFTW.MEASURE);
function calculation6!(plan, In::Matrix{ComplexF64}, Q::Array{ComplexF64, 3}, gOP::Matrix{ComplexF64})
    #FFT
    plan*Q;
    @threads for l in 1:size(Q,3)
        for m in 1:size(Q,2)
            Q[1,m,l] = Q[1,m,l] + gOP[m,l]*abs2(In[m,l]);
            Q[2,m,l] = Q[2,m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
            Q[3,m,l] = Q[3,m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
        end
    end
    plan\Q;
end
@btime calculation6!(plan_thread3D, Intensity,Q3D, gop);
1.352 s (678 allocations: 56.42 KiB)
This was slower than doing individual 1D FFTs on each of the three 2D matrices, although not by much.
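To see how much of that gap comes from the transforms themselves rather than the elementwise loop, the forward transforms can be timed in isolation (a sketch; it deliberately ignores the elementwise update and the inverse transforms):
using BenchmarkTools

@btime $plan_thread3D * $Q3D;                                           # one plan over the whole 3D stack
@btime ($plan_thread * $Q0; $plan_thread * $Q1; $plan_thread * $Q2);    # three separate 2D plans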
- I thought maybe the slowdown in 6 came from the way the matrices were stacked, so I flipped the order:
Q3Da = rand(ComplexF64, (3,sz,sz));
plan_thread3Da = FFTW.plan_fft!(Q3Da,2,flags=FFTW.MEASURE);
and, reusing calculation6!:
@btime calculation6!(plan_thread3Da, Intensity,Q3Da, gop);
1.238 s (678 allocations: 56.42 KiB)
- Last, I made a serial FFT plan to see if I could fold the FFTs into the multithreaded loop itself:
seralarray = rand(ComplexF64, sz);
serailplan = FFTW.plan_fft!(seralarray,flags=FFTW.MEASURE);
function calculation7!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    #FFT
    #plan*Q;
    @threads for l in 1:size(Qa,2)
        plan*Qa[:,l];
        plan*Qb[:,l];
        plan*Qc[:,l];
        for m in 1:size(Qa,1)
            Qa[m,l] = Qa[m,l] + gOP[m,l]*abs2(In[m,l]);
            Qb[m,l] = Qb[m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
            Qc[m,l] = Qc[m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
        end
        plan\Qa[:,l];
        plan\Qb[:,l];
        plan\Qc[:,l];
    end
    #plan\Q;
end
With the benchmark:
@btime calculation7!(serailplan, Intensity, Q0, Q1, Q2, gop);
1.504 s (10226925 allocations: 6.84 GiB)
The allocations for this were huge and it provided no time benefit: slicing with Qa[:,l] allocates a copy of each column, so the plan ends up transforming (and discarding) those copies rather than the matrices themselves.
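A sketch of how the per-column idea might be salvaged without the copies: give each thread its own preallocated column buffer and apply the serial in-place plan to that buffer. The names calculation7b! and colbufs are made up here, :static scheduling is assumed so that threadid() can safely index the buffers, and the buffers are assumed to satisfy the plan's size and alignment requirements; this is a sketch, not a benchmarked variant:
using FFTW, Base.Threads

colbufs = [zeros(ComplexF64, sz) for _ in 1:nthreads()];   # one scratch column per thread

function calculation7b!(plan1d, bufs, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    @threads :static for l in 1:size(Qa,2)
        buf = bufs[threadid()];
        for Q in (Qa, Qb, Qc)
            copyto!(buf, view(Q, :, l));           # pull the column into the scratch buffer (no slicing copy)
            plan1d*buf;                            # forward FFT in place on the buffer
            for m in eachindex(buf)
                buf[m] += gOP[m,l]*abs2(In[m,l]);  # conj(In)*In equals abs2(In), so one form is used for all three
            end
            plan1d\buf;                            # inverse FFT in place
            copyto!(view(Q, :, l), buf);           # write the transformed column back
        end
    end
end
Depending on how the serial plan was created, applying it directly to view(Qa, :, l) may also work (the strides and alignment have to match), which would avoid even the copyto! traffic.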
CONCLUSION
- There’s still more to be learned about the best way to optimize FFTs over many matrices using multithreading.
- There’s something I still don’t understand about looping with explicitly written indices versus CartesianIndices and eachindex. Additionally, I’m not sure whether broadcasting ends up doing the same thing as explicit indexing in for loops (see the sketch below); perhaps @stevengj can comment on this.
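On the broadcasting question, the calculation4!-style update can be written either way; the sketch below shows both forms (update_loop! and update_broadcast! are just illustrative names). A fused broadcast lowers to a single elementwise pass with no temporaries, much like the explicit loop, but it does not multithread on its own:
using Base.Threads

function update_loop!(Q, gOP, In)
    @threads for l in eachindex(Q, gOP, In)
        Q[l] += gOP[l]*abs2(In[l]);
    end
end

function update_broadcast!(Q, gOP, In)
    @. Q += gOP*abs2(In);    # one fused pass over the arrays, single-threaded
end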