OK, so I tried a number of things:
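For reference, the setup these benchmarks assume looks roughly like this; the actual sz and the exact plan layout were defined earlier in the thread, so treat the particular values here as placeholders:
using FFTW, BenchmarkTools
using Base.Threads

FFTW.set_num_threads(nthreads());   # threaded FFTW plans

sz = 4096;                           # placeholder size
Q0 = rand(ComplexF64, sz, sz);
Q1 = rand(ComplexF64, sz, sz);
Q2 = rand(ComplexF64, sz, sz);
Operator  = rand(ComplexF64, sz, sz);
Intensity = rand(ComplexF64, sz, sz);
fr = rand(sz);                       # frequency vector
wj = rand(10); dwj = rand(10); gj = rand(10);

# in-place 1D FFT plan along the 1st dimension of one (sz, sz) matrix
plan_thread = FFTW.plan_fft!(Q0, 1, flags=FFTW.MEASURE);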
- First, I rewrote the loop with CartesianIndices so that there wouldn't be any broadcasting:
function calculation1!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    # forward FFT (in-place plan)
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in CartesianIndices(Qa)
        m = l[1];   # row (frequency) index
        n = l[2];
        for j in 1:length(wj)
            Qa[l] = Qa[l] + 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*abs2(OP[l]);
            Qb[l] = Qb[l] + 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*conj(OP[l])*OP[l];
            Qc[l] = Qc[l] + 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*conj(OP[l])*OP[l];
        end
    end
    # inverse FFT
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
Benchmarking gave me:
@btime calculation1!(plan_thread, Q0, Q1, Q2, Operator, fr, wj, dwj, gj);
3.122 s (609 allocations: 71.98 KiB)
So this was actually worse.
- Next, I realized that the frequency factor (built from wj, f, dwj, and gj) is the same for all three matrices, so it can be computed once and reused rather than recomputed for each matrix:
function calculation2!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    #FFT
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in CartesianIndices(Qa)
        m = l[1];
        n = l[2];
        for j in 1:length(wj)
            gainop = 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2);
            Qa[l] = Qa[l] + gainop*abs2(OP[l]);
            Qb[l] = Qb[l] + gainop*conj(OP[l])*OP[l];
            Qc[l] = Qc[l] + gainop*conj(OP[l])*OP[l];
        end
    end
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
With benchmarking of:
@btime calculation2!(plan_thread, Q0, Q1, Q2, Operator, fr, wj, dwj, gj);
2.023 s (204 allocations: 19.98 KiB)
This definitely improved it, and the code looks much cleaner than broadcasting (a rough sketch of the broadcast form is shown below for comparison).
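For comparison, a broadcast version of the same update would look something like the sketch below. This is not the exact code from my earlier broadcast attempt, just the general shape of it (f broadcasts down the rows, matching f[m] in the loop version):
function calculation_bcast!(plan, Qa, Qb, Qc, OP, f, wj, dwj, gj)
    plan*Qa; plan*Qb; plan*Qc;
    for j in 1:length(wj)
        # sz-element column vector of gain factors for this j
        gain = @. 0.5*wj[j]*dwj[j]*gj[j] / (-f^2 - 1.0im*f*dwj[j] + wj[j]^2);
        @. Qa += gain * abs2(OP);
        @. Qb += gain * conj(OP) * OP;
        @. Qc += gain * conj(OP) * OP;
    end
    plan\Qa; plan\Qb; plan\Qc;
end
Functionally it is the same accumulation, but it allocates a gain vector for every j and runs single-threaded.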
- Next, rather than using CartesianIndices (which I read somewhere a long while ago might be slow?), I used the loop configuration that @stevengj recommended:
function calculation3!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    #FFT
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in 1:size(Qa,2)
        for i in 1:size(Qa,1), j in 1:length(wj)
            gainop = 0.5*wj[j]*dwj[j]*gj[j]/(-f[i]^2 - 1.0im*f[i]*dwj[j] + wj[j]^2);
            Qa[i,l] = Qa[i,l] + gainop*abs2(OP[i,l]);
            Qb[i,l] = Qb[i,l] + gainop*conj(OP[i,l])*OP[i,l];
            Qc[i,l] = Qc[i,l] + gainop*conj(OP[i,l])*OP[i,l];
        end
    end
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
With benchmarking:
@btime calculation3!(plan_thread, Q0, Q1, Q2, Operator, fr, wj, dwj, gj);
1.995 s (206 allocations: 20.05 KiB)
This improved it just a tad. It makes me wonder what the difference is between iterating over CartesianIndices and specifying the loop ranges explicitly; a small standalone comparison is sketched below.
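To poke at that question in isolation, one could time the same elementwise update written all three ways. This is just a standalone sketch (array size picked arbitrarily), not part of the calculations above:
using BenchmarkTools

A = rand(ComplexF64, 2048, 2048);
B = zeros(ComplexF64, 2048, 2048);

function update_each!(B, A)         # linear indices via eachindex
    for l in eachindex(A)
        B[l] += 2 * A[l]
    end
    return B
end

function update_cart!(B, A)         # CartesianIndices
    for l in CartesianIndices(A)
        B[l] += 2 * A[l]
    end
    return B
end

function update_expl!(B, A)         # explicit nested loops, column-major order
    for j in axes(A, 2), i in axes(A, 1)
        B[i, j] += 2 * A[i, j]
    end
    return B
end

@btime update_each!($B, $A);
@btime update_cart!($B, $A);
@btime update_expl!($B, $A);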
- Next, because the frequency operator (which I call gOperator) is not only the same for the three matrices but constant throughout the whole computation, I precomputed it once as a matrix:
wj = rand(10);
dwj = rand(10);
gj = rand(10);
gop = zeros(ComplexF64, size(Q0));

function goperator!(gOP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    @threads for l in 1:size(gOP,2)
        for m in 1:size(gOP,1), j in 1:length(wj)
            # accumulate the sum over j (gOP must start zeroed), matching the earlier loops
            gOP[m,l] += 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2);
        end
    end
end
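This gets called once up front to fill gop before any of the calculation calls (fr is the same frequency vector used in the earlier benchmarks):
goperator!(gop, fr, wj, dwj, gj);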
With this I made a new calculation function:
function calculation4!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    #FFT
    plan*Qa;
    plan*Qb;
    plan*Qc;
    @threads for l in eachindex(Qa)
        Qa[l] = Qa[l] + gOP[l]*abs2(In[l]);
        Qb[l] = Qb[l] + gOP[l]*conj(In[l])*In[l];
        Qc[l] = Qc[l] + gOP[l]*conj(In[l])*In[l];
    end
    plan\Qa;
    plan\Qb;
    plan\Qc;
end
With the benchmark:
@btime calculation4!(plan_thread, Intensity, Q0, Q1, Q2, gop);
1.268 s (214 allocations: 19.23 KiB)
This made it significantly faster. If the values in the gOperator were actually changing this wouldn't be possible, but for my problem it works.
NOTE: I used the eachindex() function here.
- Next, rather than using eachindex() to loop, I used explicit for loops (this version already works on the 3D array Q3D and the plan_thread3D plan defined under the next bullet):
function calculation5!(plan, In::Matrix{ComplexF64}, Q::Array{ComplexF64, 3}, gOP::Matrix{ComplexF64})
    #FFT
    plan*Q;
    @threads for l in 1:size(Q,2)
        for m in 1:size(Q,1)
            Q[m,l,1] = Q[m,l,1] + gOP[m,l]*abs2(In[m,l]);
            Q[m,l,2] = Q[m,l,2] + gOP[m,l]*conj(In[m,l])*In[m,l];
            Q[m,l,3] = Q[m,l,3] + gOP[m,l]*conj(In[m,l])*In[m,l];
        end
    end
    plan\Q;
end
@btime calculation5!(plan_thread3D, Intensity, Q3D, gop);
1.147 s (680 allocations: 56.45 KiB)
This was again faster than using CartesianIndices or eachindex.
- Next, I combined my Q matrices into a 3D array of 2D matrices (sz,sz,3).
NOTE: For 6 and 7 I had to switch the FFT provider back from MKL to FFTW, because MKL did not want to do the non-square FFT.
Q3D = rand(ComplexF64, (sz, sz, 3));
and formed an FFT plan along the 1st dimension:
plan_thread3D = FFTW.plan_fft!(Q3D, 1, flags=FFTW.MEASURE);
function calculation6!(plan, In::Matrix{ComplexF64}, Q::Array{ComplexF64, 3}, gOP::Matrix{ComplexF64})
    #FFT
    plan*Q;
    @threads for l in 1:size(Q,3)
        for m in 1:size(Q,2)
            Q[1,m,l] = Q[1,m,l] + gOP[m,l]*abs2(In[m,l]);
            Q[2,m,l] = Q[2,m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
            Q[3,m,l] = Q[3,m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
        end
    end
    plan\Q;
end
@btime calculation6!(plan_thread3D, Intensity, Q3D, gop);
1.352 s (678 allocations: 56.42 KiB)
This was slower than doing individual 1D FFTs on each of the three 2D matrices, although not by much. A small sanity check of the stacked transform is below.
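As a quick sanity check of my own (not part of the timings above), an FFT along the 1st dimension of the stacked array computes exactly the same transforms as doing the dim-1 FFT of each 2D slice separately:
using FFTW

A = rand(ComplexF64, 8, 8, 3);   # small test array
@assert fft(A, 1) ≈ cat(fft(A[:,:,1], 1), fft(A[:,:,2], 1), fft(A[:,:,3], 1); dims=3)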
- I thought maybe the slowdown in 6 came from the way the matrices were stacked, so I flipped the order:
Q3Da = rand(ComplexF64, (3, sz, sz));
plan_thread3Da = FFTW.plan_fft!(Q3Da, 2, flags=FFTW.MEASURE);
and using calculation6!
@btime calculation6!(plan_thread3Da, Intensity, Q3Da, gop);
1.238 s (678 allocations: 56.42 KiB)
- Last, I made a serial 1D FFT plan to see if I could incorporate it into the multithreaded loop:
serialarray = rand(ComplexF64, sz);
serialplan = FFTW.plan_fft!(serialarray, flags=FFTW.MEASURE);
function calculation7!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    @threads for l in 1:size(Qa,2)
        # NOTE: Qa[:,l] etc. allocate a copy of each column, and the in-place plan
        # transforms that copy, which is then discarded rather than written back
        plan*Qa[:,l];
        plan*Qb[:,l];
        plan*Qc[:,l];
        for m in 1:size(Qa,1)
            Qa[m,l] = Qa[m,l] + gOP[m,l]*abs2(In[m,l]);
            Qb[m,l] = Qb[m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
            Qc[m,l] = Qc[m,l] + gOP[m,l]*conj(In[m,l])*In[m,l];
        end
        plan\Qa[:,l];
        plan\Qb[:,l];
        plan\Qc[:,l];
    end
end
With the benchmark:
@btime calculation7!(serialplan, Intensity, Q0, Q1, Q2, gop);
1.504 s (10226925 allocations: 6.84 GiB)
The allocations required for this were huge (each Qa[:,l] slice copies the column, and the transformed copy is never written back into the matrix), and it provided no extra time benefit. A view-based variant that avoids the copies is sketched below.
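For what it's worth, the copies could probably be avoided by applying the serial plan to column views instead of slices. This is only an unbenchmarked sketch, and it assumes FFTW.jl will accept the 1D in-place plan on a contiguous column view:
function calculation7b!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    @threads for l in 1:size(Qa,2)
        # views share memory with the parent matrices, so the in-place transforms
        # land in Qa/Qb/Qc and no per-column copies are allocated
        qa = view(Qa, :, l); qb = view(Qb, :, l); qc = view(Qc, :, l);
        plan*qa; plan*qb; plan*qc;
        for m in 1:size(Qa,1)
            qa[m] = qa[m] + gOP[m,l]*abs2(In[m,l]);
            qb[m] = qb[m] + gOP[m,l]*conj(In[m,l])*In[m,l];
            qc[m] = qc[m] + gOP[m,l]*conj(In[m,l])*In[m,l];
        end
        plan\qa; plan\qb; plan\qc;
    end
end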
CONCLUSION
- There's still more to be learned about the best way to optimize FFTs over many matrices using multithreading.
- There's something I don't understand about the difference between explicitly setting the loop indices and looping over CartesianIndices or eachindex. Additionally, I'm not sure that broadcasting doesn't end up doing the same thing as explicit indexing in for loops; perhaps @stevengj can comment on this (see the broadcast sketch below).
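On the broadcasting question: my understanding is that a fused (dotted) broadcast lowers to a single elementwise loop, so the update in calculation4! could equivalently be written as the sketch below (not benchmarked). The main practical difference is that a plain broadcast runs single-threaded, while the @threads loop splits the elementwise work across threads.
# Equivalent fused-broadcast form of the elementwise update in calculation4!;
# each @. line is one fused loop over all elements, with no temporaries.
function calculation4_bcast!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    plan*Qa; plan*Qb; plan*Qc;
    @. Qa += gOP * abs2(In);
    @. Qb += gOP * conj(In) * In;
    @. Qc += gOP * conj(In) * In;
    plan\Qa; plan\Qb; plan\Qc;
end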