Ok so I tried a couple of things:

- I tried fixing the loop so that there wouldn’t be any broadcasting, using a `CartesianIndices` loop:

```
# Accumulate the gain-weighted |OP|² source term into Qa/Qb/Qc in frequency
# space: forward FFT, pointwise update, inverse FFT.
# NOTE(review): `plan` is assumed to be an in-place plan (plan_fft!) so that
# `plan*Q` / `plan\Q` mutate Q in place — confirm against the caller.
function calculation1!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    # Forward FFT of all three matrices.
    plan*Qa
    plan*Qb
    plan*Qc
    @threads for l in CartesianIndices(Qa)
        m = l[1]  # row index selects the frequency f[m]; the column index is
                  # not needed (the unused `n = l[2]` was dropped)
        for j in eachindex(wj)  # idiomatic replacement for 1:size(wj)[1]
            # The identical gain factor is recomputed three times per (l, j);
            # it is hoisted into a single `gainop` in calculation2! below.
            Qa[l] += 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*abs2(OP[l])
            Qb[l] += 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*conj(OP[l])*OP[l]
            Qc[l] += 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)*conj(OP[l])*OP[l]
        end
    end
    # Inverse FFT back to the original domain.
    plan\Qa
    plan\Qb
    plan\Qc
end
```

Benchmarking gave me:

```
@btime calculation1!(plan_thread, Q0, Q1, Q2, Operator, fr, wj,dwj, gj );
3.122 s (609 allocations: 71.98 KiB)
```

So this was actually worse.

- Next I realized that the frequency operation (`wj`, `f`, `dwj`, and `gj`) is the same for all three matrices and can be computed once for all three rather than for each:

```
# Same update as calculation1!, but the oscillator gain `gainop` is computed
# once per (pixel, oscillator) pair and shared by all three matrices.
# NOTE(review): `plan` is assumed to be an in-place plan (plan_fft!) so that
# `plan*Q` / `plan\Q` mutate Q in place — confirm against the caller.
function calculation2!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    # Forward FFT of all three matrices.
    plan*Qa
    plan*Qb
    plan*Qc
    @threads for l in CartesianIndices(Qa)
        m = l[1]  # row index selects the frequency f[m]; the column index is
                  # not needed (the unused `n = l[2]` was dropped)
        for j in eachindex(wj)  # idiomatic replacement for 1:size(wj)[1]
            gainop = 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)
            Qa[l] += gainop*abs2(OP[l])
            Qb[l] += gainop*conj(OP[l])*OP[l]
            Qc[l] += gainop*conj(OP[l])*OP[l]
        end
    end
    # Inverse FFT back to the original domain.
    plan\Qa
    plan\Qb
    plan\Qc
end
```

With benchmarking of:

```
@btime calculation2!(plan_thread, Q0, Q1, Q2, Operator, fr, wj,dwj, gj );
2.023 s (204 allocations: 19.98 KiB)
```

This definitely improved it and the code looks much cleaner than broadcasting.

- Next, rather than use `CartesianIndices` (which I read somewhere a long while ago is slow?) I used the loop configuration that @stevengj recommended:

```
# Variant of calculation2! with explicit column/row loops instead of
# CartesianIndices; the columns are distributed across threads.
# NOTE(review): `plan` is assumed to be an in-place plan (plan_fft!) so that
# `plan*Q` / `plan\Q` mutate Q in place — confirm against the caller.
function calculation3!(plan, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, OP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    # Forward FFT of all three matrices.
    plan*Qa
    plan*Qb
    plan*Qc
    @threads for l in 1:size(Qa, 2)  # size(Qa, 2), consistent with size(Qa, 1) below
        for i in 1:size(Qa, 1)
            # Hoist everything that does not depend on j out of the inner loop:
            # the frequency terms and the two pixel source factors.
            fi2 = f[i]^2
            imfi = 1.0im*f[i]
            opa = abs2(OP[i, l])
            opb = conj(OP[i, l])*OP[i, l]
            for j in eachindex(wj)
                gainop = 0.5*wj[j]*dwj[j]*gj[j]/(-fi2 - imfi*dwj[j] + wj[j]^2)
                Qa[i, l] += gainop*opa
                Qb[i, l] += gainop*opb
                Qc[i, l] += gainop*opb
            end
        end
    end
    # Inverse FFT back to the original domain.
    plan\Qa
    plan\Qb
    plan\Qc
end
```

With benchmarking:

```
@btime calculation3!(plan_thread, Q0, Q1, Q2, Operator, fr, wj,dwj, gj );
1.995 s (206 allocations: 20.05 KiB)
```

This improved it just a tad. It makes me wonder what the difference is between `CartesianIndices` and **specifying loop parameters explicitly**.

- Next, because the frequency operator (which I call `gOperator`) was constant for all matrices, I realized again that it was a constant throughout all computations. So I made it into a matrix operator.

```
wj = rand(10);
dwj = rand(10);
gj = rand(10);
gop = zeros(ComplexF64, size(Q0));
# Precompute the total gain operator gOP[m, l] for frequency f[m]: the SUM of
# the oscillator gains over j, matching what calculation1!–calculation3!
# accumulate per matrix element.
# BUG FIX: the original assigned `gOP[m,l] = ...` inside the j loop, so only
# the LAST oscillator's contribution survived; the sum over j is required for
# calculation4! to be equivalent to the earlier versions.
function goperator!(gOP::Matrix{ComplexF64}, f::Vector{Float64}, wj::Vector{Float64}, dwj::Vector{Float64}, gj::Vector{Float64})
    @threads for l in 1:size(gOP, 2)
        for m in 1:size(gOP, 1)
            acc = zero(ComplexF64)
            for j in eachindex(wj)
                acc += 0.5*wj[j]*dwj[j]*gj[j]/(-f[m]^2 - 1.0im*f[m]*dwj[j] + wj[j]^2)
            end
            gOP[m, l] = acc
        end
    end
    return gOP
end
```

With this I made a new calculation function:

```
# Apply the precomputed gain operator gOP pointwise to the intensity `In` and
# accumulate into Qa/Qb/Qc in frequency space (forward FFT, update, inverse FFT).
# NOTE(review): `plan` is assumed to be an in-place plan (plan_fft!) so that
# `plan*Q` / `plan\Q` mutate Q in place — confirm against the caller.
function calculation4!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    # Forward FFT of all three matrices.
    plan*Qa
    plan*Qb
    plan*Qc
    # Linear indexing over all elements; gOP and In must share Qa's size.
    @threads for idx in eachindex(Qa)
        g = gOP[idx]
        src = In[idx]
        Qa[idx] += g * abs2(src)
        coupled = g * conj(src) * src  # identical term added to Qb and Qc
        Qb[idx] += coupled
        Qc[idx] += coupled
    end
    # Inverse FFT back to the original domain.
    plan\Qa
    plan\Qb
    plan\Qc
end
```

with benchmark:

```
@btime calculation4!(plan_thread, Intensity, Q0, Q1, Q2, gop);
1.268 s (214 allocations: 19.23 KiB)
```

This made it significantly faster. Although if things were actually changing in the gOperator this might not be possible. But for my problem this works.

**NOTE:** I used the `eachindex()` function.

- Next, rather than using `eachindex()` to loop, I did an explicit for loop:

```
# Like calculation4!, but the three response matrices are stacked along the
# third dimension of Q: Q[:, :, 1] gets the |In|² term, Q[:, :, 2] and
# Q[:, :, 3] the conj(In)*In term. gOP and In are indexed by the first two dims.
# NOTE(review): `plan` is assumed to be an in-place plan over Q — confirm.
function calculation5!(plan, In::Matrix{ComplexF64}, Q::Array{ComplexF64, 3}, gOP::Matrix{ComplexF64})
    # Forward FFT (in place for a plan_fft! plan).
    plan*Q
    @threads for col in 1:size(Q, 2)
        for row in 1:size(Q, 1)
            g = gOP[row, col]
            s = In[row, col]
            Q[row, col, 1] += g * abs2(s)
            cross = g * conj(s) * s  # identical term for layers 2 and 3
            Q[row, col, 2] += cross
            Q[row, col, 3] += cross
        end
    end
    # Inverse FFT back to the original domain.
    plan\Q
end
```

```
@btime calculation5!(plan_thread3D, Intensity,Q3D, gop);
1.147 s (680 allocations: 56.45 KiB)
```

This was again faster than using `CartesianIndices` or `eachindex`.

- Next, I combined my Q matrices into a 3D array of 2D matrices (sz,sz,3).

**NOTE:** For 6 and 7 I had to return to using OpenBLAS because MKL did not want to do non-square FFT

`Q3D = rand(ComplexF64, (sz,sz,3));`

and formed a FFT plan in the 1st dimension:

`plan_thread3D = FFTW.plan_fft!(Q3D,1,flags=FFTW.MEASURE);`

```
# Like calculation5! but with the component index FIRST: Q[1, r, c] gets the
# |In|² term, Q[2, r, c] and Q[3, r, c] the conj(In)*In term.
# NOTE(review): this indexing assumes size(Q) == (3, size(gOP,1), size(gOP,2)).
# Calling it with a (sz, sz, 3) array — as in the first benchmark — only
# touches rows 1:3 and columns 1:3; confirm the layout before reusing.
function calculation6!(plan, In::Matrix{ComplexF64}, Q::Array{ComplexF64, 3}, gOP::Matrix{ComplexF64})
    # Forward FFT (in place for a plan_fft! plan).
    plan*Q
    @threads for c in 1:size(Q, 3)
        for r in 1:size(Q, 2)
            g = gOP[r, c]
            s = In[r, c]
            Q[1, r, c] += g * abs2(s)
            w = g * conj(s) * s  # identical term for components 2 and 3
            Q[2, r, c] += w
            Q[3, r, c] += w
        end
    end
    # Inverse FFT back to the original domain.
    plan\Q
end
```

```
@btime calculation6!(plan_thread3D, Intensity,Q3D, gop);
1.352 s (678 allocations: 56.42 KiB)
```

This was slower than doing individual 1D FFT on each of the three 2D matrices although not by much.

- I thought maybe the slowing on 6 came from the way the matrices were stacked so I flipped the order:

```
Q3Da = rand(ComplexF64, (3,sz,sz));
plan_thread3Da = FFTW.plan_fft!(Q3Da,2,flags=FFTW.MEASURE);
```

and using `calculation6!`

```
@btime calculation6!(plan_thread3Da, Intensity,Q3Da, gop);
1.238 s (678 allocations: 56.42 KiB)
```

- Last I did a serial FFT plan to see if I could incorporate it into the multithreaded operations:

```
seralarray = rand(ComplexF64, sz);
serailplan = FFTW.plan_fft!(seralarray,flags=FFTW.MEASURE);
```

```
# Per-column version: each thread forward-transforms its own columns with a
# serial 1-D plan, applies the gain update, and inverse-transforms.
# BUG FIX: the original wrote `plan*Qa[:,l]` — slicing ALLOCATES A COPY, so the
# in-place plan transformed the copy and the result was discarded; the FFT
# never touched Qa/Qb/Qc (and those copies were the ~6.84 GiB of allocations).
# Views make the plan transform the actual columns in place.
# NOTE(review): an in-place plan made with MEASURE may reject misaligned
# views — create it with flags = FFTW.MEASURE | FFTW.UNALIGNED if that occurs.
function calculation7!(plan, In::Matrix{ComplexF64}, Qa::Matrix{ComplexF64}, Qb::Matrix{ComplexF64}, Qc::Matrix{ComplexF64}, gOP::Matrix{ComplexF64})
    @threads for l in 1:size(Qa, 2)
        # Forward 1-D FFT of each column, in place via views (no copy).
        plan * @view(Qa[:, l])
        plan * @view(Qb[:, l])
        plan * @view(Qc[:, l])
        for m in 1:size(Qa, 1)
            g = gOP[m, l]
            s = In[m, l]
            Qa[m, l] += g * abs2(s)
            Qb[m, l] += g * conj(s) * s
            Qc[m, l] += g * conj(s) * s
        end
        # Inverse 1-D FFT of each column, in place via views.
        plan \ @view(Qa[:, l])
        plan \ @view(Qb[:, l])
        plan \ @view(Qc[:, l])
    end
end
```

With the benchmark:

```
@btime calculation7!(serailplan, Intensity, Q0, Q1, Q2, gop);
1.504 s (10226925 allocations: 6.84 GiB)
```

The allocations necessary for this was huge and did not provide any extra time benefit.

**CONCLUSION**

- There’s still more to be learned about best way to optimize FFT over many matrices using multithreading.
- There’s something I don’t understand about looping when explicitly setting the loop indices versus using `CartesianIndices` and `eachindex`. Additionally, I’m not sure broadcasting doesn’t do the same thing as explicitly indexing in for loops? Perhaps @stevengj can comment on this

**NOTE:** For 6 and 7 I had to return to using OpenBLAS because MKL did not want to do non-square FFT