Building on the result of @PeterSimon, I found another big improvement: note that your matrices `m1`, `m1t`, `m2`, and `m2t` are all very sparse, so you should use SparseArrays.jl for them. Doing that, I got an improvement:

```
old: 3.356652 seconds (6.60 k allocations: 114.960 MiB)
new: 2.286051 seconds (805 allocations: 83.535 MiB)
```

I then noticed that the last operation in the loop is missing a `.` in front of the `-`, so it allocates a new array and then just copies it into `j`. I also made use of the 5-arg version of `mul!`:

```
mul!(A,B,C,x,y) -> A = x*B*C + y*A
```

to get rid of half the temporary arrays for a ~5-10% speedup.

My final time is

```
2.067672 seconds (113 allocations: 32.046 MiB)
```

with code:

```
using LinearAlgebra, MKL, SparseArrays # MKL swaps in Intel BLAS; SparseArrays for the sparse selector matrices below
BLAS.set_num_threads(1) # Avoid contention with threaded loop
using OhMyThreads: TaskLocalValue # per-task scratch buffers (safe under task migration)
# Benchmark kernel: for each slice ij, compute
#   j0 = -( Mi*Gl + Vvi*(Gr*M*Gl + Gl*M*Ga) )
# then project j0 down with the sparse selectors m1/m2 and take the
# difference of the two diagonal sub-blocks into j.  All products are done
# in place with 5-arg mul! into task-local scratch buffers, so the hot loop
# is (nearly) allocation-free.  Timing is printed by @time.
@views function tryit()
# presumably the number of frequency modes — TODO confirm with the OP's problem
Nf = 40
##---Initialisations---
# Dense random test data, all square of size 4*(2*Nf+1)
Vvi = rand(ComplexF64, 4*(2*Nf+1),4*(2*Nf+1))
Gr = rand(ComplexF64, 4*(2*Nf+1),4*(2*Nf+1))
Gl = rand(ComplexF64, 4*(2*Nf+1),4*(2*Nf+1))
Ga = rand(ComplexF64, 4*(2*Nf+1),4*(2*Nf+1))
# Stacks of per-slice matrices; third axis has 8*Nf+2 == 2*(4*Nf+1) slices
Mi = rand(ComplexF64, 2*2*(2*Nf+1),2*2*(2*Nf+1),8*Nf+2)
M = rand(ComplexF64, 2*2*(2*Nf+1),2*2*(2*Nf+1),8*Nf+2)
# is there a reason for this loop?
# for ij = 1:2*(4*Nf+1)
# Mi[:,:,ij] = rand(ComplexF64, 2*2*(2*Nf+1),2*2*(2*Nf+1))
# M[:,:,ij] = rand(ComplexF64, 2*2*(2*Nf+1),2*2*(2*Nf+1))
# end
# Sparse selectors: kron(I, [1 0]) picks the first entry of each 2-block
# (rows when applied on the left, columns via the transpose on the right);
# kron(I, [0 1]) picks the second.  Sizes: m1t, m2t are 2*(2*Nf+1) x 4*(2*Nf+1).
m1t = kron(I(2*(2*Nf+1)), sparse([1 0])); m1 = transpose(m1t); m2t = kron(I(2*(2*Nf+1)), sparse([0 1])); m2 = transpose(m2t)
# Preallocated outputs, one slice per loop iteration
j = zeros(ComplexF64, 2*Nf+1,2*Nf+1,2*(4*Nf+1))
j0 = zeros(ComplexF64, 4*(2*Nf+1),4*(2*Nf+1), 8*Nf+2)
j1 = zeros(ComplexF64, 2*(2*Nf+1),2*(2*Nf+1), 8*Nf+2)
# Task-local scratch buffers: each task gets its own [t1, t2, bs1], which is
# safe even if tasks migrate between threads (unlike buffers[threadid()]).
tls = TaskLocalValue{Vector{Matrix{ComplexF64}}}( () ->
begin
t1, t2 = (similar(Gr) for _ in 1:2)
# bs1 holds the tall-skinny intermediate j0*m1 (resp. j0*m2)
bs1 = zeros(ComplexF64, size(m2))
#ss1, ss2 = (zeros(ComplexF64, size(m1t,1), size(m1t,1)) for _ in 1:2)
[t1, t2, bs1]#, ss1, ss2]
end)
##---Calculation--
@time Threads.@threads for ij = 1:2*(4*Nf+1)
t1, t2, bs1 = tls[]
#j0[:,:,ij] .= -( (Mi[:,:,ij]*Gl) + (Vvi*( Gr*M[:,:,ij]*Gl + Gl*M[:,:,ij]*Ga )) )
# Build the inner sum in t2 using 5-arg mul! (C = A*B*α + C*β):
mul!(t1, Gl, M[:,:,ij]); mul!(t2, t1, Ga) # t2 contains Gl * M[:,:,ij] * Ga
mul!(t1, Gr, M[:,:,ij]); mul!(t2, t1, Gl, true, true) # t2 contains Gr * M[:,:,ij] * Gl + Gl * M[:,:,ij] * Ga
mul!(j0[:,:,ij], Vvi, t2, -1, false) # multiply into j0[:,:,ij]; β=false overwrites the slice
mul!(j0[:,:,ij], Mi[:,:,ij], Gl, -1, true) # and subtract other part directly (β=true accumulates)
#j1[:,:,ij] .= m1t*j0[:,:,ij]*m1 + m2t*j0[:,:,ij]*m2
# this uses sparse matrices now!
mul!(bs1, j0[:,:,ij], m1); mul!(j1[:,:,ij], m1t, bs1)
mul!(bs1, j0[:,:,ij], m2); mul!(j1[:,:,ij], m2t, bs1, true, true)
# note the additional . — the .- fuses the subtraction into the .= copy,
# so no temporary array is allocated for the difference
j[:,:,ij] .= j1[1:(2*Nf+1),1:(2*Nf+1),ij] .- j1[(2*Nf+1)+1:2*(2*Nf+1),(2*Nf+1)+1:2*(2*Nf+1),ij] #much faster than the prev. two lines. Not worth optimising
end
end
```