Once more, with feeling (and threads/SIMD/cache locality):
using LoopVectorization
function f4(M, G, J, H, A, B, ϕ)
# set up thread-local storage
 = factorize(A)
Âs = [deepcopy(Â) for i = 1:Threads.nthreads()]
tmp = [similar(ϕ, G-1) for i = 1:Threads.nthreads()]
# reinterpret arrays to get around LV.jl's limitations for complex numbers
Bf = reinterpret(reshape, Float64, B)
ϕf = reinterpret(reshape, Float64, ϕ)
tmpf = reinterpret.(reshape, Float64, tmp)
@inbounds for mm = 1:M
m_idx = M + 2 - mm
Threads.@threads for hh = 1:H+1
tmpl = tmp[Threads.threadid()]
tmpfl = tmpf[Threads.threadid()]
Âl = Âs[Threads.threadid()]
h_idx = (hh - 1)*jmax
for jj = 1:2*J + 1
@avx for gg = 1:G-1
tmpfl[1, gg] = ϕf[1, jj, gg+1, hh, m_idx] +
Bf[1, jj, gg, hh, m_idx]
tmpfl[2, gg] = ϕf[2, jj, gg+1, hh, m_idx] +
Bf[2, jj, gg, hh, m_idx]
end
ldiv!(Âl, tmpl)
@avx for gg = 1:G-1
ϕf[1, jj, gg + 1, hh, m_idx] = tmpfl[1, gg]
ϕf[2, jj, gg + 1, hh, m_idx] = tmpfl[2, gg]
end
end
end
end
end
julia> @btime f4($M, $G, $J, $H, $A, $B, $ϕ)
335.336 ms (8911 allocations: 915.33 KiB)