Performance of a key step in Strang Splitting method

Once more, with feeling (and threads/SIMD/cache locality):

using LoopVectorization

function f4(M, G, J, H, A, B, ϕ)
    # set up thread-local storage
    Â = factorize(A)
    Âs = [deepcopy(Â) for i = 1:Threads.nthreads()]
    tmp = [similar(ϕ, G-1) for i = 1:Threads.nthreads()]
    # reinterpret arrays to get around LV.jl's limitations for complex numbers
    Bf = reinterpret(reshape, Float64, B)
    ϕf = reinterpret(reshape, Float64, ϕ)
    tmpf = reinterpret.(reshape, Float64, tmp)

    @inbounds for mm = 1:M
        m_idx = M + 2 - mm
        Threads.@threads for hh = 1:H+1
            tmpl = tmp[Threads.threadid()]
            tmpfl = tmpf[Threads.threadid()]
            Âl = Âs[Threads.threadid()]
            h_idx = (hh - 1)*jmax
            for jj = 1:2*J + 1
                @avx for gg = 1:G-1
                    tmpfl[1, gg] = ϕf[1, jj, gg+1, hh, m_idx] +
                                   Bf[1, jj, gg,   hh, m_idx]
                    tmpfl[2, gg] = ϕf[2, jj, gg+1, hh, m_idx] +
                                   Bf[2, jj, gg,   hh, m_idx]
                end
                ldiv!(Âl, tmpl)
                @avx for gg = 1:G-1
                    ϕf[1, jj, gg + 1, hh, m_idx] = tmpfl[1, gg]
                    ϕf[2, jj, gg + 1, hh, m_idx] = tmpfl[2, gg]
                end
            end
        end
    end
end
julia> @btime f4($M, $G, $J, $H, $A, $B, $ϕ)
  335.336 ms (8911 allocations: 915.33 KiB)
6 Likes