I just decided to migrate from Python+Fortran to Julia as Julia was faster in my test

To get even further into the performance rabbit hole, it’s always fun to bring out LoopVectorization.jl. It doesn’t understand how to handle complex numbers yet, but fortunately we can just reinterpret a matrix of complex floats as a 3-D array of real floats and then operate on that. This example requires a new feature in VectorizationBase.jl version 0.20.4 (corrected from 0.12.20), so make sure you use that version or newer if you try this.

using LoopVectorization: @tturbo

"""
    eval_exp_tturbo(N)

Return an `N`×`N` complex matrix whose `(i, j)` entry has real part `cos(θ)` and
imaginary part `sin(θ)`, where `θ = 100 * (a[i]^2 + a[j]^2)` and `a` is a range of
`N` points from `0` to `2π`.

The matrix is filled through a `Float64` reinterpretation of its storage so that
the `@tturbo` loop only ever touches real numbers.
"""
function eval_exp_tturbo(N)
    result = Matrix{ComplexF64}(undef, N, N)
    grid = range(0, stop=2*pi, length=N)
    # Real-valued alias of `result` with an extra leading dimension:
    # index 1 receives the real part, index 2 the imaginary part.
    parts = reinterpret(reshape, Float64, result)
    @tturbo for i in 1:N, j in 1:N
        # `sincos` returns (sin, cos) of the same argument in one call.
        sinval, cosval = sincos(100*(grid[i]^2 + grid[j]^2))
        parts[1,i,j] = cosval
        parts[2,i,j] = sinval
    end
    return result
end
# Report the thread count, then benchmark the three implementations on growing
# problem sizes and check that they produce matching results.
print("running loop on ", Threads.nthreads(), " threads \n")
for N in range(1_000, stop=10_000, step=1_000)
    @show N
    # Each `@btime` result is kept so the outputs can be compared below.
    A = @btime eval_exp($N)
    B = @btime eval_exp_tweaked_4($N)
    C = @btime eval_exp_tturbo($N)
    @assert A ≈ B ≈ C
    # Blank line between the reports for consecutive sizes.
    println()
end
running loop on 6 threads 
N = 1000
  7.497 ms (33 allocations: 15.26 MiB)
  2.720 ms (34 allocations: 15.27 MiB)
  999.171 μs (2 allocations: 15.26 MiB)

N = 2000
  36.936 ms (33 allocations: 61.04 MiB)
  13.246 ms (34 allocations: 61.05 MiB)
  6.112 ms (2 allocations: 61.04 MiB)

N = 3000
  74.096 ms (33 allocations: 137.33 MiB)
  29.550 ms (35 allocations: 137.35 MiB)
  13.667 ms (2 allocations: 137.33 MiB)

N = 4000
  131.016 ms (35 allocations: 244.14 MiB)
  52.159 ms (36 allocations: 244.17 MiB)
  24.066 ms (2 allocations: 244.14 MiB)

N = 5000
  215.135 ms (35 allocations: 381.47 MiB)
  80.854 ms (37 allocations: 381.51 MiB)
  37.622 ms (2 allocations: 381.47 MiB)

N = 6000
  319.400 ms (35 allocations: 549.32 MiB)
  122.928 ms (37 allocations: 549.37 MiB)
  62.631 ms (3 allocations: 549.32 MiB)

N = 7000
  491.836 ms (35 allocations: 747.68 MiB)
  164.792 ms (38 allocations: 747.74 MiB)
  79.859 ms (5 allocations: 747.68 MiB)

N = 8000
  883.225 ms (37 allocations: 976.57 MiB)
  227.527 ms (37 allocations: 976.63 MiB)
  114.281 ms (6 allocations: 976.56 MiB)

N = 9000
  827.222 ms (35 allocations: 1.21 GiB)
  283.081 ms (37 allocations: 1.21 GiB)
  162.987 ms (4 allocations: 1.21 GiB)

N = 10000
  1.095 s (37 allocations: 1.49 GiB)
  329.651 ms (38 allocations: 1.49 GiB)
  156.945 ms (4 allocations: 1.49 GiB)
6 Likes