To get even further into the performance rabbit hole, it’s always fun to bring out LoopVectorization.jl. It doesn’t understand how to handle complex numbers yet, but fortunately we can just reinterpret a matrix of complex floats as a 3-D array of real floats and then operate on that. This example requires a new feature in VectorizationBase.jl, available as of version 0.20.4 (corrected from the previously stated 0.12.20), so make sure you use that version or newer if you try this.
using LoopVectorization: @tturbo
"""
    eval_exp_tturbo(N)

Build an `N`×`N` `Matrix{Complex{Float64}}` whose `(i, j)` entry is
`cos(θ) + im*sin(θ)` with `θ = 100*(a[i]^2 + a[j]^2)`, where `a` is `N` evenly
spaced points from `0` to `2π`.

The loop is run under LoopVectorization's `@tturbo`. Because the loop body works
on real numbers only, the complex output is written through a real-valued
reinterpretation of the same memory.
"""
function eval_exp_tturbo(N)
    grid = range(0, stop=2*pi, length=N)
    result = Matrix{Complex{Float64}}(undef, N, N)
    # Real view of `result` with a new leading dimension:
    # index 1 is the real part, index 2 the imaginary part.
    parts = reinterpret(reshape, Float64, result)
    @tturbo for i in 1:N, j in 1:N
        # `sincos` returns (sin, cos), i.e. (imaginary, real).
        sin_ij, cos_ij = sincos(100*(grid[i]^2 + grid[j]^2))
        parts[1,i,j] = cos_ij
        parts[2,i,j] = sin_ij
    end
    return result
end
# Report how many threads this Julia session has before timing anything.
print("running loop on ", Threads.nthreads(), " threads \n")

# Time the three implementations for N = 1000, 2000, ..., 10000 and check
# that they produce (approximately) the same matrix.
for N in 1_000:1_000:10_000
    @show N
    A = @btime eval_exp($N)
    B = @btime eval_exp_tweaked_4($N)
    C = @btime eval_exp_tturbo($N)
    # Chained comparison: A ≈ B and B ≈ C.
    @assert A ≈ B ≈ C
    println()
end
running loop on 6 threads
N = 1000
7.497 ms (33 allocations: 15.26 MiB)
2.720 ms (34 allocations: 15.27 MiB)
999.171 μs (2 allocations: 15.26 MiB)
N = 2000
36.936 ms (33 allocations: 61.04 MiB)
13.246 ms (34 allocations: 61.05 MiB)
6.112 ms (2 allocations: 61.04 MiB)
N = 3000
74.096 ms (33 allocations: 137.33 MiB)
29.550 ms (35 allocations: 137.35 MiB)
13.667 ms (2 allocations: 137.33 MiB)
N = 4000
131.016 ms (35 allocations: 244.14 MiB)
52.159 ms (36 allocations: 244.17 MiB)
24.066 ms (2 allocations: 244.14 MiB)
N = 5000
215.135 ms (35 allocations: 381.47 MiB)
80.854 ms (37 allocations: 381.51 MiB)
37.622 ms (2 allocations: 381.47 MiB)
N = 6000
319.400 ms (35 allocations: 549.32 MiB)
122.928 ms (37 allocations: 549.37 MiB)
62.631 ms (3 allocations: 549.32 MiB)
N = 7000
491.836 ms (35 allocations: 747.68 MiB)
164.792 ms (38 allocations: 747.74 MiB)
79.859 ms (5 allocations: 747.68 MiB)
N = 8000
883.225 ms (37 allocations: 976.57 MiB)
227.527 ms (37 allocations: 976.63 MiB)
114.281 ms (6 allocations: 976.56 MiB)
N = 9000
827.222 ms (35 allocations: 1.21 GiB)
283.081 ms (37 allocations: 1.21 GiB)
162.987 ms (4 allocations: 1.21 GiB)
N = 10000
1.095 s (37 allocations: 1.49 GiB)
329.651 ms (38 allocations: 1.49 GiB)
156.945 ms (4 allocations: 1.49 GiB)