These two versions seem to be slightly faster (yet that may be noise):
function eval_exp_tweaked_4_serial(N) # ... and range collected
a = collect(range(0, stop=2*pi, length=N))
A = Matrix{ComplexF64}(undef, N, N)
for (j,aj) in pairs(a)
for i in eachindex(a)
@inbounds A[i,j] = cis(100*(a[i]^2 + aj^2))
end
end
return A
end
function eval_exp_tweaked_4_parallel(N) # ... and range collected
a = collect(range(0, stop=2*pi, length=N))
A = Matrix{ComplexF64}(undef, N, N)
Threads.@threads for j in eachindex(a)
for i in eachindex(a)
@inbounds A[i,j] = cis(100*(a[i]^2 + a[j]^2))
end
end
return A
end
Result:
tweaked3: 15.497 ms (3 allocations: 15.27 MiB)
tweaked3_p: 5.139 ms (24 allocations: 15.27 MiB)
tweaked4: 14.773 ms (3 allocations: 15.27 MiB)
tweaked4_p: 4.971 ms (24 allocations: 15.27 MiB)
But I wanted to show the iterations using pairs and eachindex, which are nice because they do not depend on the array index style and also because you can guarantee inbounds with that without having to use the flag. pairs probably gives some speedup depending on how indexes and values are used repeatedly in the loop.
Code
using Test, BenchmarkTools
function eval_exp_tweaked_3_parallel(N) # ... and range collected
a = collect(range(0, stop=2*pi, length=N))
A = Matrix{ComplexF64}(undef, N, N)
@inbounds Threads.@threads for j in 1:N
for i in 1:N
A[i,j] = cis(100*(a[i]^2 + a[j]^2))
end
end
return A
end
function eval_exp_tweaked_3_serial(N) # ... and range collected
a = collect(range(0, stop=2*pi, length=N))
A = Matrix{ComplexF64}(undef, N, N)
@inbounds for j in 1:N
for i in 1:N
A[i,j] = cis(100*(a[i]^2 + a[j]^2))
end
end
return A
end
function eval_exp_tweaked_4_serial(N) # ... and range collected
a = collect(range(0, stop=2*pi, length=N))
A = Matrix{ComplexF64}(undef, N, N)
for (j,aj) in pairs(a)
for i in eachindex(a)
@inbounds A[i,j] = cis(100*(a[i]^2 + aj^2))
end
end
return A
end
function eval_exp_tweaked_4_parallel(N) # ... and range collected
a = collect(range(0, stop=2*pi, length=N))
A = Matrix{ComplexF64}(undef, N, N)
Threads.@threads for j in eachindex(a)
for i in eachindex(a)
@inbounds A[i,j] = cis(100*(a[i]^2 + a[j]^2))
end
end
return A
end
N = 10000
@test eval_exp_tweaked_3_serial(N) ≈ eval_exp_tweaked_4_serial(N) ≈
eval_exp_tweaked_3_parallel(N) ≈ eval_exp_tweaked_4_parallel(N)
print("tweaked3:"); @btime eval_exp_tweaked_3_serial($N)
print("tweaked3_p:"); @btime eval_exp_tweaked_3_parallel($N)
print("tweaked4:"); @btime eval_exp_tweaked_4_serial($N)
print("tweaked4_p:"); @btime eval_exp_tweaked_4_parallel($N)
nothing