I just decided to migrate from Python+Fortran to Julia as Julia was faster in my test

These two versions seem to be slightly faster (yet that may be noise):

function eval_exp_tweaked_4_serial(N) # ... and range collected
    a = collect(range(0, stop=2*pi, length=N))
    A = Matrix{ComplexF64}(undef, N, N)
    for (j,aj) in pairs(a)
        for i in eachindex(a)
            @inbounds A[i,j] = cis(100*(a[i]^2 + aj^2))
        end
    end
    return A
end

function eval_exp_tweaked_4_parallel(N) # ... and range collected
    a = collect(range(0, stop=2*pi, length=N))
    A = Matrix{ComplexF64}(undef, N, N)
    Threads.@threads for j in eachindex(a)
        for i in eachindex(a)
            @inbounds A[i,j] = cis(100*(a[i]^2 + a[j]^2))
        end
    end
    return A
end

Result:

tweaked3:  15.497 ms (3 allocations: 15.27 MiB)
tweaked3_p:  5.139 ms (24 allocations: 15.27 MiB)
tweaked4:  14.773 ms (3 allocations: 15.27 MiB)
tweaked4_p:  4.971 ms (24 allocations: 15.27 MiB)

But I wanted to show the iterations using pairs and eachindex, which are nice because they do not depend on the array index style and also because you can guarantee inbounds with that without having to use the flag. pairs probably gives some speedup depending on how indexes and values are used repeatedly in the loop.

Code
using Test, BenchmarkTools

function eval_exp_tweaked_3_parallel(N) # ... and range collected
    a = collect(range(0, stop=2*pi, length=N))
    A = Matrix{ComplexF64}(undef, N, N)
    @inbounds Threads.@threads for j in 1:N
        for i in 1:N
            A[i,j] = cis(100*(a[i]^2 + a[j]^2))
        end
    end
    return A
end

function eval_exp_tweaked_3_serial(N) # ... and range collected
    a = collect(range(0, stop=2*pi, length=N))
    A = Matrix{ComplexF64}(undef, N, N)
    @inbounds for j in 1:N
        for i in 1:N
            A[i,j] = cis(100*(a[i]^2 + a[j]^2))
        end
    end
    return A
end

function eval_exp_tweaked_4_serial(N) # ... and range collected
    a = collect(range(0, stop=2*pi, length=N))
    A = Matrix{ComplexF64}(undef, N, N)
    for (j,aj) in pairs(a)
        for i in eachindex(a)
            @inbounds A[i,j] = cis(100*(a[i]^2 + aj^2))
        end
    end
    return A
end

function eval_exp_tweaked_4_parallel(N) # ... and range collected
    a = collect(range(0, stop=2*pi, length=N))
    A = Matrix{ComplexF64}(undef, N, N)
    Threads.@threads for j in eachindex(a)
        for i in eachindex(a)
            @inbounds A[i,j] = cis(100*(a[i]^2 + a[j]^2))
        end
    end
    return A
end

N = 10000
@test eval_exp_tweaked_3_serial(N) ≈ eval_exp_tweaked_4_serial(N) ≈
      eval_exp_tweaked_3_parallel(N) ≈ eval_exp_tweaked_4_parallel(N)

print("tweaked3:"); @btime eval_exp_tweaked_3_serial($N) 

print("tweaked3_p:"); @btime eval_exp_tweaked_3_parallel($N) 

print("tweaked4:"); @btime eval_exp_tweaked_4_serial($N)

print("tweaked4_p:"); @btime eval_exp_tweaked_4_parallel($N)

nothing

2 Likes