I have a code that is embarrassingly parallel, but I cannot get full its speed up.
The dummy code is here:
function points_on_sphere_surface(num_pts=100, R=2)
indices = 1:num_pts .+ 0.5
ϕ = acos.(1 .- 2*indices/num_pts)
θ = π*(1 + sqrt(5)).*indices
x, y, z = R.*cos.(θ).*sin.(ϕ), R.*sin.(θ).*sin.(ϕ), R.*cos.(ϕ)
return [x y z]
end
function original_foo(sphere, r, β)
N = length(β)
nx = sphere[1]
ny = sphere[2]
nz = sphere[3]
intensity = zeros(N, N)
for n=1:N
for m=1:N
rx = r[1,n] - r[1, m]
ry = r[2,n] - r[2, m]
rz = r[3,n] - r[3, m]
dot_n_r = nx*rx + ny*ry + nz*rz
intensity[n,m] = real(conj(β[n])*β[m]*exp(im*dot_n_r)) # I need to define a matrix
end
end
return sum(intensity)
end
N = 500
r = rand(3, N)
β = rand(ComplexF64, N)
sphere = points_on_sphere_surface(200, 10)
time_sequencial = @elapsed begin
x = zeros(200)
for i = 1:200
point_in_sphere = sphere[i,:]
x[i] = original_foo(point_in_sphere, r, β)
end
end
time_parallel = @elapsed begin
x = []
for i = 1:200
point_in_sphere = sphere[i,:]
push!(x, Threads.@spawn original_foo(point_in_sphere, r, β))
end
y = fetch.(x);
end
speedup = time_sequencial/time_parallel
My notebook has 4cores/8threads. and I initiate Julia with 8 threads.
However, my best case scenario I have around 4 times speed up.
Is there any limitation in my code that sets this upper boundary ?