Greetings,
I am translating a C++ particle simulation code into Julia. In each iteration, the algorithm computes forces that are given by pairwise interactions, which leads to an O(N^2) operation, with N the number of particles. I want to use multithreading to (roughly) evenly distribute the work across a few threads. With C++, for 1000 particles, I would be getting close-to-linear speedups up to 6 or 7 threads.
In Julia, however, using two threads is twice as slow as using one thread.
Am I doing something fundamentally wrong here?
function compute_forces!(system::ParticleSystem)
N = system.N_particles
box = system.boxlength
# Zero out forces, energy, laplacian
fill!(system.forces, 0.0)
forces_per_thread = system.forces_per_thread
n_threads = system.n_threads
fill!(forces_per_thread, 0.0)
# Parallelize over threads, distribute i indices cyclically for better load balance.
Threads.@threads for tid in 1:n_threads
r_ij = MVector{2, Float64}(0.0, 0.0)
@inbounds for i in tid:n_threads:N-1
@inbounds for j in (i+1):N
# Minimum image convention for 2D box
r_ij[1] = mod(system.positions[1,i] - system.positions[1,j] + box/2, box) - box/2
r_ij[2] = mod(system.positions[2,i] - system.positions[2,j] + box/2, box) - box/2
# Compute force for this pair
force = system.force_model(r_ij)
# Accumulate forces using Newton's 3rd law into thread-local buffer
@inbounds for k in 1:2
forces_per_thread[k, i, tid] += force[k]
forces_per_thread[k, j, tid] -= force[k]
end
# Accumulate pairwise energy and laplacian
energies_per_thread[tid] += energy
laplaceU_per_thread[tid] += 2 * laplace_U # Factor 2 for i and j.
end
end
end
# Reduction over threads: forces
@inbounds for i in 1:N
@inbounds for k in 1:2
acc = 0.0
@inbounds for t in 1:n_threads
acc += forces_per_thread[k, i, t]
end
system.forces[k, i] = acc
end
end
end
And the particle system struct:
mutable struct ParticleSystem{F <: AbstractForce}
# HYPERPARAMETERS.
boxlength :: Float64 # Length of the (cubic) simulation box [-L/2, L/2]^2.
N_particles :: Int64 # Number of particles.
# CONFIGURATION.
positions ::Matrix{Float64} # All matrices are of shape (2, N).
momenta ::Matrix{Float64}
forces ::Matrix{Float64}
energy ::Float64 # Potential energy in the system.
laplace_U ::Float64
n_threads ::Int
forces_per_thread ::Array{Float64,3} # Buffers for multithreading.
force_model ::F # Specify pairwise interactions.
end
function ParticleSystem(boxlength::Real, N_particles::Int, force_model::F) where {F<:AbstractForce}
positions = Matrix{Float64}(undef, 2, N_particles)
momenta = Matrix{Float64}(undef, 2, N_particles)
forces = Matrix{Float64}(undef, 2, N_particles)
energy = 0.0
laplaceU = 0.0
# Thread-local buffers: lay out as (2, N, n_threads) so each thread
# writes to its own contiguous slice, avoiding false sharing.
n_threads = Threads.nthreads()
forces_per_thread = zeros(Float64, 2, N_particles, n_threads)
return ParticleSystem{F}(boxlength, N_particles, positions, momenta,
forces, energy, laplaceU,
n_threads, forces_per_thread,
force_model)
end