Nbabel nbody integrator speed up

In the Fortran and Python implementations the loop to compute accelerations is done properly (N(N-1)/2) cycles:

     do i = 1, P%nstars - 1
         do j = i + 1, P%nstars
   for index_p0 in range(nb_particules - 1):
        for index_p1 in range(index_p0 + 1, nb_particules):

while in the Julia implementation it is not (N^2 cycles):

    @inbounds @Threads.threads for i = 1:N
        @fastmath @inbounds @simd for j = 1:N
            if i != j

Concerning the code:

My “base time is” 661 ms (8.41 MB allocations), for the code provided.

Removing all the decorations (simd, inbounds, threads, etc), actually made the code faster (586 ms).

Fixing the loop above and using static arrays took the time to 581 ms and allocations went to 174 kb. No allocations occur anymore in the main routines.

Now, adding back the @inbounds flag to the acceleration and energy computation loops, time goes to 500 ms, 171.78 KiB.

If one considers this speedup reliable (~30%), that would put the Julia benchmark there in pair with the python and rust codes.

I am running with this data, with

using BenchmarkTools
include("./nbabel.jl")
using .NB
@btime NBabel("./input128.dat")
Code


module NB

"""
This is an implementation of the NBabel N-body problem.
See nbabel.org for more information.
This is 'naive & native' Julia with type and loop annotations for
fastmath and vectorization, because this is really is no effort.
Without annotations the code will be as slow as naive Python/IDL.
    include("nbabel.jl")
    NBabel("IC/input128", show=true)
Julius Donnert INAF-IRA 2017
"""

using Printf
using DelimitedFiles
using StaticArrays

tbeg = 0.0
tend = 10.0
dt = 0.001

function NBabel(fname::String; tend = tend, dt = dt, show=false)

    if show
	    println("Reading file : $fname")
    end

    ID, mass, pos, vel = read_ICs(fname)

    return NBabelCalcs(ID, mass, pos, vel, tend, dt, show)
end

function NBabelCalcs(ID, mass, pos, vel, tend = tend, dt = dt, show=false)
    acc = similar(vel)
    acc = compute_acceleration(pos, mass, acc)
    last_acc = copy(acc)

    Ekin, Epot = compute_energy(pos, vel, mass)
    Etot_ICs = Ekin + Epot

    t = 0.0
    nstep = 0

    while t < tend

        pos = update_positions(pos, vel, acc, dt)

        acc, last_acc = last_acc, acc

        acc = compute_acceleration(pos, mass, acc)

        vel = update_velocities(vel, acc, last_acc, dt)
        
        t += dt
        nstep += 1

        if show && nstep%100 == 0

            Ekin, Epot = compute_energy(pos, vel, mass)
            Etot = Ekin + Epot
            dE = (Etot - Etot_ICs)/Etot_ICs

            @printf "t = %g, Etot=%g, Ekin=%g, Epot=%g, dE=%g \n" t Etot Ekin Epot dE
        end
    end

    Ekin, Epot = compute_energy(pos, vel, mass)
    Etot = Ekin + Epot
    return (; Ekin, Epot, Etot)
    # return size(pos,1)
end

function update_positions(pos, vel, acc, dt)
    N = length(pos)
    for i in 1:N
        pos[i] = (0.5 * acc[i] * dt + vel[i])*dt + pos[i]
    end
    return pos
end

function update_velocities(vel, acc, last_acc, dt)
    N = length(vel)
    for i in 1:N
        vel[i] = vel[i] + 0.5 * dt * (acc[i] + last_acc[i])
    end
    return vel
end

"""
Force calculation.
"""
function compute_acceleration(pos, mass, acc)
    N = length(pos)
    for i in 1:N
      acc[i] = zeros(eltype(acc)) 
    end
    @inbounds for i = 1:N-1
        for j = i+1:N
            dr = pos[i] - pos[j]
            rinv3 = 1/sqrt(sum(dr .^ 2)) ^ 3
            acc[i] = acc[i] - mass[i] * rinv3 * dr
            acc[j] = acc[j] + mass[j] * rinv3 * dr
        end
    end
    return acc
end


"""
Kinetic and potential energy.
"""
function compute_energy(pos, vel, mass)
    N = length(vel)

    Ekin = 0.0

    for i = 1:N
        Ekin += 0.5 * mass[i] * sum(vel[i] .^ 2)
    end

    Epot = 0.0

    @inbounds for i = 1:N-1
        for j = i+1:N
            dr = pos[i] - pos[j]
            rinv = 1/sqrt(sum(dr .^ 2))
            Epot -= mass[i] * mass[j] * rinv
        end
    end

    return Ekin, Epot
end

function read_ICs(fname::String)

    ICs = readdlm(fname)

    N = size(ICs,1)

    pos = Array{SVector{3,Float64}}(undef, N)
    vel = Array{SVector{3,Float64}}(undef, N)

    id = @view ICs[:, 1]
    mass = @view ICs[:, 2]

    for i in axes(ICs, 1)
        pos[i] = SVector{3,Float64}(ICs[i, 3], ICs[i, 4], ICs[i, 5])
    end

    for i in axes(ICs, 1)
        vel[i] = SVector{3,Float64}(ICs[i, 6], ICs[i, 7], ICs[i, 8])
    end

    return id, mass, pos, vel
end

export NBabel

end


#function main(args)
#    NBabel(args[1], show=true)
#end
#main(ARGS)
6 Likes