Nbabel nbody integrator speed up

lmiq · December 12, 2020, 5:52pm

In the Fortran and Python implementations the loop to compute accelerations is done properly (N(N-1)/2) cycles:

     do i = 1, P%nstars - 1
         do j = i + 1, P%nstars

   for index_p0 in range(nb_particules - 1):
        for index_p1 in range(index_p0 + 1, nb_particules):

while in the Julia implementation it is not (N^2 cycles):

    @inbounds @Threads.threads for i = 1:N
        @fastmath @inbounds @simd for j = 1:N
            if i != j

Concerning the code:

My “base time is” 661 ms (8.41 MB allocations), for the code provided.

Removing all the decorations (simd, inbounds, threads, etc), actually made the code faster (586 ms).

Fixing the loop above and using static arrays took the time to 581 ms and allocations went to 174 kb. No allocations occur anymore in the main routines.

Now, adding back the @inbounds flag to the acceleration and energy computation loops, time goes to 500 ms, 171.78 KiB.

If one considers this speedup reliable (~30%), that would put the Julia benchmark there in pair with the python and rust codes.

I am running with this data, with

using BenchmarkTools
include("./nbabel.jl")
using .NB
@btime NBabel("./input128.dat")

Code



module NB

"""
This is an implementation of the NBabel N-body problem.
See nbabel.org for more information.
This is 'naive & native' Julia with type and loop annotations for
fastmath and vectorization, because this is really is no effort.
Without annotations the code will be as slow as naive Python/IDL.
    include("nbabel.jl")
    NBabel("IC/input128", show=true)
Julius Donnert INAF-IRA 2017
"""

using Printf
using DelimitedFiles
using StaticArrays

tbeg = 0.0
tend = 10.0
dt = 0.001

function NBabel(fname::String; tend = tend, dt = dt, show=false)

    if show
	    println("Reading file : $fname")
    end

    ID, mass, pos, vel = read_ICs(fname)

    return NBabelCalcs(ID, mass, pos, vel, tend, dt, show)
end

function NBabelCalcs(ID, mass, pos, vel, tend = tend, dt = dt, show=false)
    acc = similar(vel)
    acc = compute_acceleration(pos, mass, acc)
    last_acc = copy(acc)

    Ekin, Epot = compute_energy(pos, vel, mass)
    Etot_ICs = Ekin + Epot

    t = 0.0
    nstep = 0

    while t < tend

        pos = update_positions(pos, vel, acc, dt)

        acc, last_acc = last_acc, acc

        acc = compute_acceleration(pos, mass, acc)

        vel = update_velocities(vel, acc, last_acc, dt)
        
        t += dt
        nstep += 1

        if show && nstep%100 == 0

            Ekin, Epot = compute_energy(pos, vel, mass)
            Etot = Ekin + Epot
            dE = (Etot - Etot_ICs)/Etot_ICs

            @printf "t = %g, Etot=%g, Ekin=%g, Epot=%g, dE=%g \n" t Etot Ekin Epot dE
        end
    end

    Ekin, Epot = compute_energy(pos, vel, mass)
    Etot = Ekin + Epot
    return (; Ekin, Epot, Etot)
    # return size(pos,1)
end

function update_positions(pos, vel, acc, dt)
    N = length(pos)
    for i in 1:N
        pos[i] = (0.5 * acc[i] * dt + vel[i])*dt + pos[i]
    end
    return pos
end

function update_velocities(vel, acc, last_acc, dt)
    N = length(vel)
    for i in 1:N
        vel[i] = vel[i] + 0.5 * dt * (acc[i] + last_acc[i])
    end
    return vel
end

"""
Force calculation.
"""
function compute_acceleration(pos, mass, acc)
    N = length(pos)
    for i in 1:N
      acc[i] = zeros(eltype(acc)) 
    end
    @inbounds for i = 1:N-1
        for j = i+1:N
            dr = pos[i] - pos[j]
            rinv3 = 1/sqrt(sum(dr .^ 2)) ^ 3
            acc[i] = acc[i] - mass[i] * rinv3 * dr
            acc[j] = acc[j] + mass[j] * rinv3 * dr
        end
    end
    return acc
end


"""
Kinetic and potential energy.
"""
function compute_energy(pos, vel, mass)
    N = length(vel)

    Ekin = 0.0

    for i = 1:N
        Ekin += 0.5 * mass[i] * sum(vel[i] .^ 2)
    end

    Epot = 0.0

    @inbounds for i = 1:N-1
        for j = i+1:N
            dr = pos[i] - pos[j]
            rinv = 1/sqrt(sum(dr .^ 2))
            Epot -= mass[i] * mass[j] * rinv
        end
    end

    return Ekin, Epot
end

function read_ICs(fname::String)

    ICs = readdlm(fname)

    N = size(ICs,1)

    pos = Array{SVector{3,Float64}}(undef, N)
    vel = Array{SVector{3,Float64}}(undef, N)

    id = @view ICs[:, 1]
    mass = @view ICs[:, 2]

    for i in axes(ICs, 1)
        pos[i] = SVector{3,Float64}(ICs[i, 3], ICs[i, 4], ICs[i, 5])
    end

    for i in axes(ICs, 1)
        vel[i] = SVector{3,Float64}(ICs[i, 6], ICs[i, 7], ICs[i, 8])
    end

    return id, mass, pos, vel
end

export NBabel

end


#function main(args)
#    NBabel(args[1], show=true)
#end
#main(ARGS)

Topic		Replies	Views
Speeding up some non-optimized Julia functions Performance	47	1163	July 19, 2022
Applying performance tips in library Performance	0	318	July 28, 2021
Very slow execution time in comparison even to Python Performance	35	2014	October 29, 2020
Any good OpenCL examples to demonstrate a speedup? GPU	4	1382	April 3, 2019
Improving Barnes-Hut n-body simulation performance Performance	14	2349	September 19, 2020

Nbabel nbody integrator speed up

Related topics