In the Fortran and Python implementations the loop to compute accelerations is done properly (N(N-1)/2) cycles:

```
do i = 1, P%nstars - 1
do j = i + 1, P%nstars
```

```
for index_p0 in range(nb_particules - 1):
for index_p1 in range(index_p0 + 1, nb_particules):
```

while in the Julia implementation it is not (N^2 cycles):

```
@inbounds @Threads.threads for i = 1:N
@fastmath @inbounds @simd for j = 1:N
if i != j
```

Concerning the code:

My “base time is” 661 ms (8.41 MB allocations), for the code provided.

Removing all the decorations (simd, inbounds, threads, etc), actually made the code faster (586 ms).

Fixing the loop above and using static arrays took the time to 581 ms and allocations went to 174 kb. No allocations occur anymore in the main routines.

Now, adding back the `@inbounds`

flag to the acceleration and energy computation loops, time goes to 500 ms, 171.78 KiB.

If one considers this speedup reliable (~30%), that would put the Julia benchmark there in pair with the python and rust codes.

I am running with this data, with

```
using BenchmarkTools
include("./nbabel.jl")
using .NB
@btime NBabel("./input128.dat")
```

##
Code

```
module NB
"""
This is an implementation of the NBabel N-body problem.
See nbabel.org for more information.
This is 'naive & native' Julia with type and loop annotations for
fastmath and vectorization, because this is really is no effort.
Without annotations the code will be as slow as naive Python/IDL.
include("nbabel.jl")
NBabel("IC/input128", show=true)
Julius Donnert INAF-IRA 2017
"""
using Printf
using DelimitedFiles
using StaticArrays
tbeg = 0.0
tend = 10.0
dt = 0.001
function NBabel(fname::String; tend = tend, dt = dt, show=false)
if show
println("Reading file : $fname")
end
ID, mass, pos, vel = read_ICs(fname)
return NBabelCalcs(ID, mass, pos, vel, tend, dt, show)
end
function NBabelCalcs(ID, mass, pos, vel, tend = tend, dt = dt, show=false)
acc = similar(vel)
acc = compute_acceleration(pos, mass, acc)
last_acc = copy(acc)
Ekin, Epot = compute_energy(pos, vel, mass)
Etot_ICs = Ekin + Epot
t = 0.0
nstep = 0
while t < tend
pos = update_positions(pos, vel, acc, dt)
acc, last_acc = last_acc, acc
acc = compute_acceleration(pos, mass, acc)
vel = update_velocities(vel, acc, last_acc, dt)
t += dt
nstep += 1
if show && nstep%100 == 0
Ekin, Epot = compute_energy(pos, vel, mass)
Etot = Ekin + Epot
dE = (Etot - Etot_ICs)/Etot_ICs
@printf "t = %g, Etot=%g, Ekin=%g, Epot=%g, dE=%g \n" t Etot Ekin Epot dE
end
end
Ekin, Epot = compute_energy(pos, vel, mass)
Etot = Ekin + Epot
return (; Ekin, Epot, Etot)
# return size(pos,1)
end
function update_positions(pos, vel, acc, dt)
N = length(pos)
for i in 1:N
pos[i] = (0.5 * acc[i] * dt + vel[i])*dt + pos[i]
end
return pos
end
function update_velocities(vel, acc, last_acc, dt)
N = length(vel)
for i in 1:N
vel[i] = vel[i] + 0.5 * dt * (acc[i] + last_acc[i])
end
return vel
end
"""
Force calculation.
"""
function compute_acceleration(pos, mass, acc)
N = length(pos)
for i in 1:N
acc[i] = zeros(eltype(acc))
end
@inbounds for i = 1:N-1
for j = i+1:N
dr = pos[i] - pos[j]
rinv3 = 1/sqrt(sum(dr .^ 2)) ^ 3
acc[i] = acc[i] - mass[i] * rinv3 * dr
acc[j] = acc[j] + mass[j] * rinv3 * dr
end
end
return acc
end
"""
Kinetic and potential energy.
"""
function compute_energy(pos, vel, mass)
N = length(vel)
Ekin = 0.0
for i = 1:N
Ekin += 0.5 * mass[i] * sum(vel[i] .^ 2)
end
Epot = 0.0
@inbounds for i = 1:N-1
for j = i+1:N
dr = pos[i] - pos[j]
rinv = 1/sqrt(sum(dr .^ 2))
Epot -= mass[i] * mass[j] * rinv
end
end
return Ekin, Epot
end
function read_ICs(fname::String)
ICs = readdlm(fname)
N = size(ICs,1)
pos = Array{SVector{3,Float64}}(undef, N)
vel = Array{SVector{3,Float64}}(undef, N)
id = @view ICs[:, 1]
mass = @view ICs[:, 2]
for i in axes(ICs, 1)
pos[i] = SVector{3,Float64}(ICs[i, 3], ICs[i, 4], ICs[i, 5])
end
for i in axes(ICs, 1)
vel[i] = SVector{3,Float64}(ICs[i, 6], ICs[i, 7], ICs[i, 8])
end
return id, mass, pos, vel
end
export NBabel
end
#function main(args)
# NBabel(args[1], show=true)
#end
#main(ARGS)
```