If anyone is curious, here are the portions of each script where the majority of the time is spent. Each of these functions is run on a vector of 5 `Body`s in a loop something like:
```julia
@inbounds for i in 1:length(bodies), j in i+1:length(bodies)
    advance_velocity!(bodies[i], bodies[j], 0.01)
end
```
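For context, a harness along these lines reproduces the hot loop. This is a hypothetical sketch, not the original benchmark script: `advance_all!`, the step count, and the use of BenchmarkTools are assumptions for illustration, and `advance_velocity!` refers to the mutable version defined below (the immutable version's loop looks different; see the sketch after it).

```julia
# Hypothetical timing harness (not the original benchmark script).
# Assumes `bodies::Vector{Body}` has already been initialized with 5 bodies.
using BenchmarkTools

function advance_all!(bodies, dt, nsteps)
    for _ in 1:nsteps
        @inbounds for i in 1:length(bodies), j in i+1:length(bodies)
            advance_velocity!(bodies[i], bodies[j], dt)
        end
    end
end

@btime advance_all!($bodies, 0.01, 1_000)
```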
Faster on old CPU
This version was slightly modified to extract the body of the inner loop into a separate function; doing so makes no measurable performance difference on the new CPU.
```julia
mutable struct Body
    x::Float64
    y::Float64
    z::Float64
    vx::Float64
    vy::Float64
    vz::Float64
    mass::Float64
end

function advance_velocity!(bi::Body, bj::Body, dt::Float64)
    dx = bi.x - bj.x
    dy = bi.y - bj.y
    dz = bi.z - bj.z
    dsq = dx^2 + dy^2 + dz^2
    distance = sqrt(dsq)
    mag = dt / (dsq * distance)
    # Apply the mutual gravitational pull for this timestep, in place.
    bi.vx -= dx * bj.mass * mag
    bi.vy -= dy * bj.mass * mag
    bi.vz -= dz * bj.mass * mag
    bj.vx += dx * bi.mass * mag
    bj.vy += dy * bi.mass * mag
    bj.vz += dz * bi.mass * mag
end
```
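As a quick illustration of what the mutable version does, here is a toy call. The values are made up for the example, not the benchmark's initial conditions:

```julia
# Two toy bodies; positions, velocities, and masses are illustrative only.
a = Body(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0)
b = Body(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0)
advance_velocity!(a, b, 0.01)
# Each body is pulled toward the other, scaled by the other's mass:
# a.vx ≈ 0.02, b.vx ≈ -0.01
```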
Faster on new CPU
WARNING: this code commits type piracy on tuples and should not be run in a REPL session you would like to preserve.
```julia
# 4 floats in a tuple instead of 3 generates better SIMD instructions
const V3d = NTuple{4,Float64}
V3d(x=0.0, y=0.0, z=0.0) = (Float64(x), Float64(y), Float64(z), 0.0)

Base.sum(v::V3d) = @inbounds +(v[1], v[2], v[3])

struct Body
    pos::V3d
    v::V3d
    m::Float64
end

Base.@propagate_inbounds function update_velocity(b1, b2, Δt)
    Δpos = b1.pos .- b2.pos
    d² = sum(Δpos .* Δpos)
    mag = Δt / (d² * √d²)
    # Return updated copies; muladd fuses the multiply-add per component.
    (Body(b1.pos, muladd.(-b2.m * mag, Δpos, b1.v), b1.m),
     Body(b2.pos, muladd.(b1.m * mag, Δpos, b2.v), b2.m))
end
```
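Because this `Body` is immutable, the pairwise loop from the top has to write the returned copies back into the vector rather than mutating in place. A minimal sketch, assuming `bodies` is a `Vector{Body}`:

```julia
# Pairwise loop for the immutable Body: the updated structs are
# written back into the vector instead of being mutated in place.
@inbounds for i in 1:length(bodies), j in i+1:length(bodies)
    bodies[i], bodies[j] = update_velocity(bodies[i], bodies[j], 0.01)
end
```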
And for the record, on the new CPU, switching between a mutable `Body` and an immutable `Body` doesn't make any appreciable difference in timing.