And once again, Julia with SArrays
challenges the king (aka Intel Fortran)!
I wrote the OP’s code using StaticArrays
after removing unused variables and with no structs. I also wrote the equivalent Fortran version using arrays. Here is the amazing result:
Time: 85.384 ms # Julia (-O3 --check-bounds=no)
Time: 88.630 ms # Intel Fortran (-Ofast)
For many years, I used to trust nothing but ifort
for writing any performance sensitive code, now Julia is becoming a real game changer. The only thing I dream of is being able to write vectorized code in Julia that doesn’t allocate without the need for adding @views
, @uviews
, or other ugly annotations. Here is the full code for both languages for reference:
# Julia Version
function vector1D(c1::Float64, c2::Float64, box_size::Float64)
if c1 < c2
return (c2 - c1) < (c1 - c2 + box_size) ? (c2 - c1) : (c2 - c1 - box_size)
else
return (c1 - c2) < (c2 - c1 + box_size) ? (c2 - c1) : (c2 - c1 + box_size)
end
end
@inline function SingleLJ2!(i, j, f1, coord1, coord2, ϵ::Float64, σ::Float64, box_size)
dx = vector1D(coord1[1], coord2[1], box_size[1])
dy = vector1D(coord1[2], coord2[2], box_size[2])
dz = vector1D(coord1[3], coord2[3], box_size[3])
rij_sq = dx*dx + dy*dy + dz*dz
(rij_sq > 1.0) && return
sr2 = σ^2 / rij_sq
# rmag = sqrt(rij_sq)
sr6 = sr2 ^ 3
sr12 = sr6 ^ 2
f = ((24 * ϵ) / rij_sq) * (2 * sr12 - sr6)
x = -f * dx
y = -f * dy
z = -f * dz
f1[i] += SVector(x,y,z)
f1[j] -= SVector(x,y,z)
nothing
end
function TotalEnergyStructOfArrays(f, r, typ, vdwTable, n, box_size)
for i = 1:(n-1)
ti = typ[i]
for j = (i+1):n
tj = typ[j]
SingleLJ2!(i, j, f, r[i], r[j], vdwTable[1][ti,tj], vdwTable[2][ti,tj], box_size)
end
end
return
end
################################################################################
# Some general parameters used by all trials
################################################################################
using BenchmarkTools, StaticArrays
box_size = (2.0,2.0,2.0)
x = 3011 # dummy number of atoms
m = 4 # dummy number of atom types (for table of parameters...)
f = [SVector(0.0,0,0) for i=1:x]
r = [SVector(rand(3)...) .* box_size[1] for i=1:x]
typ = trunc.(Int, [ceil(rand())*m for i=1:x])
vdwTable = [@SMatrix rand(m,m) for i=1:2]
################################################################################
# Testing Struct of Arrays
################################################################################
println("------------------------Start of Testing SoA------------------------------")
print("Struct of Arrays: ")
@btime TotalEnergyStructOfArrays($f, $r, $typ, $vdwTable, $x, $box_size)
println("--------------------------End of Testing SoA------------------------------")
# Output:
# ------------------------Start of Testing SoA------------------------------
# Struct of Arrays: 85.384 ms (0 allocations: 0 bytes)
# --------------------------End of Testing SoA------------------------------
And
! Fortran version
program structOfArrays
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Some general parameters used by all trials
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
integer, parameter :: x = 3011 ! dummy number of atoms
integer, parameter :: m = 4 ! dummy number of atom types (for table of parameters...)
real(8) :: box_size(3) = [2.0,2.0,2.0], r(3,x), f(3,x) = 0, vdwTable(m,m,2)
integer :: typ(x)
integer :: t0, t1, count_rate, count_max
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Start of trials
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
call random_number(r)
typ = m * ceiling(r(3,:))
r = r * box_size(1)
call random_number(vdwTable)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Testing Struct of Arrays
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
print *, "------------------------Start of Testing SoA------------------------------"
print *, "Struct of Arrays: "
call system_clock(t0, count_rate, count_max)
do iter = 1,100
call TotalEnergyStructOfArrays(typ, f, r, vdwTable, x, box_size)
end do
call system_clock(t1)
print*, 'Time:', real(t1-t0)/count_rate
print *, "--------------------------End of Testing SoA------------------------------"
contains
real(8) function vector1D(c1, c2, box_size)
real(8) :: c1, c2, box_size
if (c1 < c2) then
if (c2 - c1 < c1 - c2 + box_size) then
vector1D = c2 - c1
else
vector1D = c2 - c1 - box_size
end if
else
if (c1 - c2 < c2 - c1 + box_size) then
vector1D = c2 - c1
else
vector1D = c2 - c1 + box_size
end if
end if
end
subroutine SingleLJ2(f1, f2, coord1, coord2, epsilon, sigma, box_size)
real(8) :: f1(:), f2(:), coord1(:), coord2(:), box_size(:), epsilon, sigma, dx, dy, dz, rij_sq, x, y, z, f
dx = vector1D( coord1(1), coord2(1), box_size(1) )
dy = vector1D( coord1(2), coord2(2), box_size(2) )
dz = vector1D( coord1(3), coord2(3), box_size(3) )
rij_sq = dx*dx + dy*dy + dz*dz
if (rij_sq > 1.0) return
sr2 = sigma**2 / rij_sq
! rmag = sqrt(rij_sq)
sr6 = sr2**3
sr12 = sr6**2
f = 24*epsilon / rij_sq * (2*sr12 - sr6)
x = -f * dx
y = -f * dy
z = -f * dz
f1 = f1 + [x, y, z]
f2 = f2 - [x, y, z]
end
subroutine TotalEnergyStructOfArrays(typ, f, r, vdwTable, n, box_size)
real(8) :: f(:,:), r(:,:), vdwTable(:,:,:), box_size(:)
integer :: typ(:), n
do i = 1, (n-1)
ti = typ(i)
do j = (i+1), n
tj = typ(j)
call SingleLJ2(f(:,i), f(:,j), r(:,i), r(:,j), vdwTable(ti,tj,1), vdwTable(ti,tj,2), box_size)
end do
end do
end
end program structOfArrays
! Output:
! ------------------------Start of Testing SoA------------------------------
! Struct of Arrays:
! Time: 8.863000 ! x 10 = 88.63 ms
! --------------------------End of Testing SoA------------------------------