I am trying to optimize a lennard-jones force calculation. When I first wrote it, It used millions of allocations and over 1 giB of memory, and these both increased if I looped over the algorithm. I have gotten it down to 2 allocations and 224 bytes of memory. The biggest factor was wrapping everything (functions and variable initialization) in a big function. However, it is not fast enough yet. I need another order of magnitude or else the code will be too long to execute. It will take weeks to do what Fortran or C can do in a day.

I tried adding @threads and using 4 threads but it speeds it up by less than 10% and the results are highly variable. I do not know if I am using it properly, and I am not knowledgeable enough on the other parallel options.

The code below is actually in two parts since I am also experimenting with Array of Structs vs. Struct of Arrays. They both seem about the same speedā¦ I have labelled all parts clearly. I leave both in (sorry it makes the code seem long) since one method may be preferable.

Any advice on how to either parallelize it, or write the code better would be appreciated! An order of magnitude speedup would be amazingā¦

Each test has 2 functions, the main begins with `TotalEnergy`

and the inner begins with `Single`

. These are the only two functions that can be played with, everything else just sets up the necessary structs and default values.

The first 95 lines can be skipped, sorry for the length, but they set it upā¦ The issue is only the functions that start with either `Single`

or `TotalEnergy`

/

i.e.,

```
function SingleLJ2!(f1::Vector{Float64},f2::Vector{Float64},
coord1::XYZ,coord2::XYZ, Ļµ::Float64, Ļ::Float64, box_size)
dx = vector1D(coord1.x, coord2.x, box_size[1])
dy = vector1D(coord1.y, coord2.y, box_size[2])
dz = vector1D(coord1.z, coord2.z, box_size[3])
rij_sq = dx*dx + dy*dy + dz*dz
if rij_sq > 1.0
fill!(f1,0.0)
fill!(f2,0.0)
return
else
sr2 = Ļ^2 / rij_sq
rmag = sqrt(rij_sq)
sr6 = sr2 ^ 3
sr12 = sr6 ^ 2
f = ((24 * Ļµ) / (rij_sq)) * (2 * sr12 - sr6)
f1[1] = -f * dx
f1[2] = -f * dy
f1[3] = -f * dz
f2 .= .-f1
end
nothing
end
function TotalEnergyStructOfArrays(array::StructOfArrays,vdwTable::Table,
n::Int64,box_size::Vector{Float64})
f1 = Vector{Float64}(undef,3)
f2 = Vector{Float64}(undef,3)
for k=1:1 # this loop is redundant now, it used to increase memory and allocations
@threads for i = 1:(n-1)
for j = (i+1):n
ti = array.type[i]
tj = array.type[j]
SingleLJ2!(f1,f2,array.r[i],array.r[i],vdwTable.Ļµij[ti,tj],
vdwTable.Ļij[ti,tj],box_size)
array.f[i].x += f1[1]
array.f[i].y += f1[2]
array.f[i].z += f1[3]
array.f[j].x += f2[1]
array.f[j].y += f2[2]
array.f[j].z += f2[3]
end
end
end
return
end
```

Here is the full code for if anyone wished to run it, but the above segment of code is all that I am interested in optimizing (and the equivalent Array of Structures version) and also parallelizing.

```
#############################################################################
#
# A general struct (Table) and two functions vector1D and vector.
# - vector1D and vector are taken from Molly
# https://github.com/jgreener64/Molly.jl/blob/master/src/md.jl
################################################################################
using Distributed
using Base.Threads
struct Table # given dummy values below
Ļµij::Array{Float64,2}
Ļij::Array{Float64,2}
end
mutable struct XYZ
x::Float64
y::Float64
z::Float64
end
mutable struct ArrayOfStructs
r::XYZ
v::XYZ
f::XYZ
type::Int64
cgnr::Int64
name::String
mass::Float64
qq::Float64
mol::String
end
mutable struct StructOfArrays
r::Vector{XYZ}
v::Vector{XYZ}
f::Vector{XYZ}
type::Vector{Int64}
cgnr::Vector{Int64}
name::Vector{String}
mass::Vector{Float64}
qq::Vector{Float64}
mol::Vector{String}
end
"Vector between two coordinate values, accounting for mirror image seperation"
function vector1D(c1::Float64, c2::Float64, box_size::Float64)
if c1 < c2
return (c2 - c1) < (c1 - c2 + box_size) ? (c2 - c1) : (c2 - c1 - box_size)
else
return (c1 - c2) < (c2 - c1 + box_size) ? (c2 - c1) : (c2 - c1 + box_size)
end
end
vector(coords_one::XYZ, coords_two::XYZ, box_size::Float64) =
[vector1D(coords_one.x, coords_two.x, box_size),
vector1D(coords_one.y, coords_two.y, box_size),
vector1D(coords_one.z, coords_two.z, box_size)]
################################################################################
# Some general parameters used by all trials
################################################################################
function main()
box_size = [2.0,2.0,2.0]
x = 3011 # dummy number of atoms
m = 4 # dummy number of atom types (for table of parameters...)
println("This program is using ", Threads.nthreads(), " threads")
vdwTable = Table([rand() for i=1:m,j=1:m],
[rand() for i=1:m,j=1:m]) #,
################################################################################
# Start of trials
################################################################################
arrayOfStructs = Vector{ArrayOfStructs}(undef,x)
for i=1:x
arrayOfStructs[i] = ArrayOfStructs( XYZ(box_size[1] * rand(), box_size[1] * rand(), box_size[1] * rand() ),
XYZ(0.0, 0.0, 0.0),
XYZ(0.0, 0.0, 0.0),
ceil(rand()*m),15,"hi",14.1,1.2,"mol" )
end
structOfArrays = StructOfArrays([XYZ(box_size[1] * rand(), box_size[1] * rand(), box_size[1] * rand()) for i=1:x],
[XYZ(0.0, 0.0, 0.0) for i=1:x ],
[XYZ(0.0, 0.0, 0.0) for i=1:x ],
[ceil(rand())*m for i=1:x],
[12 for i=1:x],
["hi" for i=1:x],
[rand()*5.1 for i=1:x],
[rand()*1.3 for i=1:x],
["string" for i=1:x])
################################################################################
#
# Testing Array of Structs
#
################################################################################
function SingleLJ!(f1::Vector{Float64},f2::Vector{Float64},
coord1::XYZ,coord2::XYZ, Ļµ::Float64, Ļ::Float64, box_size)
# method 1 # 9.99 M allocations: 685.959 MiB
#diff = Vector{Float64}(undef,3)
#diff[1] = vector1D(coord1.x, coord2.x, box_size[1])
#diff[2] = vector1D(coord1.y, coord2.y, box_size[2])
#diff[3] = vector1D(coord1.z, coord2.z, box_size[3])
# method 2 # 9.99 M allocations: 685.959 MiB
# diff[1] = coord1.x - coord2.x
# diff[2] = coord1.y - coord2.y
# diff[3] = coord1.z - coord2.z
# diff .= diff .- round.(diff./box_size) .* box_size # technically a bit more rigorous than method 1 and 3 # mirror image seperation
#method 3 29.97 M allocations: 1.414 GiB
#diff = vector(coord1, coord2, box_size[1]) # THis is very heavy on allocations
#rij_sq = sum(diff.^2) # use with methods 1,2,3
# method 4 2 allocations: 224 bytes
dx = vector1D(coord1.x, coord2.x, box_size[1])
dy = vector1D(coord1.y, coord2.y, box_size[2])
dz = vector1D(coord1.z, coord2.z, box_size[3])
rij_sq = dx*dx + dy*dy + dz*dz # use only with method 4
if rij_sq > 1.0
fill!(f1,0.0)
fill!(f2,0.0)
return
else
sr2 = Ļ^2 / rij_sq
rmag = sqrt(rij_sq)
sr6 = sr2 ^ 3
sr12 = sr6 ^ 2
f = ((24 * Ļµ) / (rij_sq)) * (2 * sr12 - sr6)
#f1[1] = -f * diff[1]
#f1[2] = -f * diff[2]
#f1[3] = -f * diff[3]
#f2 .= .-f1
f1[1] = -f * dx
f1[2] = -f * dy
f1[3] = -f * dz
f2 .= .-f1
end
nothing
end
function TotalEnergyArrayOfStructs(array::Vector{ArrayOfStructs},vdwTable::Table,
n::Int64,box_size::Vector{Float64})
f1 = Vector{Float64}(undef,3)
f2 = Vector{Float64}(undef,3)
for k=1:1 # this loop is redundant... I do not understand why allocations increase linearly with this loop...
@threads for i = 1:(n-1)
for j = (i+1):n
ti = array[i].type
tj = array[j].type
SingleLJ!(f1,f2,array[i].r,array[j].r,vdwTable.Ļµij[ti,tj],
vdwTable.Ļij[ti,tj],box_size)
array[i].f.x += f1[1]
array[i].f.y += f1[2]
array[i].f.z += f1[3]
array[j].f.x += f2[1]
array[j].f.y += f2[2]
array[j].f.z += f2[3]
end
end
end
return
end
TotalEnergyArrayOfStructs(arrayOfStructs,vdwTable,x,box_size) # precompile
println("------------------------Start of Testing AoS------------------------------")
print("array of structs: ")
@time TotalEnergyArrayOfStructs(arrayOfStructs,vdwTable,x,box_size)
println("--------------------------End of Testing AoS------------------------------")
################################################################################
#
# Testing Struct of Arrays
#
################################################################################
function SingleLJ2!(f1::Vector{Float64},f2::Vector{Float64},
coord1::XYZ,coord2::XYZ, Ļµ::Float64, Ļ::Float64, box_size)
dx = vector1D(coord1.x, coord2.x, box_size[1])
dy = vector1D(coord1.y, coord2.y, box_size[2])
dz = vector1D(coord1.z, coord2.z, box_size[3])
rij_sq = dx*dx + dy*dy + dz*dz
if rij_sq > 1.0
fill!(f1,0.0)
fill!(f2,0.0)
return
else
sr2 = Ļ^2 / rij_sq
rmag = sqrt(rij_sq)
sr6 = sr2 ^ 3
sr12 = sr6 ^ 2
f = ((24 * Ļµ) / (rij_sq)) * (2 * sr12 - sr6)
f1[1] = -f * dx
f1[2] = -f * dy
f1[3] = -f * dz
f2 .= .-f1
end
nothing
end
function TotalEnergyStructOfArrays(array::StructOfArrays,vdwTable::Table,
n::Int64,box_size::Vector{Float64})
f1 = Vector{Float64}(undef,3)
f2 = Vector{Float64}(undef,3)
for k=1:1 # this loop is redundant now, it used to increase memory and allocations
@threads for i = 1:(n-1)
for j = (i+1):n
ti = array.type[i]
tj = array.type[j]
SingleLJ2!(f1,f2,array.r[i],array.r[i],vdwTable.Ļµij[ti,tj],
vdwTable.Ļij[ti,tj],box_size)
array.f[i].x += f1[1]
array.f[i].y += f1[2]
array.f[i].z += f1[3]
array.f[j].x += f2[1]
array.f[j].y += f2[2]
array.f[j].z += f2[3]
end
end
end
return
end
TotalEnergyStructOfArrays(structOfArrays,vdwTable,x,box_size) # precompile
println("------------------------Start of Testing SoA------------------------------")
print("Struct of Arrays: ")
@time TotalEnergyStructOfArrays(structOfArrays,vdwTable,x,box_size)
println("--------------------------End of Testing SoA------------------------------")
end
main()
```