I have a function that is going to be called many hundreds of thousands of times so I want it optimized maximally. To reduce allocations, I decided I would create a struct
the stored some buffer arrays, then I would assign values to them “in-place” to avoid repeated allocations. Here is the function:
function propose_shear_step(atoms::ase.atoms,stepsize::Float64; buffers::Union{NS_buffers,Nothing} = nothing)
rmv_vec = rand(1:3)
# rmv_vec = rand(Xoshiro(),1:3)
# Don't allocate a vector of indices, a tuple is better for memory management.
remaining_vecs = rmv_vec == 1 ? (2,3) : rmv_vec == 2 ? (1,3) : (1,2)
new_cell = buffers !== nothing ? buffers.new_cell : MMatrix{3,3,Float64,9}(undef)
new_cell .= atoms.lVecs
v1 = SVector{3,Float64}(atoms.lVecs[:,remaining_vecs[1]])
v2 = SVector{3,Float64}(atoms.lVecs[:,remaining_vecs[2]])
norm_v1 = sqrt(dot(v1,v1))
v1 = v1 / norm_v1
v2 = v2 - v1 * dot(v1, v2) # Make v2 orthogonal to v1
norm_v2 = sqrt(dot(v2,v2))
v2 = v2/norm_v2
if abs(dot(v1, v2)) > 1e-4
println("v1 and v2 are not orthogonal, something is wrong.")
end
rv = MVector{2,Float64}(undef)
rand!(Xoshiro(), Normal(0, stepsize), rv)
rv1, rv2 = rv
# rv1,rv2 = rand(Normal(0,stepsize),2)
new_cell[:,rmv_vec] = atoms.lVecs[:,rmv_vec] + rv1 * v1 + rv2 * v2
inv_lVecs = buffers !== nothing ? buffers.inv_lVecs : inv(SMatrix{3,3,Float64,9}(atoms.lVecs))
if buffers !== nothing
mul!(buffers.transform,new_cell,buffers.inv_lVecs)
return (1.0,SMatrix{3,3,Float64,9}(buffers.transform))
else
inv_lVecs = inv(SMatrix{3,3,Float64,9}(atoms.lVecs))
transform = new_cell * inv_lVecs
return (1.0,SMatrix{3,3,Float64,9}(transform))
end
end
and here is what ase.atoms
, and NS_buffers
look like:
struct NS_buffers
new_cell :: MMatrix{3,3,Float64,9}
transform :: MMatrix{3,3,Float64,9}
inv_lVecs :: SMatrix{3,3,Float64,9}
end
mutable struct atoms
title::String
latpar::Float64
lVecs::MMatrix{3,3,Float64,9}
coordSys::Vector{String} # 'D' for Direct or 'C' for cartesian
positions:: Vector{SVector{3,Float64}} # List of all the atomic basis vector separated by type
velocities:: Vector{SVector{3,Float64}} # Velocities of the atoms
masses:: Vector{Float64}
atomTypes:: Vector{Int64} # Integers representing the type for each basis atom
species:: Vector{String} # Species of the atoms
FP_total_energy:: Float64 # First principles total or peratom energy
model_energy:: Float64 # Energy that should be used in the fitting process
lj_vec::Vector{Float64} # Vector of 1/r^6 and 1/r^12 values for each unique pair of atom types
end
But the results of @btime indicate that the performance is worse when I send in the optional buffers as compared to not sending buffers in at all. I’ve always had a hard time identifying allocations. Any help here? Thanks in advance