I would like to ask you for help to improve a call to a critical routine written in Fortran. A minimal version of the library is the following
Fortran library source
module fortran_lib
contains
subroutine density_gradient( &
point, &
nmo, &
ncent, &
nprims, &
mgrp, &
ngtoH, &
ngroup, &
ityp, &
nzexp, &
nlm, &
nuexp, &
occ, &
oexp, &
xyz, &
rcutte, &
coef, &
grad_rho) &
bind(C, name='density_gradient')
use iso_c_binding ! C-compatible types
implicit none
! input variables:
real(c_double), dimension(3), intent(in) :: point
integer(c_long), intent(in) :: nmo, ncent, nprims, mgrp, ngtoH
integer(c_long), dimension(ncent), intent(in) :: ngroup
integer(c_long), dimension(nprims), intent(in) :: ityp
integer(c_long), dimension(ncent,mgrp), intent(in) :: nzexp
integer(c_long), dimension(56,3), intent(in) :: nlm ! magic number
integer(c_long), dimension(ncent,mgrp,ngtoH), intent(in) :: nuexp
real(c_double), dimension(nmo), intent(in) :: occ
real(c_double), dimension(nprims), intent(in) :: oexp
real(c_double), dimension(ncent,3), intent(in) :: xyz
real(c_double), dimension(ncent,mgrp), intent(in) :: rcutte
real(c_double), dimension(2*nmo,nprims), intent(in) :: coef
! output variables:
real(c_double), dimension(3), intent(inout) :: grad_rho
! some code goes here
end subroutine
end module fortran_lib
The wrapper is:
Wrapper code
const RHOGRADLIB = joinpath(@__DIR__, "libdensity_gradient.so")
const RHOGRADSOURCE = joinpath(@__DIR__, "density_gradient.f90")
run(`gfortran -Wall -Wno-unused-variable -Wno-unused-dummy-argument -shared -O3 -fPIC -o $RHOGRADLIB $RHOGRADSOURCE`)
function fortran_density_gradient(∇ρ, point, wfn::AIM_WFN_Gamess_Mol, t)
ngtoH = size(wfn.nuexp, 3)
@static if VERSION >= v"1.5.0" # clearer syntax
@ccall RHOGRADLIB.density_gradient(
point :: Ptr{MArray{Tuple{3},Cdouble,1,3}},
wfn.nmo :: Ref{Clong},
wfn.ncent :: Ref{Clong},
wfn.nprims :: Ref{Clong},
wfn.maxgrp :: Ref{Clong},
ngtoH :: Ref{Clong},
wfn.ngroup :: Ptr{Array{Clong,1}},
wfn.ityp :: Ptr{Array{Clong,1}},
wfn.nzexp :: Ptr{Array{Clong,2}},
wfn.nlm :: Ptr{Array{Clong,2}},
wfn.nuexp :: Ptr{Array{Clong,3}},
wfn.occ :: Ptr{Array{Cdouble,1}},
wfn.oexp :: Ptr{Array{Cdouble,1}},
wfn.xyz :: Ptr{Array{Cdouble,2}},
wfn.rcutte :: Ptr{Array{Cdouble,2}},
wfn.coef :: Ptr{Array{Cdouble,2}},
∇ρ :: Ptr{MArray{Tuple{3},Cdouble,1,3}}, # inout
)::Cvoid
else # works anyway
# Argument types are specified with a tuple,
# (Any,) isa Tuple => true
# (Any) isa Tuple => false
ccall( (:density_gradient, RHOGRADLIB), Cvoid, (
Ptr{SArray{Tuple{3},Cdouble,1,3}},
Ref{Clong},
Ref{Clong},
Ref{Clong},
Ref{Clong},
Ref{Clong},
Ptr{Array{Clong,1}},
Ptr{Array{Clong,1}},
Ptr{Array{Clong,2}},
Ptr{Array{Clong,2}},
Ptr{Array{Clong,3}},
Ptr{Array{Cdouble,1}},
Ptr{Array{Cdouble,1}},
Ptr{Array{Cdouble,2}},
Ptr{Array{Cdouble,2}},
Ptr{Array{Cdouble,2}},
Ptr{SArray{Tuple{3},Cdouble,1,3}}, ),
point ,
wfn.nmo ,
wfn.ncent ,
wfn.nprims ,
mgrp ,
ngtoH ,
wfn.ngroup ,
wfn.ityp ,
wfn.nzexp ,
wfn.nlm ,
wfn.nuexp ,
wfn.occ ,
wfn.oexp ,
wfn.xyz ,
wfn.rcutte ,
wfn.coef ,
grad_rho # inout
)
end
∇ρ
end
Finally, I call it with
using StaticArrays
using BenchmarkTools
# Data structure
struct AIM_WFN_Gamess_Mol
nmo; nprims; ncent; atnam; xyz; charge; icen; ityp; oexp;
occ; eorb; coef; tote; virial; npc; icenat; nlm; ngroup;
nzexp; nuexp; npcant; maxgrp; numshells; cuttz; rcutte
end
# Wrapper around Fortran routine
include("density_gradient.jl")
# Load wfn data [just a dummy example]
nmo, nprims, ncent, mgrp, numshells, ngtoH = 2, 2, 1, 3, 2, 4
cuttz = 1e-15
wfn = AIM_WFN_Gamess_Mol(nmo, nprims, ncent, ["H","H"], zeros(ncent,3),
zeros(ncent), zeros(Int,ncent), zeros(Int,ncent),
zeros(nprims), zeros(nmo), zeros(nmo), zeros(nmo,nmo),
0.0, 0.0, 3, zeros(Int,nprims, ncent), zeros(Int,56,3),
zeros(Int,ncent), zeros(Int, ncent, mgrp),
zeros(Int, ncent, mgrp, ngtoH), 0, maxgrp, numshells,
cuttz, zeros(ncent, maxgrp)
)
v = zeros(MVector{3}) # choose a dummy point
@benchmark fortran_density_gradient($v, $v, $wfn, 0)
#BenchmarkTools.Trial:
# memory estimate: 320 bytes # I would like to reduce this to 0
# allocs estimate: 20
# --------------
# minimum time: 7.319 μs (0.00% GC)
# median time: 7.388 μs (0.00% GC)
# mean time: 7.525 μs (0.00% GC)
# maximum time: 46.887 μs (0.00% GC)
# --------------
# samples: 10000
# evals/sample: 4
# When the volume of calls is large (our case), those 320 bytes matter a lot.
@time for i in 1:1e6 fortran_density_gradient(v, v, wfn, 0) end
# 7.380677 seconds (20.00 M allocations: 305.176 MiB, 0.13% gc time)
# Becomes very slow and memory hungry
In a normal execution of the application, this routine has to be called millions of times. Therefore, it has to be very efficient. Could allocations be avoided in this case?
Version info
Julia Version 1.5.0
Commit 96786e22cc (2020-08-01 23:44 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin18.7.0)
CPU: Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, skylake)
Environment:
JULIA_NUM_THREADS = 1
JULIA_EDITOR = view