Help eliminating allocations in a call to a Fortran subroutine

I would like to ask you for help to improve a call to a critical routine written in Fortran. A minimal version of the library is the following

Fortran library source
module fortran_lib

contains

  subroutine density_gradient(        &
                              point,  &
                              nmo,    &
                              ncent,  &
                              nprims, &
                              mgrp,   &
                              ngtoH,  &
                              ngroup, &
                              ityp,   &
                              nzexp,  &
                              nlm,    &
                              nuexp,  &
                              occ,    &
                              oexp,   &
                              xyz,    &
                              rcutte, &
                              coef,   &
                              grad_rho) &
                              bind(C, name='density_gradient')

     use iso_c_binding ! C-compatible types
     implicit none
     ! input variables:
     real(c_double),  dimension(3),                  intent(in) :: point
     integer(c_long),                                intent(in) :: nmo, ncent, nprims, mgrp, ngtoH
     integer(c_long), dimension(ncent),              intent(in) :: ngroup
     integer(c_long), dimension(nprims),             intent(in) :: ityp
     integer(c_long), dimension(ncent,mgrp),         intent(in) :: nzexp
     integer(c_long), dimension(56,3),               intent(in) :: nlm ! magic number
     integer(c_long), dimension(ncent,mgrp,ngtoH),   intent(in) :: nuexp
     real(c_double),  dimension(nmo),                intent(in) :: occ
     real(c_double),  dimension(nprims),             intent(in) :: oexp
     real(c_double),  dimension(ncent,3),            intent(in) :: xyz
     real(c_double),  dimension(ncent,mgrp),         intent(in) :: rcutte
     real(c_double),  dimension(2*nmo,nprims),       intent(in) :: coef

     ! output variables:
     real(c_double),  dimension(3),               intent(inout) :: grad_rho

     ! some code goes here

  end subroutine
end module fortran_lib

The wrapper is:

Wrapper code

const RHOGRADLIB    = joinpath(@__DIR__, "libdensity_gradient.so")
const RHOGRADSOURCE = joinpath(@__DIR__,    "density_gradient.f90")

run(`gfortran -Wall -Wno-unused-variable -Wno-unused-dummy-argument -shared -O3 -fPIC -o $RHOGRADLIB $RHOGRADSOURCE`)

function fortran_density_gradient(∇ρ, point, wfn::AIM_WFN_Gamess_Mol, t)

   ngtoH = size(wfn.nuexp, 3)

   @static if VERSION >= v"1.5.0"  # clearer syntax

      @ccall RHOGRADLIB.density_gradient(
                 point      :: Ptr{MArray{Tuple{3},Cdouble,1,3}},
                 wfn.nmo    :: Ref{Clong},
                 wfn.ncent  :: Ref{Clong},
                 wfn.nprims :: Ref{Clong},
                 wfn.maxgrp :: Ref{Clong},
                 ngtoH      :: Ref{Clong},
                 wfn.ngroup :: Ptr{Array{Clong,1}},
                 wfn.ityp   :: Ptr{Array{Clong,1}},
                 wfn.nzexp  :: Ptr{Array{Clong,2}},
                 wfn.nlm    :: Ptr{Array{Clong,2}},
                 wfn.nuexp  :: Ptr{Array{Clong,3}},
                 wfn.occ    :: Ptr{Array{Cdouble,1}},
                 wfn.oexp   :: Ptr{Array{Cdouble,1}},
                 wfn.xyz    :: Ptr{Array{Cdouble,2}},
                 wfn.rcutte :: Ptr{Array{Cdouble,2}},
                 wfn.coef   :: Ptr{Array{Cdouble,2}},
                 ∇ρ         :: Ptr{MArray{Tuple{3},Cdouble,1,3}}, # inout
             )::Cvoid
   else # works anyway
      # Argument types are specified with a tuple,
      # (Any,) isa Tuple => true
      # (Any)  isa Tuple => false
      ccall( (:density_gradient, RHOGRADLIB), Cvoid, (
                 Ptr{SArray{Tuple{3},Cdouble,1,3}},
                 Ref{Clong},
                 Ref{Clong},
                 Ref{Clong},
                 Ref{Clong},
                 Ref{Clong},
                 Ptr{Array{Clong,1}},
                 Ptr{Array{Clong,1}},
                 Ptr{Array{Clong,2}},
                 Ptr{Array{Clong,2}},
                 Ptr{Array{Clong,3}},
                 Ptr{Array{Cdouble,1}},
                 Ptr{Array{Cdouble,1}},
                 Ptr{Array{Cdouble,2}},
                 Ptr{Array{Cdouble,2}},
                 Ptr{Array{Cdouble,2}},
                 Ptr{SArray{Tuple{3},Cdouble,1,3}}, ),
                 point      ,
                 wfn.nmo    ,
                 wfn.ncent  ,
                 wfn.nprims ,
                 mgrp       ,
                 ngtoH      ,
                 wfn.ngroup ,
                 wfn.ityp   ,
                 wfn.nzexp  ,
                 wfn.nlm    ,
                 wfn.nuexp  ,
                 wfn.occ    ,
                 wfn.oexp   ,
                 wfn.xyz    ,
                 wfn.rcutte ,
                 wfn.coef   ,
                 grad_rho     # inout
           )
   end
   ∇ρ
end

Finally, I call it with

using StaticArrays
using BenchmarkTools

# Data structure
struct AIM_WFN_Gamess_Mol
   nmo; nprims; ncent; atnam; xyz; charge; icen; ityp; oexp;
   occ; eorb; coef; tote; virial; npc; icenat; nlm; ngroup;
   nzexp; nuexp; npcant; maxgrp; numshells; cuttz; rcutte
end

# Wrapper around Fortran routine
include("density_gradient.jl")

# Load wfn data [just a dummy example]
nmo, nprims, ncent, mgrp, numshells, ngtoH = 2, 2, 1, 3, 2, 4
cuttz = 1e-15
wfn = AIM_WFN_Gamess_Mol(nmo, nprims, ncent, ["H","H"], zeros(ncent,3),
                         zeros(ncent), zeros(Int,ncent), zeros(Int,ncent),
                         zeros(nprims), zeros(nmo), zeros(nmo), zeros(nmo,nmo),
                         0.0, 0.0, 3, zeros(Int,nprims, ncent), zeros(Int,56,3),
                         zeros(Int,ncent), zeros(Int, ncent, mgrp),
                         zeros(Int, ncent, mgrp, ngtoH), 0, maxgrp, numshells,
                         cuttz, zeros(ncent, maxgrp)
                        )

v = zeros(MVector{3}) # choose a dummy point

@benchmark fortran_density_gradient($v, $v, $wfn, 0)
#BenchmarkTools.Trial:
#  memory estimate:  320 bytes     # I would like to reduce this to 0
#  allocs estimate:  20
#  --------------
#  minimum time:     7.319 μs (0.00% GC)
#  median time:      7.388 μs (0.00% GC)
#  mean time:        7.525 μs (0.00% GC)
#  maximum time:     46.887 μs (0.00% GC)
#  --------------
#  samples:          10000
#  evals/sample:     4

# When the volume of calls is large (our case), those 320 bytes matter a lot.
@time for i in 1:1e6 fortran_density_gradient(v, v, wfn, 0) end
#   7.380677 seconds (20.00 M allocations: 305.176 MiB, 0.13% gc time) 
# Becomes very slow and memory hungry

In a normal execution of the application, this routine has to be called millions of times. Therefore, it has to be very efficient. Could allocations be avoided in this case?

Version info
Julia Version 1.5.0
Commit 96786e22cc (2020-08-01 23:44 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin18.7.0)
  CPU: Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, skylake)
Environment:
  JULIA_NUM_THREADS = 1
  JULIA_EDITOR = view

(This doesn’t look correct. It should just be Ptr{Clong} if you are passing an array of Clong to Fortran.)

2 Likes

Yes, that is the case. That makes sense. Thank you!