I am able to match your moments2 function while avoiding globals by using SizedArray from the StaticArrays package, so that the array dimensions are known at compile time:
using Einsum
using BenchmarkTools
using StaticArrays
const H = 20
const W = 20
const f = zeros(H, W, 9)
const rho = ones(H, W)
function moments2()
let n, m, i
@inbounds for m=1:W, n=1:H
local a = 0.0
for i=1:9 a += f[n, m, i] end
rho[n, m] = a
end
end
return nothing
end
# This function doesn't use any globals
function moments1!(rho, f)
@inbounds for m ∈ axes(f, 2), n ∈ axes(f, 1)
a = zero(eltype(f))
for i ∈ axes(f, 3)
a += f[n, m, i]
end
rho[n, m] = a
end
return nothing
end
@btime moments2() # 276.107 ns (0 allocations: 0 bytes)
@btime moments1!($rho, $f) # 1.971 μs (0 allocations: 0 bytes)
# Now try passing SizedArrays to the same function
rho_s = SizedArray{Tuple{H, W}}(rho)
f_s = SizedArray{Tuple{H, W, 9}}(f)
@btime moments1!($rho_s, $f_s) # 277.280 ns (0 allocations: 0 bytes)