Immutable struct with an Array slower than a constant global Array

I am able to match your moments2 function while avoiding globals by using SizedArray from the StaticArrays package, so that the array dimensions are known at compile time:

using Einsum
using BenchmarkTools
using StaticArrays

const H  = 20
const W  = 20
const f  = zeros(H, W, 9)
const rho = ones(H, W)

function moments2()
    let n, m, i
        @inbounds for m=1:W, n=1:H
                local a = 0.0
                for i=1:9 a += f[n, m, i] end
                rho[n, m] = a
        end
    end
    return nothing
end

# This function doesn't use any globals
function moments1!(rho, f)
    @inbounds for m ∈ axes(f, 2), n ∈ axes(f, 1)
        a = zero(eltype(f))
        for i ∈ axes(f, 3)
            a += f[n, m, i]
        end
        rho[n, m] = a
    end
    return nothing
end

@btime moments2()  #  276.107 ns (0 allocations: 0 bytes)
@btime moments1!($rho, $f)  # 1.971 μs (0 allocations: 0 bytes)

# Now try passing SizedArrays to the same function
rho_s = SizedArray{Tuple{H, W}}(rho)
f_s = SizedArray{Tuple{H, W, 9}}(f)

@btime moments1!($rho_s, $f_s)  #  277.280 ns (0 allocations: 0 bytes)
2 Likes