Ok, so the culpit here was thesum function. I achieve the same performance of 250 μs with this one, which uses a normal for loop
function energy_arbitrary(Λ::AbstractArray{T,N}) where {T,N}
E = 0
@simd for r ∈ CartesianIndices(Λ)
E += @inbounds Λ[r] * (+)((@inbounds Λ[ntuple(i -> i == d ? bc(r[i], size(Λ, i)) : r[i], N)...] for d ∈ 1:N)...)
end
return -E
end