Speed up multidimensional code without Base.Cartesian

Ok, so the culpit here was thesum function. I achieve the same performance of 250 μs with this one, which uses a normal for loop

function energy_arbitrary(Λ::AbstractArray{T,N}) where {T,N}
	E = 0
	@simd for r ∈ CartesianIndices(Λ) 
			E += @inbounds Λ[r] * (+)((@inbounds Λ[ntuple(i -> i == d ? bc(r[i], size(Λ, i)) : r[i], N)...] for d ∈ 1:N)...)
	end
	return -E
end