I’m going to look into increment!
.
I’m getting effectively no penalty using CartesianIndex for x[I] += ϵ[I]
. However, updating the residual requires doing a matrix multiplication, and this is where CartesianIndex seems to cost us.
Edit: sum(f for i in 1:N)
seemed to be causing trouble again (see Macro to sum over short loop - #9 by ffevotte). CartesianIndex is just as fast if I manually unroll the loop. Of course, this is no longer dimensionally independent…
Edit 2: @inbounds!
Summary
@inline δ(a,N::Int) = CartesianIndex(ntuple(i -> i==a ? 1 : 0, N))
@fastmath function incrementCI!(ϵ::Array{T,N},r::Array{T,N},L::Array,D::Array{T,N}) where {T,N}
R = CartesianIndex(size(r))
@inbounds @simd for I ∈ 2oneunit(R):R-oneunit(R)
σ = (
ϵ[I]*D[I]
+sum(@inbounds(ϵ[I-δ(i,N)]*L[I,i]) for i=1:N)
+sum(@inbounds(ϵ[I+δ(i,N)]*L[I+δ(i,N),i]) for i=1:N))
r[I] -= σ
end
end
@fastmath function increment3D!(ϵ::Array,r::Array,L::Array,D::Array)
(ni,nj,nk) = size(r)
@inbounds @simd for k = 2:nk-1
for j = 2:nj-1
for i = 2:ni-1
σ = (
D[i,j,k]*ϵ[i,j,k]
+L[i,j,k,1]*ϵ[i-1,j,k]
+L[i,j,k,2]*ϵ[i,j-1,k]
+L[i,j,k,3]*ϵ[i,j,k-1]
+L[i+1,j,k,1]*ϵ[i+1,j,k]
+L[i,j+1,k,2]*ϵ[i,j+1,k]
+L[i,j,k+1,3]*ϵ[i,j,k+1])
r[i,j,k] -= σ
end
end
end
end
function test3D(n)
L = ones(2^n+2,2^n+2,2^n+2,3)
D = -6*ones(2^n+2,2^n+2,2^n+2)
r = zeros(2^n+2,2^n+2,2^n+2)
ϵ = rand(2^n+2,2^n+2,2^n+2)
return ϵ,r,L,D
end
ϵ,r,L,D = test3D(7)
julia> @btime incrementCI!($ϵ,$r,$L,$D)
4.388 ms (0 allocations: 0 bytes)
julia> @btime increment3D!($ϵ,$r,$L,$D)
4.382 ms (0 allocations: 0 bytes)
Edit: So this code is just as fast as the manually indexed version, and it’s the most important.