… I still observe serious slowdown with CartesianIndexes in the 3D case:
function res_loop!(rlc::Array{Float64,3},rlf::Array{Float64,3})
(nx,ny,nz) = size(rlc)
@inbounds @simd for k=2:nz-1
fk=2k-2
for j=2:ny-1
fj=2j-2
for i=2:nx-1
fi=2i-2
rlc[i,j,k]=
rlf[fi ,fj ,fk ]+rlf[fi+1,fj ,fk ]+
rlf[fi ,fj+1,fk ]+rlf[fi+1,fj+1,fk ]+
rlf[fi ,fj ,fk+1]+rlf[fi+1,fj ,fk+1]+
rlf[fi ,fj+1,fk+1]+rlf[fi+1,fj+1,fk+1]
end
end
end
end
The following MWE returns:
Summary
using BenchmarkTools
using LinearAlgebra
using Random
using LoopVectorization
using Printf
@inline CI(a...) = CartesianIndex(a...)
@inline δ(a,d::Int) = CI(ntuple(i -> i==a ? 1 : 0, d))
@inline δ(a,I::CartesianIndex{N}) where {N} = δ(a,N)
@inline CR(a...) = CartesianIndices(a...)
@inline inside(M::NTuple{N,Int}) where {N} = CR(ntuple(i-> 2:M[i]-1,N))
@inline inside(a::Array; reverse::Bool=false) =
reverse ? Iterators.reverse(inside(size(a))) : inside(size(a))
@inline inside_u(N::NTuple{n,T}) where {n,T} = CR(ntuple(i->2:N[i],n-1))
@inline near(I::CartesianIndex,a=0) = (2I-2oneunit(I)):(2I-oneunit(I)-δ(a,I))
function res_loop!(rlc::Array{Float64,2},rlf::Array{Float64,2})
(nrows,ncols) = size(rlc)
@inbounds @simd for j=2:ncols-1
fj=2j-2
for i=2:nrows-1
fi=2i-2
rlc[i,j]=(rlf[fi,fj]+rlf[fi+1,fj]+rlf[fi,fj+1]+rlf[fi+1,fj+1])
end
end
end
function res_loop!(rlc::Array{Float64,3},rlf::Array{Float64,3})
(nx,ny,nz) = size(rlc)
@inbounds @simd for k=2:nz-1
fk=2k-2
for j=2:ny-1
fj=2j-2
for i=2:nx-1
fi=2i-2
rlc[i,j,k]=
rlf[fi ,fj ,fk ]+rlf[fi+1,fj ,fk ]+
rlf[fi ,fj+1,fk ]+rlf[fi+1,fj+1,fk ]+
rlf[fi ,fj ,fk+1]+rlf[fi+1,fj ,fk+1]+
rlf[fi ,fj+1,fk+1]+rlf[fi+1,fj+1,fk+1]
end
end
end
end
function res_ci!(rlc,rlf)
# @inside rlc[I] = sum(@inbounds(rlf[J]) for J ∈ near(I))
@fastmath @inbounds @simd for I ∈ inside(rlc)
rlc[I]=sum(@inbounds(rlf[J]) for J ∈ near(I))
end
end
#function that call the benchmarks
function benchmark_res(rlf_size)
# initialize two 2D arrays (fine and coarse)
rlc_size = rlf_size .>>> 1
Random.seed!(1234);
rlf=rand(Float64,(rlf_size))
rlc=rand(Float64,(rlc_size))
# the following blocks checks that res and res_ci produce same results
rlc_ci=deepcopy(rlc)
res_loop!(rlc,rlf)
res_ci!(rlc_ci,rlf)
# @show @code_native debuginfo=:none res!(rlc,rlf)
# @show @code_native debuginfo=:none res_ci!(rlc,rlf)
@assert norm(rlc-rlc_ci)≈0.0
# #The actual timings
tloop=@belapsed res_loop!($rlc,$rlf)
tci=@belapsed res_ci!($rlc,$rlf)
@printf("tloop=%.3e, tci=%.3e, slowdown=%.3f ,size=%s \n",tloop,tci,tci/tloop,size(rlf))
# @show nf,ss(tloop),ss(tci),ss(tloop/tci)
nothing
end
for n in [10,34,66,258]
benchmark_res((n,n))
benchmark_res((n,n,n))
end
returns
tloop=2.230e-08, tci=5.239e-08, slowdown=2.350 ,size=(10, 10)
tloop=9.610e-08, tci=2.301e-07, slowdown=2.395 ,size=(10, 10, 10)
tloop=1.687e-07, tci=1.039e-06, slowdown=6.161 ,size=(34, 34)
tloop=5.703e-06, tci=2.601e-05, slowdown=4.561 ,size=(34, 34, 34)
tloop=6.217e-07, tci=4.423e-06, slowdown=7.115 ,size=(66, 66)
tloop=4.554e-05, tci=2.232e-04, slowdown=4.901 ,size=(66, 66, 66)
tloop=1.174e-05, tci=7.279e-05, slowdown=6.202 ,size=(258, 258)
tloop=7.787e-03, tci=1.898e-02, slowdown=2.438 ,size=(258, 258, 258)