Hi,
I try to implement efficiently the sum of 2D arrays of size (4,N).
I fail to obtain good performance with different implementations:

msum_native
corresponds to the standard Julia `sum`
method, 
msum_2D1D
corresponds to a single loop after 2D>1D transformation, 
msum_fast4_sa
corresponds to an attempt to force SIMD instructions via SVector of size 4, 
msum_fast4
corresponds to a manual unroll of the inner loop, 
msum_basic
corresponds to a straightforward implementation with 2 nested loops.
I think that I should be able to reach around 16 GFlops for this operation on my machine (4 GHz and 4 additions per cycle for Float64 and AVX units). Best performance does not exceed 4 GFlops…
The attached code produces the following curves:
Having a look at the result of @code_native, it seems (I am not very good at reading assembly code) that
LLVM failed to produce efficient SIMD code and performs a horizontal reduction (vhaddpd) for each SIMD pack (which is not good).
Any clues ?
P.S. If you want to reproduce the curves you have to replace the line
nset=[2^(i+9) for i=1:2]
by
nset=[2^i for i=1:21]
in the following code (but it takes around 7 minutes)
"""
    TBSM{T,N}

Wrapper around an N×s matrix of element type `T` (stored column-major).
NOTE(review): the field type does not enforce that the first dimension equals
`N`; the constructor below is what establishes that invariant — confirm no
other construction path exists.
"""
struct TBSM{T,N}
    data::Matrix{T}   # Matrix{T} === Array{T,2}
end

# Build a TBSM holding an N×s matrix of uniform random values.
function TBSM{T,N}(s::Int) where T where N
    return TBSM{T,N}(rand(T, N, s))
end
# Straightforward reference: two nested loops over the N×ncols matrix,
# accumulating every element into a scalar of the element type.
function msum_basic(a::TBSM{T,N}) where T where N
    acc = zero(T)
    ncols = size(a.data, 2)
    @simd for col = 1:ncols
        @simd for row = 1:N
            @inbounds acc += a.data[row, col]
        end
    end
    acc
end
# Manually unrolled variant: each iteration adds the four row entries of one
# column, so the compiler sees a single flat loop over columns.
# NOTE(review): assumes N == 4 — the row indices are hard-coded.
function msum_fast4(a::TBSM{T,N}) where T where N
    total = zero(T)
    ncols = size(a.data, 2)
    @simd for col = 1:ncols
        @inbounds total += a.data[1, col] + a.data[2, col] + a.data[3, col] + a.data[4, col]
    end
    total
end
using StaticArrays
# SIMD attempt via a 4-lane SVector accumulator: one vertical SIMD add per
# column, with a single horizontal reduction at the end.
# Fixes: the accumulator was hard-coded as SVector(0.0, 0.0, 0.0, 0.0), which
# forced Float64 lanes and broke type stability for other element types
# (e.g. Float32, which the benchmark's rset is set up to enable); it is now
# zero(SVector{4,T}). The dead local `s = zero(T)` (immediately overwritten)
# was removed.
# NOTE(review): still assumes N == 4, like msum_fast4 — the row indices are
# hard-coded.
function msum_fast4_sa(a::TBSM{T,N}) where T where N
    sa = zero(SVector{4,T})
    ny = size(a.data, 2)
    @simd for j = 1:ny
        @inbounds sa += SVector(a.data[1,j], a.data[2,j], a.data[3,j], a.data[4,j])
    end
    # horizontal reduction of the four accumulator lanes
    return sum(sa)
end
# Sum through a flat 1-D alias of the matrix: reshape shares the underlying
# storage (no copy), so this is a single linear pass over all elements.
function msum_2D1D(a::TBSM{T,N}) where T where N
    rows, cols = size(a.data)
    flat = reshape(a.data, rows * cols)
    total = zero(T)
    @simd for x in flat
        @inbounds total += x
    end
    total
end
# Baseline: delegate to Base's optimized (pairwise) sum over the raw matrix.
msum_native(a::TBSM{T,N}) where T where N = sum(a.data)
#******************** Benchmark machinery ******************
# Root of the benchmark-action hierarchy; T is the benchmarked element type.
abstract type BTLAction{T} end

# Action describing "sum a TBSM{T,N}" for the benchmark machinery below.
# Fix: this was declared with the pre-1.0 `type` keyword (removed in Julia 1.0)
# while the rest of the file uses `struct`; an empty immutable struct is the
# consistent and forward-compatible form.
struct SumTBSMAction{T,N} <: BTLAction{T} end
# Produce the input tuple for one benchmark run: a single random TBSM of
# element type T, N rows, and n columns.
function initialize(action::SumTBSMAction{T,N}, n::Int) where T where N
    (TBSM{T,N}(n),)
end
# Validate `my_function` against the native-sum reference on the same input.
# Fix: the tolerance was `atol=1.e6` (one MILLION), which accepted virtually
# any result and made the check vacuous; `1.e-6` was clearly intended. The
# default relative tolerance of `isapprox` still applies alongside the
# absolute one, so sums over large arrays remain comparable.
function checkresult(action::SumTBSMAction{T,N}, my_function, inputs::Tuple) where T where N
    (a,) = inputs
    sref = msum_native(a)
    s = my_function(a)
    # println("diff=", abs(s - sref))
    return isapprox(s, sref, atol=1.e-6)
end
# Flop count attributed to a size-n run, as a BigInt to avoid overflow in the
# GFLOPS computation.
# NOTE(review): this counts n, not N*n — presumably one flop per column is the
# intended metric; confirm against how GFLOPS is interpreted in the plots.
nflops(action::SumTBSMAction{T,N}, n::Int) where T where N = BigInt(n)
# Benchmarking and plotting dependencies for the measurement harness below.
using BenchmarkTools
using Plots
#plotly()
pyplot()  # select the PyPlot backend (the plotly alternative is kept above, commented out)
"""
    Measurement

One benchmark series: GFLOPS measured per problem size for a given function
and element type, together with identifying metadata and the function's
source text.
"""
struct Measurement
    action_name::String    # e.g. "SumTBSMAction{Float64,4}"
    function_name::String  # name of the benchmarked function
    float_name::String     # element type, as a string
    gflops::Vector{Float64}
    sizes::Vector{Int}
    source::String         # source code of the benchmarked function
end

# Peak throughput a measurement reached across all tested sizes.
function maxgflops(m::Measurement)
    maximum(m.gflops)
end
using Sugar
function benchmark_action(action::BTLAction{T}, ss::Vector{Int}, my_function) where T
gflops=[]
sizes=[]
# println(ss)
for s in ss
inputs=initialize(action,s)
correct=checkresult(action,my_function,inputs)
assert(correct)
inputs=initialize(action,s)
bt=@belapsed $my_function($inputs...)
gf=Float64(nflops(action,s)/(1.e9*bt))
println("GFLOPS=",gf,"\t",string(my_function)," ",string(T)," size=",s)
push!(gflops,gf)
push!(sizes,s)
end
inputs=initialize(action,sizes[1])
#Build a tuple of arg types from a tuple of args
mytypes=[]
for e in inputs
push!(mytypes,typeof(e))
end
typetuples=(mytypes...)
#pass this tuple of type to Sugar
m=Sugar.get_method(my_function,typetuples)
code,str=Sugar.get_source(m) #str contains the source code of my_function
return Measurement(string(typeof(action)),string(my_function),string(T),gflops,sizes,str)
end
# Draw one log-x performance curve per measurement on a shared axis,
# best-performing series first so the legend is ordered by peak GFLOPS.
function plotit(measurements::Array{Measurement,1})
    ymax = 1.1 * maximum([maxgflops(m) for m in measurements])
    ordered = sort(measurements, by=maxgflops, rev=true)
    p1 = Plots.plot(ylims=(0, ymax), ylabel="GFLOPS", xlabel="arraySize", title="Performance")
    for m in ordered
        plot!(p1, m.sizes, linewidth=4, marker="o", markersize=5, markercolor="white",
              m.gflops, xscale = :log10,
              label=string(string(m.function_name), " ", string(m.float_name)))
    end
    p1
end
#******************** data for the bench ******************
using BenchmarkTools
# Functions under test and the parameter grid for the sweep.
fset = [msum_native, msum_basic, msum_2D1D, msum_fast4, msum_fast4_sa]
#nset=[2^i for i=1:21]   # full sweep used for the posted curves (~7 minutes)
nset = [2^(i+9) for i = 1:2]
rset = [Float64]
#rset=[Float64,Float32]
# Fix: was `Vector{Measurement}(0)` — a deprecated constructor form; the
# typed empty-vector literal is the supported equivalent.
m = Measurement[]
for f in fset
    for r in rset
        ms = benchmark_action(SumTBSMAction{r,4}(), nset, f)
        # println(ms.gflops)
        push!(m, ms)
    end
end
p1 = plotit(m)
show(p1)
savefig(p1, "allsum.pdf")
# Inspect the generated assembly for the two main candidates.
a = TBSM{Float64,4}(1000)
@code_native(msum_2D1D(a))
# Fix: the original final line had a stray forum-paste artifact
# ("type or paste code here") fused onto it, which is a syntax error.
@code_native(msum_fast4_sa(a))