I am trying hard to improve the performances of this function that just sum a serie of multiplications, but where all the data is given in terms of indices:
This is my data:
using BenchmarkTools, LoopVectorization
x = [1.0 2.5; 3 4]
y = zeros(2,3)
w = [-0.55248;-0.229514;;0.292818;-0.792014;;; -0.107403;-0.0289423;;0.457216;0.735226;;;0.0260589;-0.008588;;0.348731;0.335272;;;]
y_to_x_ids = [ [[(1, 1), (1, 2)]] [[(1, 1), (1, 2)]] [[(1, 1), (1, 2)]]; [[(1, 1), (2, 1), (1, 2), (2, 2)]] [[(1, 1), (2, 1), (1, 2), (2, 2)]] [[(1, 1), (2, 1), (1, 2), (2, 2)]] ]
y_to_w_ids = [ [[(2, 1, 1), (2, 2, 1)]] [[(2, 1, 2), (2, 2, 2)]] [[(2, 1, 3), (2, 2, 3)]]; [[(1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1)]] [[(1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2)]] [[(1, 1, 3), (2, 1, 3), (1, 2, 3), (2, 2, 3)]]]
And these are the functions I have already attempted. Note I get an error if I try to apply @turbo
or @simd
on the first 3 functions…
function _zComp1!(y,y_to_x_ids,y_to_w_ids,w,x)
for y_idx in CartesianIndices(y)
y_idx_tuple = Tuple(y_idx)
yi = 0.0
n = length(y_to_x_ids[y_idx_tuple...])
@inbounds for idx in 1:n
yi += x[y_to_x_ids[y_idx_tuple...][idx]...] * w[y_to_w_ids[y_idx_tuple...][idx]...]
end
y[y_idx_tuple...] = yi
end
end
function _zComp2!(y,y_to_x_ids,y_to_w_ids,w,x)
for y_idx in CartesianIndices(y)
y_idx_tuple = Tuple(y_idx)
yi = 0.0
@inbounds for (x_id,w_id) in zip(y_to_x_ids[y_idx_tuple...],y_to_w_ids[y_idx_tuple...])
yi += x[x_id...] * w[w_id...]
end
y[y_idx_tuple...] = yi
end
end
function _zComp3!(y,y_to_x_ids,y_to_w_ids,w,x)
for y_idx in CartesianIndices(y)
y_idx_tuple = Tuple(y_idx)
y[y_idx_tuple...] = @inbounds sum(x[x_id...] * w[w_id...] for (x_id,w_id) in zip(y_to_x_ids[y_idx_tuple...],y_to_w_ids[y_idx_tuple...]))
end
end
function _zComp4!(y,y_to_x_ids,y_to_w_ids,w,x)
for y_idx in CartesianIndices(y)
y_idx_tuple = Tuple(y_idx)
xv = [x[x_id...] for x_id in y_to_x_ids[y_idx_tuple...]]
wv = [w[w_id...] for w_id in y_to_w_ids[y_idx_tuple...]]
N = length(xv)
yi = 0.0
@turbo for n in 1:N
yi += xv[n] * wv[n]
end
y[y_idx_tuple...] = yi
end
end
# From @abraemer comment:
function _zComp5!(y,y_to_x_ids,y_to_w_ids,w,x)
for y_idx in eachindex(y)
yi = 0.0
@inbounds for (x_id,w_id) in zip(y_to_x_ids[y_idx], y_to_w_ids[y_idx])
yi += x[x_id...] * w[w_id...]
end
y[y_idx] = yi # this was not guaranteed to be inbounds before
end
end
And here their benchmarks… is it possible to do better ?
_zComp1!(y,y_to_x_ids,y_to_w_ids,w,x) # [-2.20955 1.80912 0.829592; -3.67703 3.88971 2.21321]
@btime _zComp1!($y,$y_to_x_ids,$y_to_w_ids,$w,$x) # 74.180 ns ( 0 allocations: 0 bytes)
@btime _zComp2!($y,$y_to_x_ids,$y_to_w_ids,$w,$x) # 73.848 ns ( 0 allocations: 0 bytes)
@btime _zComp3!($y,$y_to_x_ids,$y_to_w_ids,$w,$x) # 113.619 ns ( 0 allocations: 0 bytes)
@btime _zComp4!($y,$y_to_x_ids,$y_to_w_ids,$w,$x) # 504.446 ns (12 allocations: 1.03 KiB)
@btime _zComp5!($y,$y_to_x_ids,$y_to_w_ids,$w,$x) # 60.256 ns ( 0 allocations: 0 bytes)