Good question! I tried this with TransducersNext.jl and it worked pretty well:
using Base.Cartesian: @nexprs
using TransducersNext
using TransducersNext: Executor, complete, next, combine, @unroll

struct BlockedCommutativeEx <: Executor end

function TransducersNext.__fold__(rf::RF, init::T, A, ::BlockedCommutativeEx) where {RF, T}
    op(x, y) = next(rf, x, y)
    inds = eachindex(A)
    i1, iN = firstindex(inds), lastindex(inds)
    n = length(inds)
    v = init
    # Statically peel off the first iteration for type-stability reasons,
    # in case we don't have a numeric `init`.
    @unroll 1 for batch in 0:(n>>4)-1
        i = i1 + batch*16
        # Load a block of 16 elements...
        @nexprs 16 N -> a_N = @inbounds A[inds[i+(N-1)]]
        # ...and reduce it as two independent 8-element chains, which are then
        # combined into the running accumulator.
        @nexprs 2 N -> v_N = op(
            op(op(op(op(op(op(op(init,
                a_{(N-1)*8+1}),
                a_{(N-1)*8+2}),
                a_{(N-1)*8+3}),
                a_{(N-1)*8+4}),
                a_{(N-1)*8+5}),
                a_{(N-1)*8+6}),
                a_{(N-1)*8+7}),
            a_{(N-1)*8+8})
        v = combine(rf, v, combine(rf, v_1, v_2))
    end
    # Handle the remaining tail elements (fewer than 16) with a plain serial loop.
    i = i1 + (n>>4)*16 - 1
    i == iN && return complete(rf, v)
    for i in i+1:iN
        ai = @inbounds A[inds[i]]
        v = op(v, ai)
    end
    return complete(rf, v)
end
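For reference, A and B are defined earlier in the thread. If you want to try this yourself, a setup along these lines roughly matches the shape of the results below, though the thread's actual definitions may differ:

using BenchmarkTools   # for @btime
# Hypothetical test data, just to make the snippet self-contained:
A = rand(10_000)       # a plain dense Vector{Float64}
B = @view A[1:2:end]   # some non-contiguous view; the real B may be something else entirely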
julia> @btime fold(+, $A; executor=BlockedCommutativeEx())
1.146 μs (0 allocations: 0 bytes)
5003.618592521285
julia> @btime fold(+, $B; executor=BlockedCommutativeEx())
767.919 ns (0 allocations: 0 bytes)
2509.5895307786136
Basically the same timings as with @mbauman’s function for A, but surprisingly it does better than Matt’s version for B:
julia> @btime reduce_mb_4(+, $A)
1.144 μs (0 allocations: 0 bytes)
5003.618592521285
julia> @btime reduce_mb_4(+, $B)
1.110 μs (0 allocations: 0 bytes)
2509.5895307786136
What’s especially cool about doing this with TransducersNext is that you can do stuff like
executor = ChunkedEx(BlockedCommutativeEx(); chunksize=1024)
or whatever to do further outer-chunking of the algorithm in a composable way, with minimal performance loss:
julia> @btime fold(+, $B; executor=ChunkedEx(BlockedCommutativeEx(); chunksize=1024), init=0)
869.175 ns (0 allocations: 0 bytes)
2507.819648819429
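The composition itself isn't magic: conceptually, the outer executor just splits the index range into chunks, runs the inner kernel on each chunk, and combines the partial results. This is not TransducersNext's actual ChunkedEx implementation, but a plain-Julia sketch of the idea looks roughly like this:

# Conceptual sketch only, not the real ChunkedEx:
# split the index range into chunks, fold each chunk with an inner kernel,
# then combine the per-chunk partial results with the same operation.
function chunked_fold(op, A; chunksize=1024, init=zero(eltype(A)),
                      inner=(op, x, init) -> foldl(op, x; init))
    acc = init
    for start in firstindex(A):chunksize:lastindex(A)
        stop = min(start + chunksize - 1, lastindex(A))
        acc = op(acc, inner(op, view(A, start:stop), init))
    end
    return acc
end

x = rand(10_000)
chunked_fold(+, x) ≈ sum(x)   # true; swap `inner` for a fancier per-chunk kernel

The nice property is that the inner kernel never needs to know it's being chunked, which is exactly what dispatching __fold__ on the Executor type buys you above.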