Hello guys!
I’ve hit a bit of a roadblock and need a bit of help to overcome it. I’ve made a parallelized reduction function in which I am using Polyester.@batch
. It works very well for 1 dimensional arrays, but now I am expanding it to work for StaticArrays. I have the following code:
using Base.Threads
using JET
using Bumper
using StrideArrays # Not necessary, but can make operations like broadcasting with Bumper.jl faster.
using Polyester
using BenchmarkTools
using LoopVectorization
using ChunkSplitters
using StaticArrays
using StructArrays
struct DimensionalData{D, T <: AbstractFloat}
vectors::Tuple{Vararg{Vector{T}, D}}
V::StructArray{SVector{D, T}, 1, Tuple{Vararg{Vector{T}, D}}}
# General constructor for vectors
function DimensionalData(vectors::Vector{T}...) where {T}
D = length(vectors)
V = StructArray{SVector{D, T}}(vectors)
new{D, T}(Tuple(vectors), V)
end
# Constructor for initializing with all zeros, adapting to dimension D
function DimensionalData{D, T}(len::Int) where {D, T}
vectors = ntuple(d -> zeros(T, len), D) # Create D vectors of zeros
V = StructArray{SVector{D, T}}(vectors)
new{D, T}(vectors, V)
end
end
function ReductionFunctionChunk!(dρdtI, I, J, drhopLp, drhopLn)
XT = eltype(dρdtI); XL = length(dρdtI); X0 = zero(XT)
nchunks = nthreads() # Assuming nchunks is defined somewhere as nthreads()
@inbounds @no_escape begin
local_X = @alloc(XT, XL, nchunks)
fill!(local_X,X0)
# Directly iterate over the chunks
@batch for ichunk in 1:nchunks
chunk_inds = getchunk(I, ichunk; n=nchunks)
for idx in chunk_inds
i = I[idx]
j = J[idx]
# Accumulate the contributions into the correct place
local_X[i, ichunk] += drhopLp[idx]
local_X[j, ichunk] += drhopLn[idx]
end
end
# Reduction step
# using @tturbo is slightly faster than batch (28 mus), but @batch (30 mus) works for svector, so we prefer this.
@batch for ix in 1:XL
for chunk in 1:nchunks
dρdtI[ix] += local_X[ix, chunk]
end
end
end
return nothing
end
function ReductionFunctionChunkNoBatch!(dρdtI, I, J, drhopLp, drhopLn)
XT = eltype(dρdtI); XL = length(dρdtI); X0 = zero(XT)
nchunks = nthreads() # Assuming nchunks is defined somewhere as nthreads()
@inbounds @no_escape begin
local_X = @alloc(XT, XL, nchunks)
fill!(local_X,X0)
# Directly iterate over the chunks
for ichunk in 1:nchunks
chunk_inds = getchunk(I, ichunk; n=nchunks)
for idx in chunk_inds
i = I[idx]
j = J[idx]
# Accumulate the contributions into the correct place
local_X[i, ichunk] += drhopLp[idx]
local_X[j, ichunk] += drhopLn[idx]
end
end
# Reduction step
# using @tturbo is slightly faster than batch (28 mus), but @batch (30 mus) works for svector, so we prefer this.
for ix in 1:XL
for chunk in 1:nchunks
dρdtI[ix] += local_X[ix, chunk]
end
end
end
return nothing
end
begin
ProblemScaleFactor = 1
NumberOfPoints = 6195*ProblemScaleFactor
NumberOfInterations = 50000*ProblemScaleFactor
I = rand(1:NumberOfPoints, NumberOfInterations)
J = I; #rand(1:NumberOfPoints, NumberOfInterations)
V = zeros(SVector{2,Float64},NumberOfPoints)
VL = rand(eltype(V),NumberOfInterations)
VD = DimensionalData{2, Float64}(NumberOfPoints)
VDL = DimensionalData{2, Float64}(NumberOfInterations)
VDL.V .= VL
V .*= 0
ReductionFunctionChunk!(V,I,J,VL,VL)
println("Value when doing chunk svector reduction: ", sum(V))
println(@report_opt ReductionFunctionChunk!(V,I,J,VL,VL))
println(@report_call ReductionFunctionChunk!(V,I,J,VL,VL))
#println(@code_warntype ReductionFunctionChunk!(V,I,J,VL,VL))
VD.V .*= 0
ReductionFunctionChunk!(VD.V,I,J,VDL.V,VDL.V)
println("Value when doing chunk svector reduction with dimensional data: ", sum(VD.V))
println(@report_opt ReductionFunctionChunk!(VD.V,I,J,VDL.V,VDL.V))
println(@report_call ReductionFunctionChunk!(VD.V,I,J,VDL.V,VDL.V))
#println(@code_warntype ReductionFunctionChunk!(VD.V,I,J,VDL.V,VDL.V))
VD.V .*= 0
ReductionFunctionChunkNoBatch!(VD.V,I,J,VDL.V,VDL.V)
println("Value when doing chunk svector reduction with dimensional data and no @batch: ", sum(VD.V))
println(@report_opt ReductionFunctionChunkNoBatch!(VD.V,I,J,VDL.V,VDL.V))
println(@report_call ReductionFunctionChunkNoBatch!(VD.V,I,J,VDL.V,VDL.V))
#println(@code_warntype ReductionFunctionChunkNoBatch!(VD.V,I,J,VDL.V,VDL.V))
println("Chunk function:")
display(@benchmark ReductionFunctionChunk!($V,$I,$J,$VL,$VL))
println("Chunk function with dimensional data:")
display(@benchmark ReductionFunctionChunk!($(VD.V),$I,$J,$(VDL.V),$(VDL.V)))
println("Chunk function with dimensional data and no @batch:")
display(@benchmark ReductionFunctionChunkNoBatch!($(VD.V),$I,$J,$(VDL.V),$(VDL.V)))
end
It outputs the results as follows:
So all calculations are correct. In the benchmarking:
*Chunk function:*
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 37.900 μs … 437.500 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 55.700 μs ┊ GC (median): 0.00%
Time (mean ± σ): 55.356 μs ± 9.583 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃ ▂ █
▁▃▂▁▁▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂█▄▅█▅▆▄▂▄█▆▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁ ▂
37.9 μs Histogram: frequency by time 67.5 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
*Chunk function with dimensional data:*
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 48.000 μs … 388.700 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 78.100 μs ┊ GC (median): 0.00%
Time (mean ± σ): 72.682 μs ± 14.848 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▂▂ █ ▂▆█
██▂▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▆█████▆▄▃▃▂▂▂▂▂▂▂▂▂▂▂ ▃
48 μs Histogram: frequency by time 90.9 μs <
Memory estimate: 176 bytes, allocs estimate: 2.
*Chunk function with dimensional data and no @batch:*
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 157.500 μs … 562.500 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 163.700 μs ┊ GC (median): 0.00%
Time (mean ± σ): 178.801 μs ± 43.349 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▅█▄▅▄▅▄▃▂▁ ▁
████████████▇▇▇▇▇▆▆▆▆▆▃▄▆▅▅▇▇▇▇▆▆▆▅▅▄▄▄▄▅▄▃▄▄▄▃▄▃▃▃▄▃▄▄▂▄▆▇██ █
158 μs Histogram: log(frequency) by time 402 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
The first benchmark shows what happens when I use Vector{SVector{2,Float64}}
directly. The second benchmark, allocates, and shows when I use a struct, DimensionalData
, which holds a struct array with the field, V
and type Vector{SVector{2,Float64}}
. The third benchmark shows when I remove @batch
that no allocations are happening.
Comparing the @code_warntype
for the first and second benchmark:
Everything is the same other than what is shown above.
Could anyone kindly show me how to fix this, so that I can use DimensionalData
without allocating?
Kind regards