Need perf help on paralel var

Hello, I’m trying to add Statistics to AcceleratedKernels.jl in Add Statistics in KA, (only mean and var implemented) by yolhan83 · Pull Request #64 · JuliaGPU/AcceleratedKernels.jl · GitHub, GPU results seems nice but big issue on CPU var implementation not looking good if someone has an idea I’m in.
Also giving the implementation for now

@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,@Const(m))
    N = @groupsize()[1]
    iblock = @index(Group, Linear)
    ithread = @index(Local, Linear)
    i = ithread + (iblock - 0x1) * N
    if i <= length(src)
        src[i] -= m
    end
end


@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims))
    src_strides = strides(src)
    m_strides   = strides(m)
    nd          = length(src_strides)
    N       = @groupsize()[1]
    iblock  = @index(Group, Linear) - 0x1
    ithread = @index(Local, Linear) - 0x1
    tid     = ithread + iblock * N

    if tid < length(src)
        tmp    = tid
        midx0  = typeof(tid)(0)  
        KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16
            idxi = tmp ÷ src_strides[i]   
            if i != dims
                midx0 += idxi * m_strides[i]
            end
            tmp = tmp % src_strides[i]
        end
        src[tid + 0x1] -= m[midx0 + 0x1]
    end
end
function _slicing_meannd_cpu!(src::AbstractArray{T,N}, m, dims::Int;max_tasks,min_elems) where {T<:Real, N}
    src_strides ::NTuple{N,Int} = strides(src)
    m_strides  ::NTuple{N,Int} = strides(m)
    nd          = length(src_strides)
    foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc
        tmp   = isrc - 1
        midx0  = 0
        KernelAbstractions.Extras.@unroll for i in nd:-1:1
            @inbounds idxi = tmp ÷ src_strides[i]
            if i != dims
                @inbounds midx0 += idxi * m_strides[i]
            end
            @inbounds tmp = tmp % src_strides[i]
        end
        @inbounds src[isrc] -= m[midx0 + 1]
    end
end

function var!(
    src::AbstractArray{T,N},backend::Backend=get_backend(src);
    dims::Union{Nothing, Int}=nothing,
    corrected ::Bool = true,
    # CPU settings - ignored here
    max_tasks::Int = Threads.nthreads(),
    min_elems::Int = 1,
    prefer_threads::Bool=true,

    # GPU settings
    block_size::Int = 256,
    temp::Union{Nothing, AbstractArray} = nothing,
    switch_below::Int=0,
)  where {T<:Real,N}  
    m = mean(
        src,backend;
        dims=dims,
        # CPU settings - ignored here
        max_tasks = max_tasks,
        min_elems = min_elems,
        prefer_threads = prefer_threads,
        # GPU settings
        block_size = block_size,
        temp = temp,
        switch_below = switch_below
    )
    if T<:Integer
        src = Float32.(src)
    end
    if isnothing(dims)
        src .-= m
    else
        if use_gpu_algorithm(backend, prefer_threads)
            if N == 1
                _slicing_mean1d!(backend,block_size)(src,m;ndrange=length(src))
            else 
                _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=length(src))
            end
        else
            if N ==1
                foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do i
                    src[i] -= m
                end
            else
                _slicing_meannd_cpu!(src,m,dims;max_tasks=max_tasks,min_elems=min_elems)
            end
        end
    end
    ntmp = isnothing(dims) ? nothing : m
    res = mapreduce(x->x*x,+,src,backend;
        init=zero(eltype(src)),
        dims=dims,
        max_tasks=max_tasks,
        min_elems=min_elems,
        prefer_threads=prefer_threads,
        block_size=block_size,
        temp=ntmp,
        switch_below=switch_below)
    if isnothing(dims) || N == 1
        res /= (length(src) - ifelse(corrected , 1 , 0))
        return res 
    end
    s = eltype(src)(1)/ (size(src,dims) - ifelse(corrected , 1 , 0))
    res .*= s
    return res
end

the benchmark plots are given in the PR

I missed the reason why we need CPU ak.

I guess KA.jl itself is getting a new POCL (OpenCL) based CPU overhaul, so these numbers likely to change a lot when that lands

Yes, but I would like to add it without adding Statistics.jl to deps, its just a matter of having something that works actually and not as bad as the one I did.

I think I will stick to this for now, its not too bad now I revisited perf boosts and the gpu perfs really looks cool

Naive way to compute variance scales poorly in parallel. Did you see Algorithms for calculating variance - Wikipedia? Also, we have an example of parallel variance in MPI.jl docs: Reduce · MPI.jl. I seem to remember that’s inspired by MPI Reduce and Allreduce · MPI Tutorial

Thanks a lot completly fixed now

1 Like