Hello, I’m trying to add Statistics to AcceleratedKernels.jl in Add Statistics in KA, (only mean and var implemented) by yolhan83 · Pull Request #64 · JuliaGPU/AcceleratedKernels.jl · GitHub, GPU results seems nice but big issue on CPU var implementation not looking good if someone has an idea I’m in.
Also giving the implementation for now
@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,@Const(m))
N = @groupsize()[1]
iblock = @index(Group, Linear)
ithread = @index(Local, Linear)
i = ithread + (iblock - 0x1) * N
if i <= length(src)
src[i] -= m
end
end
@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims))
src_strides = strides(src)
m_strides = strides(m)
nd = length(src_strides)
N = @groupsize()[1]
iblock = @index(Group, Linear) - 0x1
ithread = @index(Local, Linear) - 0x1
tid = ithread + iblock * N
if tid < length(src)
tmp = tid
midx0 = typeof(tid)(0)
KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16
idxi = tmp ÷ src_strides[i]
if i != dims
midx0 += idxi * m_strides[i]
end
tmp = tmp % src_strides[i]
end
src[tid + 0x1] -= m[midx0 + 0x1]
end
end
function _slicing_meannd_cpu!(src::AbstractArray{T,N}, m, dims::Int;max_tasks,min_elems) where {T<:Real, N}
src_strides ::NTuple{N,Int} = strides(src)
m_strides ::NTuple{N,Int} = strides(m)
nd = length(src_strides)
foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc
tmp = isrc - 1
midx0 = 0
KernelAbstractions.Extras.@unroll for i in nd:-1:1
@inbounds idxi = tmp ÷ src_strides[i]
if i != dims
@inbounds midx0 += idxi * m_strides[i]
end
@inbounds tmp = tmp % src_strides[i]
end
@inbounds src[isrc] -= m[midx0 + 1]
end
end
function var!(
src::AbstractArray{T,N},backend::Backend=get_backend(src);
dims::Union{Nothing, Int}=nothing,
corrected ::Bool = true,
# CPU settings - ignored here
max_tasks::Int = Threads.nthreads(),
min_elems::Int = 1,
prefer_threads::Bool=true,
# GPU settings
block_size::Int = 256,
temp::Union{Nothing, AbstractArray} = nothing,
switch_below::Int=0,
) where {T<:Real,N}
m = mean(
src,backend;
dims=dims,
# CPU settings - ignored here
max_tasks = max_tasks,
min_elems = min_elems,
prefer_threads = prefer_threads,
# GPU settings
block_size = block_size,
temp = temp,
switch_below = switch_below
)
if T<:Integer
src = Float32.(src)
end
if isnothing(dims)
src .-= m
else
if use_gpu_algorithm(backend, prefer_threads)
if N == 1
_slicing_mean1d!(backend,block_size)(src,m;ndrange=length(src))
else
_slicing_meannd!(backend,block_size)(src,m,dims;ndrange=length(src))
end
else
if N ==1
foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do i
src[i] -= m
end
else
_slicing_meannd_cpu!(src,m,dims;max_tasks=max_tasks,min_elems=min_elems)
end
end
end
ntmp = isnothing(dims) ? nothing : m
res = mapreduce(x->x*x,+,src,backend;
init=zero(eltype(src)),
dims=dims,
max_tasks=max_tasks,
min_elems=min_elems,
prefer_threads=prefer_threads,
block_size=block_size,
temp=ntmp,
switch_below=switch_below)
if isnothing(dims) || N == 1
res /= (length(src) - ifelse(corrected , 1 , 0))
return res
end
s = eltype(src)(1)/ (size(src,dims) - ifelse(corrected , 1 , 0))
res .*= s
return res
end
the benchmark plots are given in the PR