I need to find the indices and values of the two largest elements in a 2D array. I whipped up a foldl
-based solution for the CPU that’s much faster than Base.findmax
and Base.partialsortperm
, but it runs like molasses on the GPU compared to CUDA.findmax
.
function findmax2(prev, curr)
i, v = curr
v > prev.first.v && return (first=(; i, v), second=prev.first)
v > prev.second.v && return (first=prev.first, second=(; i, v))
return prev
end
julia> dA = CUDA.rand(512, 512); cA = Array(dA);
CPU:
julia> @btime foldl(findmax2, enumerate($cA), init=(first=(i=0, v=0.0f0), second=(i=0, v=0.0f0)))
284.799 μs (0 allocations: 0 bytes)
(first = (i = 20521, v = 0.99999803f0), second = (i = 23437, v = 0.9999958f0))
julia> @btime findmax($cA)
798.600 μs (0 allocations: 0 bytes)
(0.99999803f0, CartesianIndex(41, 41))
julia> @btime partialsortperm(vec($cA), 1:2, rev=true)
3.654 ms (7 allocations: 2.00 MiB)
2-element view(::Array{Int64,1}, 1:2) with eltype Int64:
20521
23437
GPU:
GPU benchmark wrappers
function run_findmax(A)
CUDA.@sync begin
ans = findmax(A)
end
return ans
end
function run_findmax2(A)
CUDA.@sync begin
ans = foldl(findmax2, enumerate(A), init=(first=(i=0, v=0.0f0), second=(i=0, v=0.0f0)))
end
return ans
end
julia> @btime run_findmax2($dA)
2.548 s (262148 allocations: 24.00 MiB)
(first = (i = 20521, v = 0.99999803f0), second = (i = 23437, v = 0.9999958f0))
julia> @btime run_findmax($dA)
120.799 μs (123 allocations: 4.80 KiB)
(0.99999803f0, CartesianIndex(41, 41))
I tried to get something working with @tkf’s FoldsCUDA.jl, and it produces fast but slightly incorrect results on the GPU, throws an error with threaded CPU execution, and produces slow but correct results with single-threaded CPU execution.
Floops.jl / FoldsCUDA.jl
function folds_findmax2(xs, ex = xs isa CuArray ? CUDAEx() : ThreadedEx())
xtypemin = typemin(eltype(xs))
@floop ex for (i, x) in zip(eachindex(xs), xs)
@reduce() do (xmax1=xtypemin; x), (imax1=-1; i)
if isless(xmax1, x)
imax2, xmax2 = imax1, xmax1
imax1, xmax1 = i, x
end
end
i == imax1 && continue
@reduce() do (xmax2=xtypemin; x), (imax2=-1; i)
if isless(xmax2, x)
imax2, xmax2 = i, x
end
end
end
return ((imax1, xmax1), (imax2, xmax2))
end
function run_folds_findmax2(dA)
CUDA.@sync begin
ans = folds_findmax2(dA)
end
return ans
end
julia> folds_findmax2(cA)
ERROR: ArgumentError: `halve(zip(...))` requires collections with identical `size`
Stacktrace:
[1] shape at C:\Users\alexa\.julia\packages\SplittablesBase\dZ3Qr\src\implementations.jl:226 [inlined]
...
julia> @btime run_folds_findmax2($dA)
173.600 μs (137 allocations: 5.67 KiB)
((20521, 0.99999803f0), (98901, 0.9993288f0))
julia> @btime folds_findmax2($cA, SequentialEx(simd = Val(true)))
1.118 ms (0 allocations: 0 bytes)
((20521, 0.99999803f0), (23437, 0.9999958f0))
Any insights on where I went wrong with FoldsCUDA, or how to avert (invisible?) scalar indexing for my original foldl
-based solution?
(edited to include single-threaded FLoops.jl results)