I need to find the indices and values of the two largest elements in a 2D array. I whipped up a foldl-based solution for the CPU that’s much faster than Base.findmax and Base.partialsortperm, but it runs like molasses on the GPU compared to CUDA.findmax.
function findmax2(prev, curr)
    i, v = curr
    v > prev.first.v && return (first=(; i, v), second=prev.first)
    v > prev.second.v && return (first=prev.first, second=(; i, v))
    return prev
end
julia> dA = CUDA.rand(512, 512); cA = Array(dA);
CPU:
julia> @btime foldl(findmax2, enumerate($cA), init=(first=(i=0, v=0.0f0), second=(i=0, v=0.0f0)))
  284.799 μs (0 allocations: 0 bytes)
(first = (i = 20521, v = 0.99999803f0), second = (i = 23437, v = 0.9999958f0))
julia> @btime findmax($cA)
  798.600 μs (0 allocations: 0 bytes)
(0.99999803f0, CartesianIndex(41, 41))
julia> @btime partialsortperm(vec($cA), 1:2, rev=true)
  3.654 ms (7 allocations: 2.00 MiB)
2-element view(::Array{Int64,1}, 1:2) with eltype Int64:
 20521
 23437
GPU:
GPU benchmark wrappers
function run_findmax(A)
    CUDA.@sync begin
        ans = findmax(A)
    end
    return ans
end
function run_findmax2(A)
    CUDA.@sync begin
        ans = foldl(findmax2, enumerate(A), init=(first=(i=0, v=0.0f0), second=(i=0, v=0.0f0)))
    end
    return ans
end
julia> @btime run_findmax2($dA)
  2.548 s (262148 allocations: 24.00 MiB)
(first = (i = 20521, v = 0.99999803f0), second = (i = 23437, v = 0.9999958f0))
julia> @btime run_findmax($dA)
  120.799 μs (123 allocations: 4.80 KiB)
(0.99999803f0, CartesianIndex(41, 41))
I tried to get something working with @tkf’s FoldsCUDA.jl, and it produces fast but slightly incorrect results on the GPU, throws an error with threaded CPU execution, and produces slow but correct results with single-threaded CPU execution.
Floops.jl / FoldsCUDA.jl
function folds_findmax2(xs, ex = xs isa CuArray ? CUDAEx() : ThreadedEx())
    xtypemin = typemin(eltype(xs))
    @floop ex for (i, x) in zip(eachindex(xs), xs)
        @reduce() do (xmax1=xtypemin; x), (imax1=-1; i)
            if isless(xmax1, x)
                imax2, xmax2 = imax1, xmax1
                imax1, xmax1 = i, x
            end
        end
        i == imax1 && continue
        @reduce() do (xmax2=xtypemin; x), (imax2=-1; i)
            if isless(xmax2, x)
                imax2, xmax2 = i, x
            end
        end
    end
    return ((imax1, xmax1), (imax2, xmax2))
end
function run_folds_findmax2(dA)
    CUDA.@sync begin
        ans = folds_findmax2(dA)
    end
    return ans
end
julia> folds_findmax2(cA)
ERROR: ArgumentError: `halve(zip(...))` requires collections with identical `size`
Stacktrace:
 [1] shape at C:\Users\alexa\.julia\packages\SplittablesBase\dZ3Qr\src\implementations.jl:226 [inlined]
...
julia> @btime run_folds_findmax2($dA)
  173.600 μs (137 allocations: 5.67 KiB)
((20521, 0.99999803f0), (98901, 0.9993288f0))
julia> @btime folds_findmax2($cA, SequentialEx(simd = Val(true)))
  1.118 ms (0 allocations: 0 bytes)
((20521, 0.99999803f0), (23437, 0.9999958f0))
Any insights on where I went wrong with FoldsCUDA, or how to avert (invisible?) scalar indexing for my original foldl-based solution?
(edited to include single-threaded FLoops.jl results)
