Error with 3-args dot based on FoldsCUDA

Hi,

I am trying to run a 3 args dot on GPU ( in fact CUDA). I am trying FoldsCUDA for simpler development of reductions but I dont understand why it errors on GPU but ot o CPU. Can someone give me a hint please?

Thanks a lot

using Folds, FoldsThreads, FLoops


function dotf(x,A,y, exec = SequentialEx())
	nx, ny = size(A)
	T = eltype(x)
	@floop exec for i in eachindex(x), j in eachindex(y)
		r = x[i] * A[i,j] * y[j]
		@reduce() do (acc = 0; r)
		   acc += r
	   end
	end
	acc
end


A = rand(1000,1000);x=rand(1000);y=rand(1000)
@time dot(x,A,y)
@time dotf(x,A,y, ThreadedEx())

works well on CPU but not on GPU

using CUDA
CUDA.allowscalar(false)
Ag = CuArray(A)
xg = CuArray(x)
yg = CuArray(y)

dot(xg,Ag,yg) # this does not exists. If it did, I'd be happier

dotf(xg,Ag,yg, CUDAEx())

the FoldsCUDA version errors with

julia> dotf(xg,Ag,yg, CUDAEx())
ERROR: MethodError: no method matching keys(::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
Closest candidates are:
  keys(::Union{Tables.AbstractColumns, Tables.AbstractRow}) at ~/.julia/packages/Tables/PxO1m/src/Tables.jl:184
  keys(::Missings.EachReplaceMissing) at ~/.julia/packages/Missings/r1STI/src/Missings.jl:94
  keys(::Missings.EachFailMissing) at ~/.julia/packages/Missings/r1STI/src/Missings.jl:154
  ...
Stacktrace:
  [1] eachindex(itrs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
    @ Base ./abstractarray.jl:276
  [2] _transduce!(buf::Nothing, rf::Transducers.Reduction{Transducers.Map{typeof(first)}, Transducers.BottomRF{Transducers.AdHocRF{var"#__##oninit_function#293#5", typeof(identity), InitialValues.AdjoinIdentity{var"#__##reducing_function#294#6"{CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, typeof(identity), typeof(identity), var"#__##combine_function#295#7"}}}, init::Transducers.InitOf{Transducers.DefaultInitOf}, arrays::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
    @ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:120
  [3] transduce_impl(rf::Transducers.Reduction{Transducers.Map{typeof(first)}, Transducers.BottomRF{Transducers.AdHocRF{var"#__##oninit_function#293#5", typeof(identity), InitialValues.AdjoinIdentity{var"#__##reducing_function#294#6"{CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, typeof(identity), typeof(identity), var"#__##combine_function#295#7"}}}, init::Transducers.InitOf{Transducers.DefaultInitOf}, arrays::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
    @ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:32
  [4] _transduce_cuda(op::Function, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
    @ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:18
  [5] _transduce_cuda(xf::Transducers.IdentityTransducer, op::Function, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:1
  [6] _transduce_cuda(xf::Transducers.IdentityTransducer, op::Function, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
    @ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:1
  [7] transduce(xf::Transducers.IdentityTransducer, rf::Transducers.AdHocRF{var"#__##oninit_function#293#5", typeof(identity), var"#__##reducing_function#294#6"{CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}, typeof(identity), typeof(identity), var"#__##combine_function#295#7"}, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, exc::CUDAEx{NamedTuple{(:simd,), Tuple{Val{false}}}})
    @ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/api.jl:45
  [8] _fold
    @ ~/.julia/packages/FLoops/3ZEuy/src/reduce.jl:851 [inlined]
  [9] macro expansion
    @ ~/.julia/packages/FLoops/3ZEuy/src/reduce.jl:829 [inlined]
 [10] dotf(x::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, A::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, y::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, exec::CUDAEx{NamedTuple{(), Tuple{}}})
    @ Main ./REPL[4]:4
 [11] top-level scope
    @ REPL[32]:1

One way to port this to GPU is to use

function dot2(x,A,y,tmp) 
	mul!(tmp, A,y)
	dot(x,tmp)
end

maybe @tkf has a better suggestion for FoldsCUDA