Hi,
I am trying to run a three-argument dot (the bilinear form x' * A * y from LinearAlgebra) on the GPU (CUDA, specifically). I am using FoldsCUDA to simplify the development of reductions, but I don't understand why the code errors on the GPU while it works fine on the CPU. Can someone give me a hint, please?
Thanks a lot.
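For reference, this is the operation I mean; with the real-valued arrays used below, LinearAlgebra's three-argument dot reduces to the plain double sum:

using LinearAlgebra
# dot(x, A, y) computes dot(x, A * y) without materializing A * y,
# i.e. the sum over i and j of conj(x[i]) * A[i, j] * y[j]
dot(x, A, y) ≈ sum(x[i] * A[i, j] * y[j] for i in eachindex(x), j in eachindex(y))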
using LinearAlgebra, Folds, FoldsThreads, FLoops

function dotf(x, A, y, exec = SequentialEx())
    nx, ny = size(A)
    T = eltype(x)
    @floop exec for i in eachindex(x), j in eachindex(y)
        r = x[i] * A[i, j] * y[j]
        @reduce() do (acc = 0; r)   # acc accumulates the double sum over i and j
            acc += r
        end
    end
    acc
end
A = rand(1000, 1000); x = rand(1000); y = rand(1000)
@time dot(x, A, y)
@time dotf(x, A, y, ThreadedEx())
This works well on the CPU, but not on the GPU:
using CUDA, FoldsCUDA   # FoldsCUDA provides the CUDAEx executor
CUDA.allowscalar(false)
Ag = CuArray(A)
xg = CuArray(x)
yg = CuArray(y)
dot(xg, Ag, yg)              # there is no GPU method for this; if there were, I'd be happier
dotf(xg, Ag, yg, CUDAEx())
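(I realize I could compute the same value on the GPU in two steps, e.g. with the line below, but that materializes the intermediate vector Ag * yg, which is exactly what I was hoping a fused reduction would avoid.)

dot(xg, Ag * yg)   # two-step alternative: matrix-vector product, then a dot; allocates Ag * yg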
The FoldsCUDA version errors with:
julia> dotf(xg,Ag,yg, CUDAEx())
ERROR: MethodError: no method matching keys(::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
Closest candidates are:
keys(::Union{Tables.AbstractColumns, Tables.AbstractRow}) at ~/.julia/packages/Tables/PxO1m/src/Tables.jl:184
keys(::Missings.EachReplaceMissing) at ~/.julia/packages/Missings/r1STI/src/Missings.jl:94
keys(::Missings.EachFailMissing) at ~/.julia/packages/Missings/r1STI/src/Missings.jl:154
...
Stacktrace:
[1] eachindex(itrs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
@ Base ./abstractarray.jl:276
[2] _transduce!(buf::Nothing, rf::Transducers.Reduction{Transducers.Map{typeof(first)}, Transducers.BottomRF{Transducers.AdHocRF{var"#__##oninit_function#293#5", typeof(identity), InitialValues.AdjoinIdentity{var"#__##reducing_function#294#6"{CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, typeof(identity), typeof(identity), var"#__##combine_function#295#7"}}}, init::Transducers.InitOf{Transducers.DefaultInitOf}, arrays::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
@ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:120
[3] transduce_impl(rf::Transducers.Reduction{Transducers.Map{typeof(first)}, Transducers.BottomRF{Transducers.AdHocRF{var"#__##oninit_function#293#5", typeof(identity), InitialValues.AdjoinIdentity{var"#__##reducing_function#294#6"{CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, typeof(identity), typeof(identity), var"#__##combine_function#295#7"}}}, init::Transducers.InitOf{Transducers.DefaultInitOf}, arrays::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
@ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:32
[4] _transduce_cuda(op::Function, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
@ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:18
[5] _transduce_cuda(xf::Transducers.IdentityTransducer, op::Function, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:1
[6] _transduce_cuda(xf::Transducers.IdentityTransducer, op::Function, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
@ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/kernels.jl:1
[7] transduce(xf::Transducers.IdentityTransducer, rf::Transducers.AdHocRF{var"#__##oninit_function#293#5", typeof(identity), var"#__##reducing_function#294#6"{CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}, typeof(identity), typeof(identity), var"#__##combine_function#295#7"}, init::Transducers.InitOf{Transducers.DefaultInitOf}, xs::Base.Iterators.ProductIterator{Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, exc::CUDAEx{NamedTuple{(:simd,), Tuple{Val{false}}}})
@ FoldsCUDA ~/.julia/packages/FoldsCUDA/Mo35m/src/api.jl:45
[8] _fold
@ ~/.julia/packages/FLoops/3ZEuy/src/reduce.jl:851 [inlined]
[9] macro expansion
@ ~/.julia/packages/FLoops/3ZEuy/src/reduce.jl:829 [inlined]
[10] dotf(x::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, A::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, y::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, exec::CUDAEx{NamedTuple{(), Tuple{}}})
@ Main ./REPL[4]:4
[11] top-level scope
@ REPL[32]:1
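If it helps to narrow the question down: reading the stack trace, the failure seems to come from FoldsCUDA calling eachindex on the Base.Iterators.ProductIterator that the two-index loop (for i in ..., j in ...) is lowered to, and keys/eachindex is not defined for ProductIterator. Would rewriting the loop over CartesianIndices(A) be the right way to avoid the product iterator? Below is a sketch of what I have in mind (the CartesianIndices loop and the zero(eltype(x)) accumulator init are just my guesses; I have not verified that this variant actually runs under CUDAEx()):

function dotf2(x, A, y, exec = SequentialEx())
    @floop exec for I in CartesianIndices(A)
        i, j = Tuple(I)                        # row and column index of A
        r = x[i] * A[i, j] * y[j]
        @reduce() do (acc = zero(eltype(x)); r)   # type-stable accumulator init
            acc += r
        end
    end
    return acc
end

dotf2(x, A, y, ThreadedEx())     # agrees with dot(x, A, y) on the CPU, up to rounding
dotf2(xg, Ag, yg, CUDAEx())      # does this sidestep the ProductIterator problem?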