Here is a very simple attempt at computing a dot product inside a GPU kernel. I have tried

c[i] = transpose(a[i, :]) * b[:, i]
c[i] = CUDA.dot(a[i, :], b[:, i])
c[i] = CUBLAS.dot(4, a[i, :], b[:, i])

but all of them report errors. Is this a bug, or am I doing something wrong? Thanks!

The minimal example below uses the first variant, followed by the error it produces:
using CUDA

a = CUDA.rand(4, 4)
b = CUDA.rand(4, 4)
c = CUDA.zeros(4)

# each thread i should compute the dot product of row i of `a` with column i of `b`
function foo!(a, b, c)
    i = threadIdx().x
    @inbounds c[i] = transpose(a[i, :]) * b[:, i]
    return nothing
end

@cuda threads = 4 foo!(a, b, c)
ERROR: InvalidIRError: compiling MethodInstance for foo!(::CuDeviceMatrix{Float32, 1}, ::CuDeviceMatrix{Float32, 1}, ::CuDeviceVector{Float32, 1}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
[1] dot
@ ~/julia-1.9.0/share/julia/stdlib/v1.9/LinearAlgebra/src/blas.jl:345
[2] dot
@ ~/julia-1.9.0/share/julia/stdlib/v1.9/LinearAlgebra/src/blas.jl:395
[3] dot
@ ~/julia-1.9.0/share/julia/stdlib/v1.9/LinearAlgebra/src/matmul.jl:14
[4] *
@ ~/julia-1.9.0/share/julia/stdlib/v1.9/LinearAlgebra/src/adjtrans.jl:434
[5] foo!
@ ~/trixi_cuda/test.jl:10
Reason: unsupported dynamic function invocation (call to var"#sprint#484"(context, sizehint::Integer, ::typeof(sprint), f::Function, args...) @ Base strings/io.jl:107)
Stacktrace:
[1] sprint
@ ./strings/io.jl:107
[2] String
@ ./strings/lazy.jl:83
[3] convert
@ ./strings/basic.jl:232
[4] DimensionMismatch
@ ./array.jl:12
[5] dot
@ ~/julia-1.9.0/share/julia/stdlib/v1.9/LinearAlgebra/src/blas.jl:394
[6] dot
@ ~/julia-1.9.0/share/julia/stdlib/v1.9/LinearAlgebra/src/matmul.jl:14
[7] *
@ ~/julia-1.9.0/share/julia/stdlib/v1.9/LinearAlgebra/src/adjtrans.jl:434
[8] foo!
@ ~/trixi_cuda/test.jl:10
Reason: unsupported call through a literal pointer (call to ijl_alloc_array_1d)
Stacktrace:
[1] Array
@ ./boot.jl:477
[2] Array
@ ./boot.jl:486
[3] similar
@ ./abstractarray.jl:847
[4] similar
@ ./abstractarray.jl:836
[5] _unsafe_getindex
@ ./multidimensional.jl:873
[6] _getindex
@ ./multidimensional.jl:861
[7] getindex
@ ./abstractarray.jl:1294
[8] foo!
@ ~/trixi_cuda/test.jl:10
Reason: unsupported dynamic function invocation (call to print_to_string(xs...) @ Base strings/io.jl:133)
Stacktrace:
[1] string
@ ./strings/io.jl:185
[2] throw_checksize_error
@ ./multidimensional.jl:910
[3] multiple call sites
@ unknown:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/NVLGB/src/validation.jl:149
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/NVLGB/src/driver.jl:411 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/NVLGB/src/driver.jl:410 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/NVLGB/src/utils.jl:89
[6] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/NVLGB/src/driver.jl:118
[7] codegen
@ ~/.julia/packages/GPUCompiler/NVLGB/src/driver.jl:92 [inlined]
[8] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/NVLGB/src/driver.jl:88
[9] compile
@ ~/.julia/packages/GPUCompiler/NVLGB/src/driver.jl:79 [inlined]
[10] compile(job::GPUCompiler.CompilerJob, ctx::LLVM.ThreadSafeContext)
@ CUDA ~/.julia/packages/CUDA/pCcGc/src/compiler/compilation.jl:125
[11] #1032
@ ~/.julia/packages/CUDA/pCcGc/src/compiler/compilation.jl:120 [inlined]
[12] LLVM.ThreadSafeContext(f::CUDA.var"#1032#1033"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ LLVM ~/.julia/packages/LLVM/5aiiG/src/executionengine/ts_module.jl:14
[13] JuliaContext
@ ~/.julia/packages/GPUCompiler/NVLGB/src/driver.jl:35 [inlined]
[14] compile
@ ~/.julia/packages/CUDA/pCcGc/src/compiler/compilation.jl:119 [inlined]
[15] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/NVLGB/src/execution.jl:125
[16] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/NVLGB/src/execution.jl:103
[17] macro expansion
@ ~/.julia/packages/CUDA/pCcGc/src/compiler/execution.jl:318 [inlined]
[18] macro expansion
@ ./lock.jl:267 [inlined]
[19] cufunction(f::typeof(foo!), tt::Type{Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{Float32, 1}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA ~/.julia/packages/CUDA/pCcGc/src/compiler/execution.jl:313
[20] cufunction(f::typeof(foo!), tt::Type{Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{Float32, 1}}})
@ CUDA ~/.julia/packages/CUDA/pCcGc/src/compiler/execution.jl:310
[21] top-level scope
@ ~/.julia/packages/CUDA/pCcGc/src/compiler/execution.jl:104
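For context, what I am ultimately trying to express is the per-thread reduction below, written as an explicit scalar loop so it avoids the array slices and the BLAS call entirely. This is only a sketch of the intent (foo_loop! is just a placeholder name), not necessarily the idiomatic way to do it:

using CUDA

a = CUDA.rand(4, 4)
b = CUDA.rand(4, 4)
c = CUDA.zeros(4)

# each thread accumulates row i of `a` times column i of `b` in a scalar loop,
# so no temporary arrays are allocated and no BLAS routine is called on the device
function foo_loop!(a, b, c)
    i = threadIdx().x
    s = zero(eltype(c))
    for k in axes(a, 2)
        @inbounds s += a[i, k] * b[k, i]
    end
    @inbounds c[i] = s
    return nothing
end

@cuda threads = 4 foo_loop!(a, b, c)

If an explicit loop like this is the recommended pattern, I am happy to use it, but the error above made me wonder whether the slicing/dot variants are supposed to work inside a kernel at all.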