Hi all,
I want to write a CUDA kernel that uses some local memory per thread. I read here and here that a possible way to do this is to use StaticArrays, but I couldn't find an MWE of it.
The code I want to run is below; I have to store some intermediate results in the vector S:
using CuArrays, CUDAnative, CUDAdrv, StaticArrays

const M = 128
const N = 56

function kernel_staticarrays!(M, N, x_d, n_d)
    index = threadIdx().x
    stride = blockDim().x
    # S = MVector{N+1, Float32}(undef)
    S = SizedVector{N+1, Float32}(undef)
    for p = index:stride:M
        S[1] = 1.0f0
        for i = 2:N+1
            S[i] = 0.0f0
        end
        for i = 1:M
            i == p && continue
            for j = min(1, N):-1:max(1, N-M+1)
                S[j+1] += x_d[i] * S[j]
            end
        end
        n_d[p] = S[N+1]
    end
    return nothing
end

function main(M, N)
    x_d = CuArrays.rand(Float32, M)
    n_d = CuArrays.fill(0.0f0, M)
    numthreads = 256
    @cuda threads=numthreads kernel_staticarrays!(M, N, x_d, n_d)
    n = Array(n_d)
    display(n)
end

main(M, N)
The error stacktrace when I use SizedVector is:
ERROR: LoadError: InvalidIRError: compiling kernel_staticarrays!(Int64, Int64, CuDeviceArray{Float32,1,CUDAnative.AS.Global}, CuDeviceArray{Float32,1,CUDAnative.AS.Global}) resulted in invalid LLVM IR
Reason: unsupported call to the Julia runtime (call to jl_f_apply_type)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:11
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:11
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:14
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:16
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:21
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:24
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] macro expansion at /home/iff/arubio/.julia/packages/LLVM/DAnFH/src/interop/base.jl:52
[2] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[3] unsafe_store! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[4] setindex! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/array.jl:84
[5] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:24
Reason: unsupported call to the Julia runtime (call to jl_type_error)
Stacktrace:
[1] macro expansion at /home/iff/arubio/.julia/packages/LLVM/DAnFH/src/interop/base.jl:52
[2] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[3] unsafe_store! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[4] setindex! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/array.jl:84
[5] kernel_staticarrays! at /scratch-global/arubio/950034/agp_gpu.jl:24
Stacktrace:
[1] check_ir(::CUDAnative.CompilerJob, ::LLVM.Module) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/validation.jl:114
[2] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:188 [inlined]
[3] macro expansion at /home/iff/arubio/.julia/packages/TimerOutputs/7Id5J/src/TimerOutput.jl:228 [inlined]
[4] #codegen#156(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.codegen), ::Symbol, ::CUDAnative.CompilerJob) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:186
[5] #codegen at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:0 [inlined]
[6] #compile#155(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.compile), ::Symbol, ::CUDAnative.CompilerJob) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:47
[7] #compile at ./none:0 [inlined]
[8] #compile#154 at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:28 [inlined]
[9] #compile at ./none:0 [inlined] (repeats 2 times)
[10] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:392 [inlined]
[11] #cufunction#200(::Nothing, ::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(cufunction), ::typeof(kernel_staticarrays!), ::Type{Tuple{Int64,Int64,CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:359
[12] cufunction(::Function, ::Type) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:359
[13] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:176 [inlined]
[14] macro expansion at ./gcutils.jl:87 [inlined]
[15] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:173 [inlined]
[16] main(::Int64, ::Int64) at /scratch-global/arubio/950034/agp_gpu.jl:35
[17] top-level scope at /scratch-global/arubio/950034/agp_gpu.jl:41
[18] include at ./boot.jl:328 [inlined]
[19] include_relative(::Module, ::String) at ./loading.jl:1094
[20] include(::Module, ::String) at ./Base.jl:31
[21] exec_options(::Base.JLOptions) at ./client.jl:295
[22] _start() at ./client.jl:464
in expression starting at /scratch-global/arubio/950034/agp_gpu.jl:41
And the error stacktrace when I use MVector is:
ERROR: LoadError: InvalidIRError: compiling kernel_staticarrays!(Int64, Int64, CuDeviceArray{Float32,1,CUDAnative.AS.Global}, CuDeviceArray{Float32,1,CUDAnative.AS.Global}) resulted in invalid LLVM IR
Reason: unsupported call to the Julia runtime (call to jl_f_apply_type)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:10
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:10
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:14
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:16
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:21
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:24
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
[1] macro expansion at /home/iff/arubio/.julia/packages/LLVM/DAnFH/src/interop/base.jl:52
[2] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[3] unsafe_store! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[4] setindex! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/array.jl:84
[5] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:24
Reason: unsupported call to the Julia runtime (call to jl_type_error)
Stacktrace:
[1] macro expansion at /home/iff/arubio/.julia/packages/LLVM/DAnFH/src/interop/base.jl:52
[2] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[3] unsafe_store! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/pointer.jl:167
[4] setindex! at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/device/array.jl:84
[5] kernel_staticarrays! at /scratch-global/arubio/950033/agp_gpu.jl:24
Stacktrace:
[1] check_ir(::CUDAnative.CompilerJob, ::LLVM.Module) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/validation.jl:114
[2] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:188 [inlined]
[3] macro expansion at /home/iff/arubio/.julia/packages/TimerOutputs/7Id5J/src/TimerOutput.jl:228 [inlined]
[4] #codegen#156(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.codegen), ::Symbol, ::CUDAnative.CompilerJob) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:186
[5] #codegen at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:0 [inlined]
[6] #compile#155(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.compile), ::Symbol, ::CUDAnative.CompilerJob) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:47
[7] #compile at ./none:0 [inlined]
[8] #compile#154 at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/compiler/driver.jl:28 [inlined]
[9] #compile at ./none:0 [inlined] (repeats 2 times)
[10] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:392 [inlined]
[11] #cufunction#200(::Nothing, ::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(cufunction), ::typeof(kernel_staticarrays!), ::Type{Tuple{Int64,Int64,CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:359
[12] cufunction(::Function, ::Type) at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:359
[13] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:176 [inlined]
[14] macro expansion at ./gcutils.jl:87 [inlined]
[15] macro expansion at /home/iff/arubio/.julia/packages/CUDAnative/Phjco/src/execution.jl:173 [inlined]
[16] main(::Int64, ::Int64) at /scratch-global/arubio/950033/agp_gpu.jl:35
[17] top-level scope at /scratch-global/arubio/950033/agp_gpu.jl:41
[18] include at ./boot.jl:328 [inlined]
[19] include_relative(::Module, ::String) at ./loading.jl:1094
[20] include(::Module, ::String) at ./Base.jl:31
[21] exec_options(::Base.JLOptions) at ./client.jl:295
[22] _start() at ./client.jl:464
in expression starting at /scratch-global/arubio/950033/agp_gpu.jl:41
What am I doing wrong? Also, do I need to specify somehow that M and N are compile-time constants? I have also read something about cuDynamicSharedMem, but I think per-thread local memory might perform better.
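
In case it helps the discussion, here is a minimal sketch of the Val-based version I have in mind, assuming the root problem is that N+1 in the type parameter is not a compile-time constant (kernel_val! and the launch line are my own untested guesses, not something I have confirmed to work):

using CuArrays, CUDAnative, StaticArrays

# Hypothetical sketch: pass N as Val(N) so the MVector size is a
# compile-time constant (a type parameter) inside the kernel.
function kernel_val!(::Val{N}, x_d, n_d) where {N}
    index = threadIdx().x
    stride = blockDim().x
    M = length(x_d)
    S = MVector{N + 1, Float32}(undef)  # size is known to the compiler here
    for p = index:stride:M
        S[1] = 1.0f0
        for i = 2:N+1
            S[i] = 0.0f0
        end
        for i = 1:M
            i == p && continue
            for j = min(1, N):-1:max(1, N - M + 1)
                S[j+1] += x_d[i] * S[j]
            end
        end
        n_d[p] = S[N+1]
    end
    return nothing
end

# Launch with the size baked into the type:
# @cuda threads=256 kernel_val!(Val(N), x_d, n_d)

My understanding is that this moves the array size into the type domain, which should at least get rid of the jl_f_apply_type call, but I am not sure whether mutating an MVector is actually supported on the device. For the cuDynamicSharedMem route, I gather I would allocate the buffer inside the kernel with @cuDynamicSharedMem(Float32, ...), pass the total size via shmem= in the @cuda call, and have each thread index its own slice, but I have not tried that either.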
Thanks in advance