Hi,
I’m trying to run a simple kernel with shared static memory having dimension 2.
The trouble starts when I try to assing a value to the shared static memory, the kernel fail to compile.
module GPU_SHARED_BITMAP
function kernel(image)
shared = @cuStaticSharedMem(Float32, (16,16))
shared[threadIdx().x][threadIdx().y] = 1.0f0
return nothing
end
....
end
@device_code_warntype @cuda blocks=(64,64) threads=(16,16) GPU_SHARED_BITMAP.kernel(CUDA.zeros(Float32, 512,512))
This returns:
PTX CompilerJob of kernel kernel(CuDeviceArray{Float32,2,1}) for sm_50
Variables
#self#::Core.Compiler.Const(Main.GPU_SHARED_BITMAP.kernel, false)
image::CuDeviceArray{Float32,2,1}
len::Int64
ptr::Core.LLVMPtr{Float32,3}
shared::CuDeviceArray{Float32,2,3}
Body::Union{}
1 ─ %1 = Core.tuple(16, 16)::Core.Compiler.Const((16, 16), false)
│ (len = CUDA.prod(%1))
│ %3 = CUDA.Val(Symbol("##static_shmem#292"))::Core.Compiler.Const(Val{Symbol("##static_shmem#292")}(), true)
│ %4 = CUDA.Val(len::Core.Compiler.Const(256, false))::Core.Compiler.Const(Val{256}(), true)
│ (ptr = CUDA.emit_shmem(%3, Main.GPU_SHARED_BITMAP.Float32, %4))
│ %6 = Core.tuple(16, 16)::Core.Compiler.Const((16, 16), false)
│ (shared = CUDA.CuDeviceArray(%6, ptr))
│ %8 = shared::Core.Compiler.PartialStruct(CuDeviceArray{Float32,2,3}, Any[Core.Compiler.Const((16, 16), false), Core.LLVMPtr{Float32,3}])::Core.Compiler.PartialStruct(CuDeviceArray{Float32,2,3}, Any[Core.Compiler.Const((16, 16), false), Core.LLVMPtr{Float32,3}])
│ %9 = Main.GPU_SHARED_BITMAP.threadIdx()::NamedTuple{(:x, :y, :z),Tuple{Int64,Int64,Int64}}
│ %10 = Base.getproperty(%9, :x)::Int64
│ %11 = Base.getindex(%8, %10)::Float32
│ %12 = Main.GPU_SHARED_BITMAP.threadIdx()::NamedTuple{(:x, :y, :z),Tuple{Int64,Int64,Int64}}
│ %13 = Base.getproperty(%12, :y)::Int64
│ Base.setindex!(%11, 1.0f0, %13)
└── Core.Compiler.Const(:(return Main.GPU_SHARED_BITMAP.nothing), false)
ERROR: GPU compilation of kernel kernel(CuDeviceArray{Float32,2,1}) failed
KernelError: kernel returns a value of type `Union{}`
Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.
Stacktrace:
[1] check_method(::GPUCompiler.CompilerJob) at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\validation.jl:18
[2] macro expansion at C:\Users\Nicola\.julia\packages\TimerOutputs\dVnaw\src\TimerOutput.jl:206 [inlined]
[3] codegen(::Symbol, ::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\driver.jl:63
[4] compile(::Symbol, ::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\driver.jl:39
[5] compile at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\driver.jl:35 [inlined]
[6] _cufunction(::GPUCompiler.FunctionSpec{typeof(Main.GPU_SHARED_BITMAP.kernel),Tuple{CuDeviceArray{Float32,2,1}}}; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at C:\Users\Nicola\.julia\packages\CUDA\gKMm0\src\compiler\execution.jl:311
[7] _cufunction at C:\Users\Nicola\.julia\packages\CUDA\gKMm0\src\compiler\execution.jl:305 [inlined]
[8] check_cache(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(Main.GPU_SHARED_BITMAP.kernel),Tuple{CuDeviceArray{Float32,2,1}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\cache.jl:24
[9] kernel at c:\Users\Nicola\juno projects\Julia-GPU-Examples\src\gpu_shared_bitmap.jl:17 [inlined]
[10] cached_compilation(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(Main.GPU_SHARED_BITMAP.kernel),Tuple{CuDeviceArray{Float32,2,1}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}})
at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\cache.jl:0
[11] cached_compilation at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\cache.jl:40 [inlined]
[12] cufunction(::typeof(Main.GPU_SHARED_BITMAP.kernel), ::Type{Tuple{CuDeviceArray{Float32,2,1}}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at C:\Users\Nicola\.julia\packages\CUDA\gKMm0\src\compiler\execution.jl:299
[13] cufunction(::typeof(Main.GPU_SHARED_BITMAP.kernel), ::Type{Tuple{CuDeviceArray{Float32,2,1}}}) at C:\Users\Nicola\.julia\packages\CUDA\gKMm0\src\compiler\execution.jl:294
[14] top-level scope at C:\Users\Nicola\.julia\packages\CUDA\gKMm0\src\compiler\execution.jl:109
[15] top-level scope at C:\Users\Nicola\.julia\packages\GPUCompiler\5xT46\src\reflection.jl:144
Windows 10
Julia 1.5.1
CUDA.jl 2.0.2