Consider the following code where I create a tuple of structures and pass it into the GPU kernel:
import CUDA
"""
    Foo{T}

Immutable pair of values `a` and `b` that share the same type `T`.
"""
struct Foo{T}
    a::T
    b::T
end
"""
    func(u, foos)

Launch `kernel(u, foos)` on the GPU via `CUDA.@cuda`, choosing the launch
configuration with a callback.

# Arguments
- `u`: array passed to the kernel; its length determines the number of blocks.
- `foos`: collection forwarded unchanged to `kernel`.

# Returns
`nothing`.
"""
function func(u, foos)
    n = length(u)

    # Callback given to `@cuda`: takes the thread count reported by
    # `CUDA.launch_configuration` and uses enough blocks to cover `n` elements.
    function get_config(k)
        cfg = CUDA.launch_configuration(k.fun)
        return (threads=cfg.threads, blocks=cld(n, cfg.threads))
    end

    CUDA.@cuda config=get_config kernel(u, foos)
    return nothing
end
"""
    kernel(u, foos)

GPU kernel that stores `foos[i].a + foos[i].b` into `u[i]` for every index `i`
handled by the calling thread.

Each thread starts at its global 1-based index and advances by the total number
of launched threads (grid-stride loop) until it passes `length(u)`.

Returns `nothing`.
"""
function kernel(u, foos)
    threads_per_block = CUDA.blockDim().x
    # Global 1-based index of this thread across the whole grid.
    start = (CUDA.blockIdx().x - 1) * threads_per_block + CUDA.threadIdx().x
    # Total number of threads in the grid.
    step = threads_per_block * CUDA.gridDim().x
    for i in start:step:length(u)
        foo = foos[i]
        u[i] = foo.a + foo.b
    end
    return nothing
end
# ------------------------------------------------------------------------------
N = 10
# N = 511 # causes an error

# Tuple of N identical `Foo{Float32}` values, i.e. an `NTuple{N,Foo{Float32}}`.
foos = ntuple(_ -> Foo(1f0, 2f0), N)

# Output array with one element per entry of `foos`.
u = CUDA.zeros(N)

func(u, foos)
With `N` up to 500 everything works fine. However, starting from `N = 511`, I start to see the following error:
julia test_gpu_tuple_of_struct.jl
ERROR: LoadError: CUDA error: device kernel image is invalid (code 200, ERROR_INVALID_IMAGE)
Stacktrace:
[1] CUDA.CuModule(::String, ::Dict{CUDA.CUjit_option_enum,Any}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/lib/cudadrv/module.jl:40
[2] _cufunction(::GPUCompiler.FunctionSpec{typeof(kernel),Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:337
[3] _cufunction at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:304 [inlined]
[4] check_cache(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(kernel),Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/GPUCompiler/pCBTA/src/cache.jl:24
[5] kernel at /media/storage/julia/test_suite/equations/13_plasma_ensemble/test_gpu_tuple_of_struct.jl:26 [inlined]
[6] cached_compilation(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(kernel),Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/GPUCompiler/pCBTA/src/cache.jl:0
[7] cached_compilation at /home/fedoroff/.julia/packages/GPUCompiler/pCBTA/src/cache.jl:40 [inlined]
[8] cufunction(::typeof(kernel), ::Type{Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:298
[9] cufunction(::typeof(kernel), ::Type{Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:293
[10] macro expansion at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:109 [inlined]
[11] func(::CUDA.CuArray{Float32,1}, ::NTuple{511,Foo{Float32}}) at /media/storage/julia/test_suite/equations/13_plasma_ensemble/test_gpu_tuple_of_struct.jl:20
[12] top-level scope at /media/storage/julia/test_suite/equations/13_plasma_ensemble/test_gpu_tuple_of_struct.jl:46
[13] include(::Function, ::Module, ::String) at ./Base.jl:380
[14] include(::Module, ::String) at ./Base.jl:368
[15] exec_options(::Base.JLOptions) at ./client.jl:296
[16] _start() at ./client.jl:506
Can you please explain to me what is wrong?