Passing overly long tuples into a CUDA kernel causes an error

Consider the following code where I create a tuple of structures and pass it into the GPU kernel:

import CUDA


"""
    Foo{T}

An immutable pair of same-typed values `a` and `b`.
"""
struct Foo{T}
    a::T
    b::T
end


"""
    func(u, foos)

Launch `kernel(u, foos)` on the GPU, picking the launch geometry from the
occupancy API so the whole of `u` is covered.
"""
function func(u, foos)
    n = length(u)

    # The config callback receives the compiled kernel object; query the
    # occupancy-based launch configuration from its underlying function.
    function launch_params(compiled)
        cfg = CUDA.launch_configuration(compiled.fun)
        return (threads = cfg.threads, blocks = cld(n, cfg.threads))
    end

    CUDA.@cuda config=launch_params kernel(u, foos)
    return nothing
end


"""
    kernel(u, foos)

GPU kernel: write `foos[i].a + foos[i].b` into `u[i]` for every index,
using a grid-stride loop so any grid size covers the array.
"""
function kernel(u, foos)
    start = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
    step = CUDA.gridDim().x * CUDA.blockDim().x
    i = start
    while i <= length(u)
        u[i] = foos[i].a + foos[i].b
        i += step
    end
    return nothing
end


# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
N = 10
# N = 511   # causes an error (kernel parameter space limit, see below)

# Build the NTuple{N,Foo{Float32}} directly with `ntuple` instead of filling
# an `Array{Foo}` (abstract element type — not concretely typed) and then
# splatting it. The resulting tuple of concrete isbits elements is identical.
foos = ntuple(_ -> Foo(1f0, 2f0), N)

u = CUDA.zeros(N)

func(u, foos)

With N up to 500 everything works fine. However, starting from N = 511 I start to see the following error:

julia test_gpu_tuple_of_struct.jl 
ERROR: LoadError: CUDA error: device kernel image is invalid (code 200, ERROR_INVALID_IMAGE)

Stacktrace:
 [1] CUDA.CuModule(::String, ::Dict{CUDA.CUjit_option_enum,Any}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/lib/cudadrv/module.jl:40
 [2] _cufunction(::GPUCompiler.FunctionSpec{typeof(kernel),Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:337
 [3] _cufunction at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:304 [inlined]
 [4] check_cache(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(kernel),Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/GPUCompiler/pCBTA/src/cache.jl:24
 [5] kernel at /media/storage/julia/test_suite/equations/13_plasma_ensemble/test_gpu_tuple_of_struct.jl:26 [inlined]
 [6] cached_compilation(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(kernel),Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/GPUCompiler/pCBTA/src/cache.jl:0
 [7] cached_compilation at /home/fedoroff/.julia/packages/GPUCompiler/pCBTA/src/cache.jl:40 [inlined]
 [8] cufunction(::typeof(kernel), ::Type{Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:298
 [9] cufunction(::typeof(kernel), ::Type{Tuple{CUDA.CuDeviceArray{Float32,1,CUDA.AS.Global},NTuple{511,Foo{Float32}}}}) at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:293
 [10] macro expansion at /home/fedoroff/.julia/packages/CUDA/7vLVC/src/compiler/execution.jl:109 [inlined]
 [11] func(::CUDA.CuArray{Float32,1}, ::NTuple{511,Foo{Float32}}) at /media/storage/julia/test_suite/equations/13_plasma_ensemble/test_gpu_tuple_of_struct.jl:20
 [12] top-level scope at /media/storage/julia/test_suite/equations/13_plasma_ensemble/test_gpu_tuple_of_struct.jl:46
 [13] include(::Function, ::Module, ::String) at ./Base.jl:380
 [14] include(::Module, ::String) at ./Base.jl:368
 [15] exec_options(::Base.JLOptions) at ./client.jl:296
 [16] _start() at ./client.jl:506

Can you please explain to me what is wrong?

Plain tuples of large enough length can cause the same error:

import CUDA


"""
    func(u, foos)

Launch `kernel(u, foos)` on the GPU with an occupancy-derived configuration.
"""
function func(u, foos)
    n = length(u)

    # Called by `@cuda` with the compiled kernel; derive threads/blocks
    # from the occupancy API so `n` elements are covered.
    function launch_params(compiled)
        cfg = CUDA.launch_configuration(compiled.fun)
        return (threads = cfg.threads, blocks = cld(n, cfg.threads))
    end

    CUDA.@cuda config=launch_params kernel(u, foos)
    return nothing
end


"""
    kernel(u, foos)

GPU kernel: copy `foos[i]` into `u[i]` for every index, using a
grid-stride loop so any grid size covers the array.
"""
function kernel(u, foos)
    start = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
    step = CUDA.gridDim().x * CUDA.blockDim().x
    i = start
    while i <= length(u)
        u[i] = foos[i]
        i += step
    end
    return nothing
end


# ------------------------------------------------------------------------------
N = 1020   # fine
# N = 1021   # causes an error

# Build the NTuple{N,Float32} of ones directly; avoids splatting a large
# array into varargs (`tuple(ones(Float32, N)...)`), which is slow to
# compile for large N. The resulting tuple is identical.
foos = ntuple(_ -> 1f0, N)

u = CUDA.zeros(N)

func(u, foos)

This time the error appears for N greater than or equal to 1021.

There is a hardware limit on the total size of the arguments passed to a kernel: since tuples are isbits values, they are passed by value in the kernel parameter space, and at this time that space is limited to 4 KB.

source: Programming Guide :: CUDA Toolkit Documentation

Thank you. It is good to know.