Help debugging runtime error 1 (ERROR_INVALID_VALUE)

I wrote an im2col kernel for a custom convolution-like operation on the GPU with CUDA.jl.
After testing it for correctness (comparing against the NNlib CPU implementation), I am now trying to use it in my program and I am running into CUDA error code 1 (ERROR_INVALID_VALUE).
As far as I could gather from forums and the CUDA manual, this is supposed to indicate an invalid launch parameter (threads/blocks). Running with a single thread for testing seemed to “cure” the issue, so I suspected excessive register usage (255 registers per thread) combined with the large inputs (and hence many threads). I couldn’t find a way to reduce register usage, so I tried the maxregs @cuda option, but the problem persists.

Thank you for any advice; the code is below:

using Flux
function Flux.NNlib.im2col!(col::CUDA.AbstractGPUArray{T,2}, x::CUDA.AbstractGPUArray{T,4}, cdims::ConvDims) where {T}
    if Flux.NNlib.spatial_dims(cdims) != 3
        throw(DimensionMismatch("im2col!() only accepts 3d convolutional inputs"))
    end
    # Reshape col for easy access.
    col_reshaped = reshape(col, (Flux.NNlib.output_size(cdims)..., Flux.NNlib.kernel_size(cdims)..., Flux.NNlib.channels_in(cdims)))
    xsz = prod(Flux.NNlib.input_size(cdims))
    k = prod(Flux.NNlib.kernel_size(cdims))
    cin = prod(Flux.NNlib.channels_in(cdims))

    input_size = Flux.NNlib.input_size(cdims) .|> Int32
    C_in = Flux.NNlib.channels_in(cdims) |> Int32
    kernel_size = Flux.NNlib.kernel_size(cdims) .|> Int32
    p1l, p1h, p2l, p2h, p3l, p3h = Flux.NNlib.padding(cdims) .|> Int32
    pad_lo = (p1l, p2l, p3l) # low paddings
    pad_hi = (p1h, p2h, p3h) # high paddings
    stride = Flux.NNlib.stride(cdims) .|> Int32
    dilation = Flux.NNlib.dilation(cdims) .|> Int32
    flipkernel = Flux.NNlib.flipkernel(cdims)
    output_size = Flux.NNlib.output_size(cdims) .|> Int32

    @boundscheck @assert size(col) == Flux.NNlib.im2col_dims(cdims)[1:2]
    @boundscheck @assert size(x) == (input_size..., C_in)
    kernel = @cuda launch=false maxregs=30 always_inline=true im2col_kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size)
    # config = launch_configuration(kernel.fun; max_threads=256)
    config = launch_configuration(kernel.fun)
    available_threads = config.threads
    tc = min(cin, config.threads)
    tk = 1 # min(k, fld(available_threads, tc)) # set to 1 to avoid branching
    tx = min(cld(xsz, prod(stride)), fld(available_threads, tc*tk))
    threads = (tx, tk, tc)
    n_blocks = ceil.(Int, (xsz, 1, cin) ./ threads)
    kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size; threads=threads, blocks=n_blocks)
end

# Helper function for flipkernel-induced dyslexia
@inline kernel_index(idx, kernel_size, flipkernel) = flipkernel ? idx : kernel_size - idx + Int32(1)
# amount of input (last few items on end of array dimension)
# that is not "touched" during convolution
@inline function wasted(I, K, Pl, Ph, S, D)
    assume(S > Int32(0))
    (I + Pl + Ph - (K - Int32(1))*D - Int32(1))%S
end
@inline effective_input_size(I, K, Pl, Ph, S, D) = I + Pl + Ph - wasted(I, K, Pl, Ph, S, D)
# # A helper function to project from input (input_w, input_h) to output (w, h)
@inline function invproject(idx, S)
    assume(S > Int32(0))
    fld(idx - Int32(1), S) + Int32(1)
end
@inline int32range(endidx, startindex=Int32(1), stride=Int32(1)) = startindex:stride:endidx

function im2col_kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size)
    tidx = (
        x = (blockIdx().x-Int32(1)) * blockDim().x + threadIdx().x,
        y = (blockIdx().y-Int32(1)) * blockDim().y + threadIdx().y,
        z = (blockIdx().z-Int32(1)) * blockDim().z + threadIdx().z,
    )
    cuda_strides = (
        x = blockDim().x * gridDim().x,
        y = blockDim().y * gridDim().y,
        z = blockDim().z * gridDim().z,
    )

    # "wasted" input points do not participate in the convolution so we don't iterate on them
    # effective_input_size takes that into account
    # each thread takes care of input_size/block_size points
    assume.(kernel_size .>= Int32(1))
    cart_ind_k = CartesianIndices(int32range.(kernel_size))
    eff_input_size = effective_input_size.(input_size, kernel_size, pad_lo, pad_hi, stride, dilation)
    assume.(eff_input_size .>= Int32(1))

    @inbounds for lin_idx_k in tidx.y:cuda_strides.y:length(cart_ind_k)
        k = cart_ind_k[lin_idx_k]
        kidxs = kernel_index.(Int32.(Tuple(k)), kernel_size, flipkernel)
        dilated_kidxs = (kidxs .- Int32(1)) .* dilation .+ Int32(1) # take account of dilation

        cart_ind = CartesianIndices(int32range.(eff_input_size .- ((kernel_size .- Int32(1)).*dilation .+ Int32(1) .- dilated_kidxs), dilated_kidxs, stride))
        @inbounds for lin_idx in tidx.x:cuda_strides.x:length(cart_ind)
            idx = cart_ind[lin_idx]
            unpadded_idx = idx - CartesianIndex(pad_lo)
            out_of_bounds = eltype(x)(
                any(Int32.(Tuple(idx)) .<= pad_lo) || # inside lower pad
                any(Int32.(Tuple(idx)) .>  input_size .+ pad_lo) # inside higher pad
            )
            inbounds = (eltype(x)(1)-out_of_bounds)
            out_idx = invproject.(Int32.(Tuple(idx)) .- (dilated_kidxs .- Int32(1)), stride)
            for c in tidx.z:cuda_strides.z:C_in
                # for the input and kernel locations, calc the output location
                @inbounds col_reshaped[out_idx..., kidxs..., c] = x[unpadded_idx, c] * inbounds
            end
        end
    end
    return nothing
end

stacktrace:

ERROR: LoadError: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/libcuda.jl:27
  [2] check
    @ ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/libcuda.jl:34 [inlined]
  [3] cuLaunchKernel
    @ ~/.julia/packages/CUDA/YIj5X/lib/utils/call.jl:26 [inlined]
  [4] (::CUDA.var"#891#892"{Bool, Int64, CuStream, CuFunction, CuDim3, CuDim3})(kernelParams::Vector{Ptr{Nothing}})
    @ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:69
  [5] macro expansion
    @ ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:33 [inlined]
  [6] macro expansion
    @ ./none:0 [inlined]
  [7] pack_arguments(::CUDA.var"#891#892"{Bool, Int64, CuStream, CuFunction, CuDim3, CuDim3}, ::CUDA.KernelState, ::CuDeviceArray{Float32, 7, 1}, ::CuDeviceArray{Float32, 4, 1}, ::Tuple{Int32, Int32, Int32}, ::Int32, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Bool, ::Tuple{Int32, Int32, Int32})
    @ CUDA ./none:0
  [8] #launch#890
    @ ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:62 [inlined]
  [9] #896
    @ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:136 [inlined]
 [10] macro expansion
    @ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:95 [inlined]
 [11] macro expansion
    @ CUDA ./none:0 [inlined]
 [12] convert_arguments
    @ CUDA ./none:0 [inlined]
 [13] #cudacall#895
    @ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:135 [inlined]
 [14] cudacall
    @ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:134 [inlined]
 [15] macro expansion
    @ CUDA ~/.julia/packages/CUDA/YIj5X/src/compiler/execution.jl:281 [inlined]
 [16] macro expansion
    @ CUDA ./none:0 [inlined]
 [17] call(::CUDA.HostKernel{typeof(OpeNet.im2col_kernel), Tuple{CuDeviceArray{Float32, 7, 1}, CuDeviceArray{Float32, 4, 1}, Tuple{Int32, Int32, Int32}, Int32, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Bool, Tuple{Int32, Int32, Int32}}}, ::CuDeviceArray{Float32, 7, 1}, ::CuDeviceArray{Float32, 4, 1}, ::Tuple{Int32, Int32, Int32}, ::Int32, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Bool, ::Tuple{Int32, Int32, Int32}; call_kwargs::@Kwargs{threads::Tuple{Int64, Int64, Int64}, blocks::Tuple{Int64, Int64, Int64}})
    @ CUDA ./none:0
 [18] (::CUDA.HostKernel{typeof(OpeNet.im2col_kernel), Tuple{CuDeviceArray{Float32, 7, 1}, CuDeviceArray{Float32, 4, 1}, Tuple{Int32, Int32, Int32}, Int32, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Bool, Tuple{Int32, Int32, Int32}}})(::CuArray{Float32, 7, CUDA.Mem.DeviceBuffer}, ::Vararg{Any}; threads::Tuple{Int64, Int64, Int64}, blocks::Tuple{Int64, Int64, Int64}, kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/YIj5X/src/compiler/execution.jl:404
 [19] HostKernel
    @ ~/.julia/packages/CUDA/YIj5X/src/compiler/execution.jl:403 [inlined]
 [20] im2col!(col::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, x::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, cdims::DenseConvDims{3, 3, 3, 6, 3})

Excessive register use lowers the number of threads you can launch in a single block, but you shouldn’t ever have to use maxregs to overcome a launch error. Instead, if you use the occupancy API (which you already are), the block size will be adapted automatically to stay within device limits.
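For reference, the usual occupancy-driven launch pattern in CUDA.jl looks roughly like this (a sketch with a throwaway 1D kernel, not your exact code):

using CUDA

function demo_kernel(y)
    i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    i <= length(y) && (@inbounds y[i] = i)
    return nothing
end

y = CUDA.zeros(Int32, 10_000)
kernel = @cuda launch=false demo_kernel(y)
config = launch_configuration(kernel.fun)  # takes register/shared-memory usage into account
threads = min(length(y), config.threads)
blocks = cld(length(y), threads)
kernel(y; threads, blocks)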

Can you verify that your block size computation keeps tx * tk * tc from exceeding available_threads?
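Something like this right before the launch would print the relevant numbers (a sketch reusing the variable names from your snippet):

@info "kernel launch config" threads blocks=n_blocks available_threads total_used_threads=prod(threads)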

Here is the result:

┌ Info: kernel launch config
│   threads = (10, 1, 100)
│   blocks = (103, 1, 1)
│   available_threads = 1024
└   total_used_threads = 1000

Also, here is the system I’m running on:

julia> versioninfo()
Julia Version 1.10.0
Commit c67ed11612* (2023-12-26 21:52 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 4 × Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, cascadelake)
  Threads: 1 on 4 virtual cores
Environment:
  LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64

julia> CUDA.versioninfo()
CUDA runtime 12.3, artifact installation
CUDA driver 12.3
NVIDIA driver 535.104.12, originally for CUDA 12.2

CUDA libraries: 
- CUBLAS: 12.3.4
- CURAND: 10.3.4
- CUFFT: 11.0.12
- CUSOLVER: 11.5.4
- CUSPARSE: 12.2.0
- CUPTI: 21.0.0
- NVML: 12.0.0+535.104.12

Julia packages: 
- CUDA: 5.1.1
- CUDA_Driver_jll: 0.7.0+0
- CUDA_Runtime_jll: 0.10.1+0

Toolchain:
- Julia: 1.10.0
- LLVM: 15.0.7

Preferences:
- CUDA_Runtime_jll.local: false

1 device:
  0: Tesla T4 (sm_75, 14.745 GiB / 15.000 GiB available)

That looks OK. Registers are probably not the issue (you can inspect the amount of registers used with CUDA.registers(kernel)).
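For example (a sketch reusing the compiled kernel object from your im2col!):

kernel = @cuda launch=false im2col_kernel(col_reshaped, x, input_size, C_in, kernel_size,
                                          pad_lo, pad_hi, stride, dilation, flipkernel, output_size)
CUDA.registers(kernel)  # per-thread register count of the compiled kernel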

Can you create a MWE, ideally without Flux (i.e. invoking your im2col! function directly)?

It seems to reproduce when I use large enough inputs.
Here is an MWE. I left Flux in for some utilities, but I call my function directly to make sure no Flux code is invoked unintentionally:

using Flux, CUDA
function my_im2col!(col::CUDA.AbstractGPUArray{T,2}, x::CUDA.AbstractGPUArray{T,4}, cdims::ConvDims) where {T}
    # Reshape col for easy access.
    col_reshaped = reshape(col, (Flux.NNlib.output_size(cdims)..., Flux.NNlib.kernel_size(cdims)..., Flux.NNlib.channels_in(cdims)))
    xsz = prod(Flux.NNlib.input_size(cdims))
    k = prod(Flux.NNlib.kernel_size(cdims))
    cin = prod(Flux.NNlib.channels_in(cdims))

    input_size = Flux.NNlib.input_size(cdims) .|> Int32
    C_in = Flux.NNlib.channels_in(cdims) |> Int32
    kernel_size = Flux.NNlib.kernel_size(cdims) .|> Int32
    p1l, p1h, p2l, p2h, p3l, p3h = Flux.NNlib.padding(cdims) .|> Int32
    pad_lo = (p1l, p2l, p3l) # low paddings
    pad_hi = (p1h, p2h, p3h) # high paddings
    stride = Flux.NNlib.stride(cdims) .|> Int32
    dilation = Flux.NNlib.dilation(cdims) .|> Int32
    flipkernel = Flux.NNlib.flipkernel(cdims)
    output_size = Flux.NNlib.output_size(cdims) .|> Int32

    @boundscheck @assert size(col) == Flux.NNlib.im2col_dims(cdims)[1:2]
    @boundscheck @assert size(x) == (input_size..., C_in)
    kernel = @cuda launch=false maxregs=30 always_inline=true im2col_kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size)
    # config = launch_configuration(kernel.fun; max_threads=256)
    config = launch_configuration(kernel.fun)
    available_threads = config.threads
    tc = min(cin, config.threads)
    tk = 1 # min(k, fld(available_threads, tc)) # set to 1 to avoid branching
    tx = min(cld(xsz, prod(stride)), fld(available_threads, tc*tk))
    threads = (tx, tk, tc)
    n_blocks = ceil.(Int, (xsz, 1, cin) ./ threads)
    kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size; threads=threads, blocks=n_blocks)
end

# Helper function for flipkernel-induced dyslexia
@inline kernel_index(idx, kernel_size, flipkernel) = flipkernel ? idx : kernel_size - idx + Int32(1)
# amount of input (last few items on end of array dimension)
# that is not "touched" during convolution
@inline function wasted(I, K, Pl, Ph, S, D)
    (I + Pl + Ph - (K - Int32(1))*D - Int32(1))%S
end
@inline effective_input_size(I, K, Pl, Ph, S, D) = I + Pl + Ph - wasted(I, K, Pl, Ph, S, D)
# # A helper function to project from input (input_w, input_h) to output (w, h)
@inline function invproject(idx, S)
    fld(idx - Int32(1), S) + Int32(1)
end
@inline int32range(endidx, startindex=Int32(1), stride=Int32(1)) = startindex:stride:endidx

function im2col_kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size)
    tidx = (
        x = (blockIdx().x-Int32(1)) * blockDim().x + threadIdx().x,
        y = (blockIdx().y-Int32(1)) * blockDim().y + threadIdx().y,
        z = (blockIdx().z-Int32(1)) * blockDim().z + threadIdx().z,
    )
    cuda_strides = (
        x = blockDim().x * gridDim().x,
        y = blockDim().y * gridDim().y,
        z = blockDim().z * gridDim().z,
    )

    # "wasted" input points do not participate in the convolution so we don't iterate on them
    # effective_input_size takes that into account
    # each thread takes care of input_size/block_size points
    cart_ind_k = CartesianIndices(int32range.(kernel_size))
    eff_input_size = effective_input_size.(input_size, kernel_size, pad_lo, pad_hi, stride, dilation)

    @inbounds for lin_idx_k in tidx.y:cuda_strides.y:length(cart_ind_k)
        k = cart_ind_k[lin_idx_k]
        kidxs = kernel_index.(Int32.(Tuple(k)), kernel_size, flipkernel)
        dilated_kidxs = (kidxs .- Int32(1)) .* dilation .+ Int32(1) # take account of dilation

        cart_ind = CartesianIndices(int32range.(eff_input_size .- ((kernel_size .- Int32(1)).*dilation .+ Int32(1) .- dilated_kidxs), dilated_kidxs, stride))
        @inbounds for lin_idx in tidx.x:cuda_strides.x:length(cart_ind)
            idx = cart_ind[lin_idx]
            unpadded_idx = idx - CartesianIndex(pad_lo)
            out_of_bounds = eltype(x)(
                any(Int32.(Tuple(idx)) .<= pad_lo) || # inside lower pad
                any(Int32.(Tuple(idx)) .>  input_size .+ pad_lo) # inside higher pad
            )
            inbounds = (eltype(x)(1)-out_of_bounds)
            out_idx = invproject.(Int32.(Tuple(idx)) .- (dilated_kidxs .- Int32(1)), stride)
            for c in tidx.z:cuda_strides.z:C_in
                # for the input and kernel locations, calc the output location
                @inbounds col_reshaped[out_idx..., kidxs..., c] = x[unpadded_idx, c] * inbounds
            end
        end
    end
    return nothing
end

x = rand32(600, 400, 1, 100, 50) |> cu; w = rand32(3,3,1,100,1) |> cu; cdims = Flux.NNlib.DenseConvDims(x,w);
col = similar(x, Flux.NNlib.im2col_dims(cdims)...);
my_im2col!(view(col, :,:,1), x[:,:,:,:,1], cdims)
The launch ends up with threads = (2, 1, 100), which exceeds the device’s block z-dimension limit:

julia> attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)
64

Interesting, so I need to query this attribute in addition to config.threads and limit it accordingly?

It’s a compute-capability-dependent device limit, not something the occupancy API returns, so you could realistically hard-code it. See the second table in the CUDA C++ Programming Guide; 64 is the z-dimension thread limit on all current CUDA devices.
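A sketch of how your thread-count computation could clamp the z dimension (same variable names as your code; you could also just hard-code 64):

max_z = attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)  # 64 on current devices
tc = min(cin, config.threads, max_z)
tk = 1
tx = min(cld(xsz, prod(stride)), fld(config.threads, tc * tk))
threads = (tx, tk, tc)
n_blocks = cld.((xsz, 1, cin), threads)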

I’ll add some code to CUDA.jl to make this easier to debug:

julia> main()
ERROR: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
Number of threads in z-dimension exceeds device limit (100 > 64).