I wrote an implementation of im2col kernel to use with a custom convolution-like operation on GPU with CUDA.jl.
After testing it for correctness (comparing with the NNlib cpu implementation), I am trying to use it in my program and I am running into a CUDA Error Code 1 (ERROR_INVALID_VALUE).
As far as I could gather from forums and CUDA manual, it is supposed to indicate an invalid kernel parameter (threads/blocks). Running with 1 thread for testing seemed to “cure” the issue, so I was thinking it was the excessive register usage (255) when dealing with large inputs (so many threads…). I couldn’t find a way to reduce register usage so I tried the maxreg @cuda option, but the problem persists.
Thank you for any advice,
code below:
using Flux
function Flux.NNlib.im2col!(col::CUDA.AbstractGPUArray{T,2}, x::CUDA.AbstractGPUArray{T,4}, cdims::ConvDims) where {T}
if Flux.NNlib.spatial_dims(cdims) != 3
throw(DimensionMismatch("im2col!() only accepts 3d convoluitional inputs"))
end
# Reshape col for easy access.
col_reshaped = reshape(col, (Flux.NNlib.output_size(cdims)..., Flux.NNlib.kernel_size(cdims)..., Flux.NNlib.channels_in(cdims)))
xsz = prod(Flux.NNlib.input_size(cdims))
k = prod(Flux.NNlib.kernel_size(cdims))
cin = prod(Flux.NNlib.channels_in(cdims))
input_size = Flux.NNlib.input_size(cdims) .|> Int32
C_in = Flux.NNlib.channels_in(cdims) |> Int32
kernel_size = Flux.NNlib.kernel_size(cdims) .|> Int32
p1l, p1h, p2l, p2h, p3l, p3h = Flux.NNlib.padding(cdims) .|> Int32
pad_lo = (p1l, p2l, p3l) # low paddings
pad_hi = (p1h, p2h, p3h) # high paddings
stride = Flux.NNlib.stride(cdims) .|> Int32
dilation = Flux.NNlib.dilation(cdims) .|> Int32
flipkernel = Flux.NNlib.flipkernel(cdims)
output_size = Flux.NNlib.output_size(cdims) .|> Int32
@boundscheck @assert size(col) == Flux.NNlib.im2col_dims(cdims)[1:2]
@boundscheck @assert size(x) == (input_size..., C_in)
kernel = @cuda launch=false maxregs=30 always_inline=true im2col_kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size)
# config = launch_configuration(kernel.fun; max_threads=256)
config = launch_configuration(kernel.fun)
available_threads = config.threads
tc = min(cin, config.threads)
tk = 1 # min(k, fld(available_threads, tc)) # set to 1 to avoid branching
tx = min(cld(xsz, prod(stride)), fld(available_threads, tc*tk))
threads = (tx, tk ,tc)
n_blocks = ceil.(Int, (xsz, 1, cin) ./ threads)
kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size; threads=threads, blocks=n_blocks)
end
# Helper function for flipkernel-induced dyslexia
@inline kernel_index(idx, kernel_size, flipkernel) = flipkernel ? idx : kernel_size - idx + Int32(1)
# amount of input (last few items on end of array dimension)
# that is not "touched" during convolution
@inline function wasted(I, K, Pl, Ph, S, D)
assume(S > Int32(0))
(I + Pl + Ph - (K - Int32(1))*D - Int32(1))%S
end
@inline effective_input_size(I, K, Pl, Ph, S, D)= I + Pl + Ph - wasted(I, K, Pl, Ph, S, D)
# # A helper function to project from input (input_w, input_h) to output (w, h)
@inline function invproject(idx, S)
assume(S > Int32(0))
fld(idx - Int32(1), S) + Int32(1)
end
@inline int32range(endidx, startindex=Int32(1), stride=Int32(1)) = startindex:stride:endidx
function im2col_kernel(col_reshaped, x, input_size, C_in, kernel_size, pad_lo, pad_hi, stride, dilation, flipkernel, output_size)
tidx = (
x = (blockIdx().x-Int32(1)) * blockDim().x + threadIdx().x,
y = (blockIdx().y-Int32(1)) * blockDim().y + threadIdx().y,
z = (blockIdx().z-Int32(1)) * blockDim().z + threadIdx().z,
)
cuda_strides = (
x = blockDim().x * gridDim().x,
y = blockDim().y * gridDim().y,
z = blockDim().z * gridDim().z,
)
# "wasted" input points do not participate in the convolution so we don't iterate on them
# effective_input_size takes that into account
# each thread takes care of input_size/block_size points
assume.(kernel_size .>= Int32(1))
cart_ind_k = CartesianIndices(int32range.(kernel_size))
eff_input_size = effective_input_size.(input_size, kernel_size, pad_lo, pad_hi, stride, dilation)
assume.(eff_input_size .>= Int32(1))
@inbounds for lin_idx_k in tidx.y:cuda_strides.y:length(cart_ind_k)
k = cart_ind_k[lin_idx_k]
kidxs = kernel_index.(Int32.(Tuple(k)), kernel_size, flipkernel)
dilated_kidxs = (kidxs .- Int32(1)) .* dilation .+ Int32(1) # take account of dilation
cart_ind = CartesianIndices(int32range.(eff_input_size .- ((kernel_size .- Int32(1)).*dilation .+ Int32(1) .- dilated_kidxs), dilated_kidxs, stride))
@inbounds for lin_idx in tidx.x:cuda_strides.x:length(cart_ind)
idx = cart_ind[lin_idx]
unpadded_idx = idx - CartesianIndex(pad_lo)
out_of_bounds = eltype(x)(
any(Int32.(Tuple(idx)) .<= pad_lo) || # inside lower pad
any(Int32.(Tuple(idx)) .> input_size .+ pad_lo) # inside higher pad
)
inbounds = (eltype(x)(1)-out_of_bounds)
out_idx = invproject.(Int32.(Tuple(idx)) .- (dilated_kidxs .- Int32(1)), stride)
for c in tidx.z:cuda_strides.z:C_in
# for the input and kernel locations, calc the output location
@inbounds col_reshaped[out_idx..., kidxs..., c] = x[unpadded_idx, c] * inbounds
end
end
end
return nothing
end
stacktrace:
ERROR: LoadError: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/libcuda.jl:27
[2] check
@ ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/libcuda.jl:34 [inlined]
[3] cuLaunchKernel
@ ~/.julia/packages/CUDA/YIj5X/lib/utils/call.jl:26 [inlined]
[4] (::CUDA.var"#891#892"{Bool, Int64, CuStream, CuFunction, CuDim3, CuDim3})(kernelParams::Vector{Ptr{Nothing}})
@ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:69
[5] macro expansion
@ ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:33 [inlined]
[6] macro expansion
@ ./none:0 [inlined]
[7] pack_arguments(::CUDA.var"#891#892"{Bool, Int64, CuStream, CuFunction, CuDim3, CuDim3}, ::CUDA.KernelState, ::CuDeviceArray{Float32, 7, 1}, ::CuDeviceArray{Float32, 4, 1}, ::Tuple{Int32, Int32, Int32}, ::Int32, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Bool, ::Tuple{Int32, Int32, Int32})
@ CUDA ./none:0
[8] #launch#890
@ ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:62 [inlined]
[9] #896
@ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:136 [inlined]
[10] macro expansion
@ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:95 [inlined]
[11] macro expansion
@ CUDA ./none:0 [inlined]
[12] convert_arguments
@ CUDA ./none:0 [inlined]
[13] #cudacall#895
@ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:135 [inlined]
[14] cudacall
@ CUDA ~/.julia/packages/CUDA/YIj5X/lib/cudadrv/execution.jl:134 [inlined]
[15] macro expansion
@ CUDA ~/.julia/packages/CUDA/YIj5X/src/compiler/execution.jl:281 [inlined]
[16] macro expansion
@ CUDA ./none:0 [inlined]
[17] call(::CUDA.HostKernel{typeof(OpeNet.im2col_kernel), Tuple{CuDeviceArray{Float32, 7, 1}, CuDeviceArray{Float32, 4, 1}, Tuple{Int32, Int32, Int32}, Int32, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Bool, Tuple{Int32, Int32, Int32}}}, ::CuDeviceArray{Float32, 7, 1}, ::CuDeviceArray{Float32, 4, 1}, ::Tuple{Int32, Int32, Int32}, ::Int32, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Tuple{Int32, Int32, Int32}, ::Bool, ::Tuple{Int32, Int32, Int32}; call_kwargs::@Kwargs{threads::Tuple{Int64, Int64, Int64}, blocks::Tuple{Int64, Int64, Int64}})
@ CUDA ./none:0
[18] (::CUDA.HostKernel{typeof(OpeNet.im2col_kernel), Tuple{CuDeviceArray{Float32, 7, 1}, CuDeviceArray{Float32, 4, 1}, Tuple{Int32, Int32, Int32}, Int32, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Tuple{Int32, Int32, Int32}, Bool, Tuple{Int32, Int32, Int32}}})(::CuArray{Float32, 7, CUDA.Mem.DeviceBuffer}, ::Vararg{Any}; threads::Tuple{Int64, Int64, Int64}, blocks::Tuple{Int64, Int64, Int64}, kwargs::@Kwargs{})
@ CUDA ~/.julia/packages/CUDA/YIj5X/src/compiler/execution.jl:404
[19] HostKernel
@ ~/.julia/packages/CUDA/YIj5X/src/compiler/execution.jl:403 [inlined]
[20] im2col!(col::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, x::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, cdims::DenseConvDims{3, 3, 3, 6, 3})