Hi! I am working on using CUDA.jl to write kernels for 1D, 2D, and 3D arrays. The kernel worked well for 1D and 2D arrays and here is the 2D code.
using CUDA
function gpu_add1!(y, x)
index_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
index_j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
stride_i = gridDim().x * blockDim().x
stride_j = gridDim().y * blockDim().y
for i ∈ index_i:stride_i:size(y, 1)
for j ∈ index_j:stride_j:size(y, 2)
@inbounds y[i, j] += x[i, j]
end
end
return nothing
end
N = 2^8
x = CUDA.ones(N, N)
y = CUDA.zeros(N, N)
@cuda threads = (16, 16) blocks = (4, 4) gpu_add1!(y, x)
The code above can run successfully. But when I tried to write the similar code for computing 3D array, the error occurred.
Here is the code for 3D array.
using CUDA
function gpu_add2!(y, x)
index_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
index_j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
index_k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
stride_i = gridDim().x * blockDim().x
stride_j = gridDim().y * blockDim().y
stride_k = gridDim().z * blockDim().z
for i ∈ index_i:stride_i:size(y, 1)
for j ∈ index_j:stride_j:size(y, 2)
for k ∈ index_k:stride_k:size(y, 3)
@inbounds y[i, j, k] += x[i, j, k]
end
end
end
return nothing
end
N = 2^8
x = CUDA.ones(N, N, N)
y = CUDA.zeros(N, N, N)
@cuda threads = (16, 16, 16) blocks = (4, 4, 4) gpu_add2!(y, x)
Here is the error backtrace.
ERROR: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/libcuda.jl:27
[2] macro expansion
@ ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/libcuda.jl:35 [inlined]
[3] cuLaunchKernel
@ ~/.julia/packages/CUDA/p5OVK/lib/utils/call.jl:26 [inlined]
[4] (::CUDA.var"#35#36"{Bool, Int64, CuStream, CuFunction, CuDim3, CuDim3})(kernelParams::Vector{Ptr{Nothing}})
@ CUDA ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/execution.jl:69
[5] macro expansion
@ ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/execution.jl:33 [inlined]
[6] macro expansion
@ ./none:0 [inlined]
[7] pack_arguments(::CUDA.var"#35#36"{Bool, Int64, CuStream, CuFunction, CuDim3, CuDim3}, ::CUDA.KernelState, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1})
@ CUDA ./none:0
[8] #launch#34
@ ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/execution.jl:62 [inlined]
[9] #40
@ ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/execution.jl:136 [inlined]
[10] macro expansion
@ ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/execution.jl:95 [inlined]
[11] macro expansion
@ ./none:0 [inlined]
[12] convert_arguments
@ ./none:0 [inlined]
[13] #cudacall#39
@ ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/execution.jl:135 [inlined]
[14] cudacall
@ ~/.julia/packages/CUDA/p5OVK/lib/cudadrv/execution.jl:134 [inlined]
[15] macro expansion
@ ~/.julia/packages/CUDA/p5OVK/src/compiler/execution.jl:212 [inlined]
[16] macro expansion
@ ./none:0 [inlined]
[17] call(::CUDA.HostKernel{typeof(gpu_add2!), Tuple{CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}}}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}; call_kwargs::Base.Pairs{Symbol, Tuple{Int64, Int64, Int64}, Tuple{Symbol, Symbol}, NamedTuple{(:threads, :blocks), Tuple{Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}}}})
@ CUDA ./none:0
[18] (::CUDA.HostKernel{typeof(gpu_add2!), Tuple{CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}}})(::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, ::Vararg{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}; threads::Tuple{Int64, Int64, Int64}, blocks::Tuple{Int64, Int64, Int64}, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA ~/.julia/packages/CUDA/p5OVK/src/compiler/execution.jl:333
[19] top-level scope
@ ~/.julia/packages/CUDA/p5OVK/src/compiler/execution.jl:106
I felt so weird about it since they are quite similar, yet it worked for 2D but not for 3D. I also wrote another 3D kernel which is much simpler and more straightforward than this one.
using CUDA
function gpu_add!(y, x)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
y[i, j, k] += x[i, j, k]
return nothing
end
N = 32
x = CUDA.ones(N, N, N)
y = CUDA.zeros(N, N, N)
@cuda threads = (16, 16, 16) blocks = (2, 2, 2) gpu_add!(y, x)
But I got the same invalid argument error. Can anyone help me with this issue? I am not sure how to resolve it. Thanks!