Hey GPU Gang,
I have a kernel I’m trying to execute with CUDAnative. I’ve tested it with Nvidia Visual Profiler and it uses 46 registers/thread, and I can achieve the theoretical max of 1024 threads/block when I run it through the profiler, or powershell.
Very strangely, if I use Atom/Juno to run the same file that the profiler runs, then I get an out of resources error. I’m not sure if this constitutes a bug with either Juno or CUDAnative, or if it’s an environment error, or if this is just what I deserve for writing this on a windows machine
I can modify my workflow to get around it, but I would be grateful if anyone had some insight to share as to why this is happening. Since it may be relevant, here’s my package manager status, the MWE and output.
(@v1.4) pkg> status
Status `C:\Users\Alex\.julia\environments\v1.4\Project.toml`
[537997a7] AbstractPlotting v0.9.27
[c52e3926] Atom v0.12.11
[6e4b80f9] BenchmarkTools v0.5.0
[c5f51814] CUDAdrv v6.3.0
[be33ccc6] CUDAnative v3.1.0
[3a865a2d] CuArrays v2.2.0
[7a1cc6ca] FFTW v1.2.0
[e9467ef8] GLMakie v0.0.18
[e5e0dc1b] Juno v0.8.2
[093fc24a] LightGraphs v1.3.1
[ee78f7c6] Makie v0.9.6
[f27b6e38] Polynomials v0.7.0
[27ebfcd6] Primes v0.4.0
[6038ab10] Rotations v0.13.0
[90137ffa] StaticArrays v0.12.1
[1986cc42] Unitful v1.1.0
[b77e0a4c] InteractiveUtils
(@v1.4) pkg>
MWE
function E_1_kernel(E_1, H_2, H_3, ε, σ_E, Δx_2, Δx_3, Δt, x0, y0, z0, X2, X3)
x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + x0
y = (blockIdx().y - 1) * blockDim().y + threadIdx().y + y0
z = (blockIdx().z - 1) * blockDim().z + threadIdx().z + z0
# c1 & c2 describe effects of lossy medium
@inbounds c1 = (2 * ε[x, y, z] - σ_E[x, y, z] * Δt) /
(2 * ε[x, y, z] + σ_E[x, y, z] * Δt)
@inbounds c2 = 2 * Δt / (2 * ε[x, y, z] + σ_E[x, y, z] * Δt)
@inbounds Δ = (H_3[x, y, z] - H_3[x, mod(y - 2, X2) + 1, z]) / Δx_2 -
(H_2[x, y, z] - H_2[x, y, mod(z - 2, X3) + 1]) / Δx_3
@inbounds E_1[x, y, z] = E_1[x, y, z] * c1 + Δ * c2
return nothing
end
T = Float32
actual_size = (64, 64, 64)
Ex, Hy, Hz, σ_e = map(x->CuArray(zeros(T, actual_size)), 1:4)
ε = CuArray(ones(T, actual_size))
Δx = Δy = Δz = 1.0
Δt = 0.0001
thread_combos = [(8, 8, 8), (32, 32, 1), (16, 16, 4)]
for my_threads in thread_combos
println("trying $my_threads")
@cuda blocks=(1, 1, 1) threads=my_threads E_1_kernel(Ex, Hy, Hz, ε, σ_e, Δy, Δz, Δt, 0, 0, 0, 64, 64)
synchronize()
println("success with $my_threads")
end
which gives output:
success with (8, 8, 8)
trying (32, 32, 1)
ERROR: LoadError: CUDA error: too many resources requested for launch (code 701, ERROR_LAUNCH_OUT_OF_RESOURCES)
Stacktrace:
[1] throw_api_error(::CUDAdrv.cudaError_enum) at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\error.jl:105
[2] macro expansion at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\error.jl:112 [inlined]
[3] cuLaunchKernel(::CuFunction, ::UInt32, ::UInt32, ::UInt32, ::UInt32, ::UInt32, ::UInt32, ::Int64, ::CuStream, ::Array{Ptr{Nothing},1}, ::Ptr{Nothing}) at C:\Users\Alex\.julia\packages\CUDAapi\XuSHC\src\call.jl:93
[4] (::CUDAdrv.var"#658#659"{Bool,Int64,CuStream,CuFunction})(::Array{Ptr{Nothing},1}) at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:67
[5] macro expansion at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:33 [inlined]
[6] pack_arguments(::CUDAdrv.var"#658#659"{Bool,Int64,CuStream,CuFunction}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::Float64, ::Float64, ::Float64, ::Int64, ::Int64, ::Int64, ::Int64, ::Int64) at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:10
[7] launch(::CuFunction, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::Vararg{Any,N} where N; blocks::Tuple{Int64,Int64,Int64}, threads::Tuple{Int64,Int64,Int64}, cooperative::Bool, shmem::Int64, stream::CuStream) at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:60
[8] #663 at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:136 [inlined]
[9] macro expansion at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:95 [inlined]
[10] convert_arguments at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:78 [inlined]
[11] #cudacall#662 at C:\Users\Alex\.julia\packages\CUDAdrv\Uc14X\src\execution.jl:135 [inlined]
[12] #cudacall#134 at C:\Users\Alex\.julia\packages\CUDAnative\e0IdN\src\execution.jl:217 [inlined]
[13] macro expansion at C:\Users\Alex\.julia\packages\CUDAnative\e0IdN\src\execution.jl:198 [inlined]
[14] call(::CUDAnative.HostKernel{E_1_kernel,Tuple{CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64}}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::Float64, ::Float64, ::Float64, ::Int64, ::Int64, ::Int64, ::Int64, ::Int64; call_kwargs::Base.Iterators.Pairs{Symbol,Tuple{Int64,Int64,Int64},Tuple{Symbol,Symbol},NamedTuple{(:blocks, :threads),Tuple{Tuple{Int64,Int64,Int64},Tuple{Int64,Int64,Int64}}}}) at C:\Users\Alex\.julia\packages\CUDAnative\e0IdN\src\execution.jl:170
[15] (::CUDAnative.HostKernel{E_1_kernel,Tuple{CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64}})(::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::Vararg{Any,N} where N; kwargs::Base.Iterators.Pairs{Symbol,Tuple{Int64,Int64,Int64},Tuple{Symbol,Symbol},NamedTuple{(:blocks, :threads),Tuple{Tuple{Int64,Int64,Int64},Tuple{Int64,Int64,Int64}}}}) at C:\Users\Alex\.julia\packages\CUDAnative\e0IdN\src\execution.jl:345
[16] macro expansion at C:\Users\Alex\.julia\packages\CUDAnative\e0IdN\src\execution.jl:109 [inlined]
[17] top-level scope at C:\Users\Alex\Documents\em\beep\MWE.jl:25
And, just for sanity, here is profiler output showing that it worked just fine.
Profiler Output