Hi all,
I am learning CUDA.jl and trying to launch a bunch of tasks in parallel on the GPU:
using LinearAlgebra, Random, CUDA
function do_svd!(c, a::Any)
    c .= svdvals!(a)
end

function compute_gpu(outputs, inputs)
    @sync for i in eachindex(inputs)
        Threads.@spawn begin
            do_svd!(outputs[i], inputs[i])
            CUDA.synchronize()
        end
    end
    return outputs
end
n = 2000;
A = Matrix(rand(Float32, n, n))
iter = 20;
inputs_gpu = [deepcopy(CuArray(A)) for i in 1:iter];
outputs_gpu = [CUDA.zeros(minimum(size(first(inputs_gpu))), 1) for _ in 1:length(inputs_gpu)];
outputs_gpu = compute_gpu(outputs_gpu, inputs_gpu);
This works perfectly fine for iter=20, but fails at iter=50 with the following error (after which even the iter=20 version also stops working, so the whole GPU seems to be blocked):
ERROR: TaskFailedException
nested task error: CUSOLVERError: an internal operation failed (code 7, CUSOLVER_STATUS_INTERNAL_ERROR)
Stacktrace:
[1] throw_api_error(res::CUDA.CUSOLVER.cusolverStatus_t)
@ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\error.jl:46
[2] macro expansion
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\error.jl:59 [inlined]
[3] cusolverDnCreate()
@ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:10
[4] #2780
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:45 [inlined]
[5] (::CUDA.APIUtils.var"#8#11"{CUDA.CUSOLVER.var"#2780#2787", CUDA.APIUtils.HandleCache{CuContext, Ptr{Nothing}}, CuContext})()
@ CUDA.APIUtils \.julia\packages\CUDA\DfvRa\lib\utils\cache.jl:22
[6] lock(f::CUDA.APIUtils.var"#8#11"{CUDA.CUSOLVER.var"#2780#2787", CUDA.APIUtils.HandleCache{CuContext, Ptr{Nothing}}, CuContext}, l::ReentrantLock)
@ Base .\lock.jl:185
[7] check_cache
@ \.julia\packages\CUDA\DfvRa\lib\utils\cache.jl:20 [inlined]
[8] pop!
@ \.julia\packages\CUDA\DfvRa\lib\utils\cache.jl:41 [inlined]
[9] (::CUDA.CUSOLVER.var"#new_state#2786")(cuda::NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}})
@ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:44
[10] #2784
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:60 [inlined]
[11] get!(default::CUDA.CUSOLVER.var"#2784#2791"{CUDA.CUSOLVER.var"#new_state#2786", NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}}}, h::Dict{CuContext, NamedTuple{(:handle, :stream), Tuple{Ptr{Nothing}, CuStream}}}, key::CuContext)
@ Base .\dict.jl:481
[12] dense_handle()
@ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:59
[13] bufferSize
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:457 [inlined]
[14] get_size
@ \.julia\packages\CUDA\DfvRa\lib\utils\call.jl:58 [inlined]
[15] with_workspace(f::CUDA.CUSOLVER.var"#2626#2629"{Char, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Int32, 1, CUDA.Mem.DeviceBuffer}, Base.RefValue{Ptr{Nothing}}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Int64, Int64, Int64}, eltyp::Type{Float32}, size::CUDA.CUSOLVER.var"#bufferSize#2628"{Char, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Base.RefValue{Ptr{Nothing}}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Int64, Int64, Int64}, fallback::Nothing; keep::Bool)
@ CUDA.APIUtils \.julia\packages\CUDA\DfvRa\lib\utils\call.jl:61
[16] with_workspace (repeats 2 times)
@ \.julia\packages\CUDA\DfvRa\lib\utils\call.jl:56 [inlined]
[17] gesvdj!(jobz::Char, econ::Int64, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}; tol::Float32, max_sweeps::Int64)
@ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:463
[18] gesvdj!
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:424 [inlined]
[19] _svdvals!
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\linalg.jl:302 [inlined]
[20] #svdvals!#2773
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\linalg.jl:294 [inlined]
[21] svdvals!
@ \.julia\packages\CUDA\DfvRa\lib\cusolver\linalg.jl:294 [inlined]
[22] do_svd!(c::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, a::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
@ Main \OneDrive\Documents\CSE_MIT\personal_projects\Random_matrices project.jl:158
[23] macro expansion
@ project.jl:171 [inlined]
[24] (::var"#5#6"{Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Int64})()
@ Main .\threadingconstructs.jl:258
...and 3 more exceptions.
Stacktrace:
[1] sync_end(c::Channel{Any})
@ Base .\task.jl:436
[2] macro expansion
@ .\task.jl:455 [inlined]
[3] compute_gpu(outputs::Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, inputs::Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})
@ Main project.jl:169
[4] top-level scope
@ project.jl:189
If this is some sort of GPU memory or compute-resource issue, can I somehow put additional tasks in a queue until the first ones have finished? And is there a way to find out how many tasks I can safely launch in parallel, so that I can just submit the calculations in batches of that size?
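Something like the sketch below is roughly what I had in mind: capping the number of tasks in flight with a Base.Semaphore. The name compute_gpu_batched and the max_concurrent value are just placeholders I made up; I don't know what limit would actually be safe, or whether this is the right approach at all.

function compute_gpu_batched(outputs, inputs; max_concurrent = 8)
    sem = Base.Semaphore(max_concurrent)   # allow at most max_concurrent tasks at once
    @sync for i in eachindex(inputs)
        Threads.@spawn begin
            Base.acquire(sem)               # wait until a slot is free
            try
                do_svd!(outputs[i], inputs[i])
                CUDA.synchronize()          # finish the GPU work before freeing the slot
            finally
                Base.release(sem)
            end
        end
    end
    return outputs
end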