Multiple tasks on the GPU resulting in a CUSOLVERError for a large number of tasks

Hi all,

I am learning CUDA.jl and trying to launch a bunch of tasks in parallel on the GPU:

using LinearAlgebra, Random, CUDA

function do_svd!(c, a)
    c .= svdvals!(a)   # singular values, computed on the GPU via CUSOLVER
end

function compute_gpu(outputs, inputs)
    @sync for i in eachindex(inputs)
        Threads.@spawn begin
            do_svd!(outputs[i], inputs[i])
            CUDA.synchronize()
        end
    end
    return outputs
end

n = 2000
A = rand(Float32, n, n)
iter = 20

inputs_gpu = [CuArray(A) for _ in 1:iter]
outputs_gpu = [CUDA.zeros(minimum(size(first(inputs_gpu))), 1) for _ in 1:length(inputs_gpu)]
outputs_gpu = compute_gpu(outputs_gpu, inputs_gpu)

This works perfectly fine for iter=20, but breaks at iter=50 with the following error (after which even the iter=20 version stops working, so the whole GPU seems blocked):

ERROR: TaskFailedException

    nested task error: CUSOLVERError: an internal operation failed (code 7, CUSOLVER_STATUS_INTERNAL_ERROR)
    Stacktrace:
      [1] throw_api_error(res::CUDA.CUSOLVER.cusolverStatus_t)
        @ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\error.jl:46
      [2] macro expansion
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\error.jl:59 [inlined]
      [3] cusolverDnCreate()
        @ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:10
      [4] #2780
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:45 [inlined]
      [5] (::CUDA.APIUtils.var"#8#11"{CUDA.CUSOLVER.var"#2780#2787", CUDA.APIUtils.HandleCache{CuContext, Ptr{Nothing}}, CuContext})()
        @ CUDA.APIUtils \.julia\packages\CUDA\DfvRa\lib\utils\cache.jl:22
      [6] lock(f::CUDA.APIUtils.var"#8#11"{CUDA.CUSOLVER.var"#2780#2787", CUDA.APIUtils.HandleCache{CuContext, Ptr{Nothing}}, CuContext}, l::ReentrantLock)
        @ Base .\lock.jl:185
      [7] check_cache
        @ \.julia\packages\CUDA\DfvRa\lib\utils\cache.jl:20 [inlined]
      [8] pop!
        @ \.julia\packages\CUDA\DfvRa\lib\utils\cache.jl:41 [inlined]
      [9] (::CUDA.CUSOLVER.var"#new_state#2786")(cuda::NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}})
        @ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:44
     [10] #2784
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:60 [inlined]
     [11] get!(default::CUDA.CUSOLVER.var"#2784#2791"{CUDA.CUSOLVER.var"#new_state#2786", NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}}}, h::Dict{CuContext, NamedTuple{(:handle, :stream), Tuple{Ptr{Nothing}, CuStream}}}, key::CuContext)
        @ Base .\dict.jl:481
     [12] dense_handle()
        @ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\CUSOLVER.jl:59
     [13] bufferSize
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:457 [inlined]
     [14] get_size
        @ \.julia\packages\CUDA\DfvRa\lib\utils\call.jl:58 [inlined]
     [15] with_workspace(f::CUDA.CUSOLVER.var"#2626#2629"{Char, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Int32, 1, CUDA.Mem.DeviceBuffer}, Base.RefValue{Ptr{Nothing}}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Int64, Int64, Int64}, eltyp::Type{Float32}, size::CUDA.CUSOLVER.var"#bufferSize#2628"{Char, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Base.RefValue{Ptr{Nothing}}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Int64, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Int64, Int64, Int64}, fallback::Nothing; keep::Bool)
        @ CUDA.APIUtils \.julia\packages\CUDA\DfvRa\lib\utils\call.jl:61
     [16] with_workspace (repeats 2 times)
        @ \.julia\packages\CUDA\DfvRa\lib\utils\call.jl:56 [inlined]
     [17] gesvdj!(jobz::Char, econ::Int64, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}; tol::Float32, max_sweeps::Int64)
        @ CUDA.CUSOLVER \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:463
     [18] gesvdj!
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\dense.jl:424 [inlined]
     [19] _svdvals!
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\linalg.jl:302 [inlined]
     [20] #svdvals!#2773
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\linalg.jl:294 [inlined]
     [21] svdvals!
        @ \.julia\packages\CUDA\DfvRa\lib\cusolver\linalg.jl:294 [inlined]
     [22] do_svd!(c::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, a::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
        @ Main \OneDrive\Documents\CSE_MIT\personal_projects\Random_matrices project.jl:158
     [23] macro expansion
        @ project.jl:171 [inlined]
     [24] (::var"#5#6"{Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Int64})()
        @ Main .\threadingconstructs.jl:258

...and 3 more exceptions.

Stacktrace:
 [1] sync_end(c::Channel{Any})
   @ Base .\task.jl:436
 [2] macro expansion
   @ .\task.jl:455 [inlined]
 [3] compute_gpu(outputs::Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, inputs::Vector{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})
   @ Main project.jl:169
 [4] top-level scope
   @ project.jl:189

If this is some sort of GPU memory or compute-capacity issue, can I somehow put additional tasks in a queue until the first tasks have finished? Is there a way to obtain the number of tasks I can launch in parallel without causing this issue, so that I can submit the calculations in batches of that size?

I think you are probably running out of GPU memory. Tasks won't automatically queue up until memory is free; instead, you'll run into an OOM error when they try to allocate. In this case, it seems an NVIDIA library (which really doesn't like running close to OOM) is failing to allocate some internal state, causing this error. We can try to work around that by freeing up memory when the error is encountered; I've opened a PR for that: Retry CUSOLVER handle creation when encountering an internal error (JuliaGPU/CUDA.jl#1691). That's ultimately just a workaround, though; it's better to avoid running very close to OOM in the first place.
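One way to do that is to cap how many tasks are doing GPU work at the same time instead of spawning all of them at once. Here is a minimal sketch using a Base.Semaphore (the default of 4 concurrent tasks is an arbitrary placeholder, not a recommendation; it reuses do_svd! from your post):

using LinearAlgebra, CUDA

# Sketch: throttle GPU work with a counting semaphore so that at most
# `max_tasks` SVD tasks are in flight at any time. `max_tasks = 4` is an
# arbitrary placeholder -- tune it for your GPU and problem size.
function compute_gpu_throttled!(outputs, inputs; max_tasks::Int = 4)
    sem = Base.Semaphore(max_tasks)
    @sync for i in eachindex(inputs)
        Threads.@spawn begin
            Base.acquire(sem)            # wait for a free slot
            try
                do_svd!(outputs[i], inputs[i])
                CUDA.synchronize()
            finally
                Base.release(sem)        # hand the slot to the next task
            end
        end
    end
    return outputs
end

Since a slot is only released after CUDA.synchronize(), at most max_tasks cuSOLVER workspaces need to exist at once; the input matrices themselves of course still all live on the device.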

Great, thank you! Is there a way to check whether I am running close to OOM? E.g. if I am writing a package for other users, can I somehow find out how much memory is available on their GPU, or specifically how many simultaneous tasks I can launch without running close to OOM?

You can check available memory with functions like CUDA.total_memory() and CUDA.available_memory(), just like the Sys functions for CPU memory in Base (where, similarly, you can't just launch an unbounded number of tasks without running into OOM).
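For example, here is a very rough way to turn that into a batch size; everything in it is a sketch, and in particular the 4x workspace factor is an assumed safety margin for cuSOLVER's internal buffers, not a measured value:

using CUDA

# Sketch: guess how many n-by-n Float32 SVD tasks might fit in free device
# memory. workspace_factor = 4 is an assumed safety margin -- calibrate it
# against your own workload.
function estimate_batch_size(n::Integer; workspace_factor = 4)
    bytes_per_task = workspace_factor * n * n * sizeof(Float32)
    free_bytes = CUDA.available_memory()
    return max(1, Int(free_bytes ÷ bytes_per_task))
end

estimate_batch_size(2000)   # e.g. for the 2000x2000 problems above

Combining such an estimate with an explicit cap on concurrent tasks (as in the semaphore sketch above) is probably more robust than trusting the estimate alone.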

Amazing, that solves it, thank you!