Kernel with dynamic parallelism seems to be calling CPU functions

I’m trying to implement adjoint_v1_custom_kernel! using dynamic parallelism (see code below). I’m not sure why, but when I try to run it I get a lot of unsupported call errors, preceded by a DWARF warning. I tried to follow the examples from the CUDA.jl tests, as suggested in another topic. I’m probably doing something wrong with how I use dynamic parallelism, but I’m not sure what. Any help is appreciated.

Additionally, I tried using Cthulhu with code_typed(err; interactive = true), but doing so triggers a StackOverflowError (see the last block). I tested the child kernels kernel_V! and kernel_accumulate_G! separately and they both work and return the values I expect.
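
For reference, this is roughly how the introspection is invoked; the try/catch wrapper below is only a sketch of what happens inside adjoint_v1_custom_kernel_call!, not the exact code from benchmark_lowrank.jl:

using CUDA, Cthulhu

# Sketch only: capture the compilation error and hand it to Cthulhu.
err = try
    @cuda threads = threads blocks = blocks adjoint_v1_custom_kernel!(G,
        flat_P, flat_D, offsets_P, offsets_D, S, V, W, λ, U, n, r, m)
catch e
    e
end
code_typed(err; interactive = true)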

Excerpt from the error log (there are many more entries like these, most referring to functions beginning with jl_):

warning: linking module flags 'Dwarf Version': IDs have conflicting values ('i32 4' from globals with 'i32 2' from start)
ERROR: LoadError: InvalidIRError: compiling MethodInstance for adjoint_v1_custom_kernel!(::CuDeviceMatrix{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceMatrix{…}, ::CuDeviceMatrix{…}, ::CuDeviceVector{…}, ::CuDeviceMatrix{…}, ::Int64, ::Int64, ::Int64) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to hash)
Stacktrace:
 [1] hash
   @ ./version.jl:226
 [2] multiple call sites
   @ unknown:0
Reason: unsupported dynamic function invocation (call to hash)
Stacktrace:
 [1] hash
   @ ./version.jl:227
 [2] multiple call sites
   @ unknown:0
Reason: unsupported call to a lazy-initialized function (call to ijl_rethrow)
Stacktrace:
 [1] rethrow
   @ ./error.jl:61
 [2] multiple call sites
   @ unknown:0
Reason: unsupported call to an unknown function (call to julia.get_pgcstack)
Stacktrace:
 [1] multiple call sites
   @ unknown:0
Reason: unsupported call to a lazy-initialized function (call to jl_gc_run_pending_finalizers)
Stacktrace:
 [1] enable_finalizers
   @ ./gcutils.jl:161
 [2] _trylock
   @ ./lock.jl:133
 [3] multiple call sites
   @ unknown:0
Reason: unsupported call to an external C function (call to jl_gc_have_pending_finalizers)
Stacktrace:
 [1] enable_finalizers
   @ ./gcutils.jl:160
 [2] _trylock
   @ ./lock.jl:133
 [3] multiple call sites
   @ unknown:0
Reason: unsupported call to a lazy-initialized function (call to jl_gc_run_pending_finalizers)
Stacktrace:
 [1] enable_finalizers
   @ ./gcutils.jl:161
 [2] trylock
   @ ./locks-mt.jl:59
 [3] lock
   @ ./locks-mt.jl:43
 [4] multiple call sites
   @ unknown:0

Below are the kernels I’m using.


function kernel_accumulate_G!(G::CuDeviceMatrix{Float64},
    flat_P::CuDeviceVector{Float64}, offset_P::Int32, W::CuDeviceMatrix{Float64},
    lam_l::Float64, n::Int64, s::Int32, r::Int64)

    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if tid < 1 || tid > n
        return
    end
    stride = blockDim().x * gridDim().x

    idx = tid
    while idx <= n
        for r_idx in 1:r
            acc = 0.0
            for s_idx in 1:s
                flat_idx = offset_P + (s_idx - 1) * n + (idx - 1)
                P_si = flat_P[flat_idx]
                W_sr = W[s_idx, r_idx]
                acc += P_si * W_sr
            end
            CUDA.@atomic G[idx, r_idx] += lam_l * acc
        end
        idx += stride
    end
    return
end
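
As a sanity check, a standalone launch along these lines (with made-up sizes and values, not the real problem data) is how this kernel can be exercised on its own:

using CUDA

# Illustrative standalone test of kernel_accumulate_G!; all sizes/values are made up.
n, s, r = 3, Int32(2), 2
G      = CUDA.zeros(Float64, n, r)
flat_P = CuArray([1.0, 0.0, 0.0, 0.0, 1.0, 2.0])   # one n×s factor, stored column-major
W      = CUDA.ones(Float64, 2, r)
@cuda threads = min(256, n) blocks = cld(n, 256) kernel_accumulate_G!(G, flat_P, Int32(1), W, 1.0, n, s, r)
CUDA.synchronize()
@show Array(G)   # should equal lam_l * P * W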

function kernel_V!(V::CuDeviceMatrix{Float64}, flat_P::CuDeviceVector{Float64},
    s::Int32,
    U::CuDeviceMatrix{Float64},
    n::Int64,
    r::Int64)

    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if tid < 1 || tid > n
        return
    end

    stride = blockDim().x * gridDim().x
    idx = tid
    while idx <= n
        for s_idx in 1:s
            P_is = flat_P[(s_idx-1)*n+idx]
            for r_idx in 1:r
                U_ir = U[idx, r_idx]
                # @cuprintf("Contribution idx = %d, s_idx = %d, r_idx = %d: P = %f, U = %f, V = %f\n",
                #     idx, s_idx, r_idx, P_is, U_ir, P_is * U_ir)
                CUDA.@atomic V[s_idx, r_idx] += P_is * U_ir
            end
        end
        idx += stride
    end

    return
end
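
And similarly for kernel_V!, again with toy data (the expected result is V = P' * U):

using CUDA

# Illustrative standalone test of kernel_V!; all sizes/values are made up.
n, s, r = 3, Int32(2), 2
V      = CUDA.zeros(Float64, 2, r)
flat_P = CuArray([1.0, 0.0, 0.0, 0.0, 1.0, 2.0])   # P = [1 0; 0 1; 0 2], column-major
U      = CuArray([1.0 2.0; 2.0 1.0; 1.0 1.0])
@cuda threads = min(256, n) blocks = cld(n, 256) kernel_V!(V, flat_P, s, U, n, r)
CUDA.synchronize()
@show Array(V)   # should equal P' * U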

"""
Compute G = ∑_l lam_l P_l D_l P_l^T U, for l=1,...,m;
Where P is a n × s_l matrix, D is a s_l × s_l diagonal matrix and U is a n × r.
flat_P and flat_D are vectorized representations of all P_l and D_l. 
offset_P and offset_D points to the beginning of each P_l/D_l. S = [s_l] for each l.
"""
function adjoint_v1_custom_kernel!(G::CuDeviceMatrix{Float64},
    flat_P::CuDeviceVector{Float64}, flat_D::CuDeviceVector{Float64},
    offsets_P::CuDeviceVector{Int32}, offsets_D::CuDeviceVector{Int32},
    S::CuDeviceVector{Int32}, V::CuDeviceMatrix{Float64},
    W::CuDeviceMatrix{Float64}, lam::CuDeviceVector{Float64},
    U::CuDeviceMatrix{Float64}, n::Int64, r::Int64, m::Int64)

    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if tid < 1 || tid > m
        return
    end

    s_i = S[tid]
    off_P = offsets_P[tid]
    off_D = offsets_D[tid]
    lam_l = lam[tid]

    @cuda dynamic = true kernel_V!(V, flat_P, S[tid], U, n, r)
    CUDA.synchronize()
    for t in 1:S[tid]
        for k in 1:r
            W[t, k] = flat_D[off_D+t] * V[t, k]
        end
    end
    @cuda dynamic = true kernel_accumulate_G!(G, flat_P, off_P, W, lam_l, n, S[tid], r)
    return
end


function adjoint_v1_custom_kernel_call!(G, flat_P, flat_D, offsets_P, offsets_D, S, λ, U)
    m, n, r = size(S, 1), size(U, 1), size(U, 2)
    exp_T = eltype(G)
    V = CUDA.zeros(exp_T, Array(S)[1], r)
    W = CUDA.zeros(exp_T, Array(S)[1], r)
    threads = min(256, m)
    blocks = ceil(Int, m / threads)
    @cuda threads = threads blocks = blocks adjoint_v1_custom_kernel!(G,
        flat_P, flat_D, offsets_P, offsets_D, S, V, W, λ, U, n, r, m)
    CUDA.synchronize()
    @show G
    readline()
end
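
For completeness, the quantity the kernels are meant to produce can be checked against a plain CPU reference. The sketch below is not from the original benchmark; it assumes the offsets point at the first element of each column-major block and operates on Array inputs:

using LinearAlgebra

# CPU reference sketch for G = ∑_l lam_l * P_l * D_l * P_l' * U.
# Assumption: offsets_P[l] / offsets_D[l] index the first element of the l-th block.
function adjoint_v1_reference(flat_P, flat_D, offsets_P, offsets_D, S, lam, U)
    n, r = size(U)
    G = zeros(eltype(U), n, r)
    for l in eachindex(S)
        s   = Int(S[l])
        P_l = reshape(view(flat_P, offsets_P[l]:(offsets_P[l] + n * s - 1)), n, s)
        D_l = Diagonal(view(flat_D, offsets_D[l]:(offsets_D[l] + s - 1)))
        G .+= lam[l] .* (P_l * (D_l * (P_l' * U)))
    end
    return G
end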

Error from Cthulhu:

Stacktrace:
 [1] descend_code_typed(args::Any; kwargs::@Kwargs{interp::GPUCompiler.GPUInterpreter}) (repeats 3188 times)
   @ Cthulhu ~/.julia/packages/Cthulhu/kMTSr/src/Cthulhu.jl:149
 [2] code_typed(job::GPUCompiler.CompilerJob; interactive::Bool, kwargs::@Kwargs{})
   @ GPUCompiler ~/.julia/packages/GPUCompiler/sy6sk/src/reflection.jl:134
 [3] code_typed
   @ ~/.julia/packages/GPUCompiler/sy6sk/src/reflection.jl:126 [inlined]
 [4] #code_typed#177
   @ ~/.julia/packages/GPUCompiler/sy6sk/src/reflection.jl:160 [inlined]
 [5] adjoint_v1_custom_kernel_call!(G::CuArray{…}, flat_P::CuArray{…}, flat_D::CuArray{…}, offsets_P::CuArray{…}, offsets_D::CuArray{…}, S::CuArray{…}, λ::CuArray{…}, U::CuArray{…}, sdpa::LowRankSDPA{…})
   @ Main /path/benchmark_lowrank.jl:240
 [6] run_experiments()
   @ Main /path/benchmark_lowrank.jl:582
 [7] top-level scope
   @ /path//benchmark_lowrank.jl:923
 [8] include(fname::String)
   @ Main ./sysimg.jl:38
 [9] top-level scope
   @ REPL[3]:1
in expression starting at /path/benchmark_lowrank.jl:911

I think you’re using the wrong synchronization call. Try device_synchronize(): CUDA.jl/src/device/intrinsics/dynamic_parallelism.jl at e561e7a106684f8e4be59cad98a51cc304c671d2 · JuliaGPU/CUDA.jl · GitHub

Thanks @maleadt. I tried device_synchronize(), and now the original errors don’t happen, but I’m getting strange (seemingly random) values in the kernel’s input arrays. For instance, if I call:

function adjoint_v1_custom_kernel!(G::CuDeviceMatrix{Float64},
    flat_P::CuDeviceVector{Float64}, flat_D::CuDeviceVector{Float64},
    offsets_P::CuDeviceVector{Int32}, offsets_D::CuDeviceVector{Int32},
    S::CuDeviceVector{Int32}, V::CuDeviceMatrix{Float64},
    W::CuDeviceMatrix{Float64}, lam::CuDeviceVector{Float64},
    U::CuDeviceMatrix{Float64}, n::Int64, r::Int64, m::Int64)

    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if tid < 1 || tid > m
        return
    end

    @cuprintf("tid = %d, S = %d, off_P = %d, off_D = %d, lam = %f\n", tid, S[tid], offsets_P[tid],
        offsets_D[tid], lam[tid])

    s_i = S[tid]
    off_P = offsets_P[tid]
    off_D = offsets_D[tid]
    lam_l = lam[tid]

    @cuda dynamic = true kernel_V!(V, flat_P, S[tid], U, n, r)
    CUDA.device_synchronize()
    for t in 1:S[tid]
        for k in 1:r
            W[t, k] = flat_D[off_D+t] * V[t, k]
        end
    end
    @cuda dynamic = true kernel_accumulate_G!(G, flat_P, off_P, W, lam_l, n, S[tid], r)
    return
end

With:

G = [0.0 0.0; 0.0 0.0; 0.0 0.0]
flat_P = [1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 0.0, 1.0, 1.0, 1.0, 0.0]
offsets_P = Int32[1, 7]
flat_D = [2.0, 1.0, 1.0, 1.0]
offsets_D = Int32[1, 3]
S = Int32[2, 2]
lam= [1.0, 1.0]
U = [1.0 2.0; 2.0 1.0; 1.0 1.0]

The @cuprintf prints:

tid = 1, S = 0, off_P = 2, off_D = 1, lam = 0.000000
tid = 2, S = 0, off_P = 2, off_D = 7, lam = -452553613805569486818732157388455520281020663598239410150432692130368304801918414631003522883310643388494356859881578079556248166068695006746267422209699022836030709186144239263994074073762479625109281337986110214130919324778496.000000

I noticed that this behavior also happens if I do not call device_synchronize at all.

Any ideas what might be causing this? Thanks again.

Pretty hard to tell, as there could be many causes. Maybe lam is getting freed on the CPU side, since I think child kernels launch on their own stream. Or your format string is wrong, as you hard-code conversion specifiers for dynamic values (try simply doing string interpolation with @cuprintln; that will use the correct format string for you).
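
For example, inside adjoint_v1_custom_kernel! the hand-written @cuprintf line could be replaced with something like this (interpolation lets the macro pick the conversion specifiers):

# Sketch: drop-in replacement for the @cuprintf call inside the kernel.
@cuprintln("tid = $tid, S = $(S[tid]), off_P = $(offsets_P[tid]), off_D = $(offsets_D[tid]), lam = $(lam[tid])")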