I’m trying to implement adjoint_v1_custom_kernel! using dynamic parallelism (see the code below). When I try to run it, I get a large number of unsupported call errors after a DWARF warning. I tried to follow the examples from the CUDA.jl tests, as suggested in another topic. I’m probably doing something wrong with the dynamic parallelism approach, but I’m not sure what. Any help is appreciated.
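For reference, this is the basic pattern I was trying to follow: a minimal sketch of a device-side launch as I understand it from the CUDA.jl test suite (child and parent are placeholder names, not part of my actual code):

using CUDA

function child(a)
    a[1] = 42.0f0   # record that the child grid actually ran
    return
end

function parent(a)
    @cuda dynamic=true child(a)   # device-side launch of a child grid
    return
end

a = CUDA.zeros(Float32, 1)
@cuda parent(a)
synchronize()   # my understanding: a parent grid only completes once its children do
@assert Array(a)[1] == 42.0f0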
Additionally, I tried using Cthulhu with code_typed(err; interactive = true), but a StackOverflowError occurs when doing so (see the last block). I tested the child kernels kernel_V! and kernel_accumulate_G! separately (see the harness sketch after their definitions below), and they both work and return the values I expect.
Excerpt from the error log (there are many more entries like these, most of them calls beginning with jl_):
warning: linking module flags 'Dwarf Version': IDs have conflicting values ('i32 4' from globals with 'i32 2' from start)
ERROR: LoadError: InvalidIRError: compiling MethodInstance for adjoint_v1_custom_kernel!(::CuDeviceMatrix{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceMatrix{…}, ::CuDeviceMatrix{…}, ::CuDeviceVector{…}, ::CuDeviceMatrix{…}, ::Int64, ::Int64, ::Int64) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to hash)
Stacktrace:
[1] hash
@ ./version.jl:226
[2] multiple call sites
@ unknown:0
Reason: unsupported dynamic function invocation (call to hash)
Stacktrace:
[1] hash
@ ./version.jl:227
[2] multiple call sites
@ unknown:0
Reason: unsupported call to a lazy-initialized function (call to ijl_rethrow)
Stacktrace:
[1] rethrow
@ ./error.jl:61
[2] multiple call sites
@ unknown:0
Reason: unsupported call to an unknown function (call to julia.get_pgcstack)
Stacktrace:
[1] multiple call sites
@ unknown:0
Reason: unsupported call to a lazy-initialized function (call to jl_gc_run_pending_finalizers)
Stacktrace:
[1] enable_finalizers
@ ./gcutils.jl:161
[2] _trylock
@ ./lock.jl:133
[3] multiple call sites
@ unknown:0
Reason: unsupported call to an external C function (call to jl_gc_have_pending_finalizers)
Stacktrace:
[1] enable_finalizers
@ ./gcutils.jl:160
[2] _trylock
@ ./lock.jl:133
[3] multiple call sites
@ unknown:0
Reason: unsupported call to a lazy-initialized function (call to jl_gc_run_pending_finalizers)
Stacktrace:
[1] enable_finalizers
@ ./gcutils.jl:161
[2] trylock
@ ./locks-mt.jl:59
[3] lock
@ ./locks-mt.jl:43
[4] multiple call sites
@ unknown:0
Below are the kernels I’m using.
function kernel_accumulate_G!(G::CuDeviceMatrix{Float64},
        flat_P::CuDeviceVector{Float64}, offset_P::Int32, W::CuDeviceMatrix{Float64},
        lam_l::Float64, n::Int64, s::Int32, r::Int64)
    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if tid < 1 || tid > n
        return
    end
    # grid-stride loop over the n rows; accumulates G += lam_l * P_l * W
    stride = blockDim().x * gridDim().x
    idx = tid
    while idx <= n
        for r_idx in 1:r
            acc = 0.0
            for s_idx in 1:s
                flat_idx = offset_P + (s_idx - 1) * n + (idx - 1)
                P_si = flat_P[flat_idx]
                W_sr = W[s_idx, r_idx]
                acc += P_si * W_sr
            end
            CUDA.@atomic G[idx, r_idx] += lam_l * acc
        end
        idx += stride
    end
    return
end
function kernel_V!(V::CuDeviceMatrix{Float64}, flat_P::CuDeviceVector{Float64},
        s::Int32, U::CuDeviceMatrix{Float64}, n::Int64, r::Int64)
    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if tid < 1 || tid > n
        return
    end
    # grid-stride loop over the n rows; accumulates V = P_l' * U
    stride = blockDim().x * gridDim().x
    idx = tid
    while idx <= n
        for s_idx in 1:s
            P_is = flat_P[(s_idx-1)*n+idx]
            for r_idx in 1:r
                U_ir = U[idx, r_idx]
                # @cuprintf("Contribution idx = %d, s_idx = %d, r_idx = %d: P = %f, U = %f, V = %f\n",
                #           idx, s_idx, r_idx, P_is, U_ir, P_is * U_ir)
                CUDA.@atomic V[s_idx, r_idx] += P_is * U_ir
            end
        end
        idx += stride
    end
    return
end
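For completeness, this is roughly how I tested kernel_V! on its own (the sizes here are made up; I used a similar host-side harness for kernel_accumulate_G!):

using CUDA

n, r = 8, 3
s = Int32(2)
flat_P = CUDA.rand(Float64, n * Int(s))   # a single P_l, stored column-major (n × s)
U = CUDA.rand(Float64, n, r)
V = CUDA.zeros(Float64, Int(s), r)

@cuda threads=64 kernel_V!(V, flat_P, s, U, n, r)
synchronize()

# host-side check: kernel_V! should accumulate V = P' * U
P = reshape(Array(flat_P), n, Int(s))
@assert Array(V) ≈ P' * Array(U)

Launched like this, both child kernels return the values I expect.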
"""
Compute G = ∑_l lam_l P_l D_l P_l^T U, for l=1,...,m;
Where P is a n × s_l matrix, D is a s_l × s_l diagonal matrix and U is a n × r.
flat_P and flat_D are vectorized representations of all P_l and D_l.
offset_P and offset_D points to the beginning of each P_l/D_l. S = [s_l] for each l.
"""
function adjoint_v1_custom_kernel!(G::CuDeviceMatrix{Float64},
        flat_P::CuDeviceVector{Float64}, flat_D::CuDeviceVector{Float64},
        offsets_P::CuDeviceVector{Int32}, offsets_D::CuDeviceVector{Int32},
        S::CuDeviceVector{Int32}, V::CuDeviceMatrix{Float64},
        W::CuDeviceMatrix{Float64}, lam::CuDeviceVector{Float64},
        U::CuDeviceMatrix{Float64}, n::Int64, r::Int64, m::Int64)
    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if tid < 1 || tid > m
        return
    end
    s_i = S[tid]
    off_P = offsets_P[tid]
    off_D = offsets_D[tid]
    lam_l = lam[tid]
    # child launch: V = P_l' * U
    @cuda dynamic = true kernel_V!(V, flat_P, s_i, U, n, r)
    CUDA.synchronize()
    # W = D_l * V
    for t in 1:s_i
        for k in 1:r
            W[t, k] = flat_D[off_D+t] * V[t, k]
        end
    end
    # child launch: G += lam_l * P_l * W
    @cuda dynamic = true kernel_accumulate_G!(G, flat_P, off_P, W, lam_l, n, s_i, r)
    return
end
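To make the intended math concrete, here is a plain host-side reference for the same sum (a sketch with a made-up helper name, adjoint_reference; it assumes offsets_P[l] is the 1-based index of P_l's first entry, matching kernel_accumulate_G!, and offsets_D[l] is a 0-based offset into flat_D, matching the W loop above):

using LinearAlgebra

function adjoint_reference(flat_P, flat_D, offsets_P, offsets_D, S, lam, U)
    n, r = size(U)
    G = zeros(eltype(U), n, r)
    for l in eachindex(S)
        s = Int(S[l])
        # P_l occupies flat_P[offsets_P[l] : offsets_P[l] + n*s - 1], column-major
        P = reshape(flat_P[offsets_P[l] : offsets_P[l] + n*s - 1], n, s)
        # D_l's diagonal occupies flat_D[offsets_D[l]+1 : offsets_D[l]+s]
        d = flat_D[offsets_D[l]+1 : offsets_D[l]+s]
        G .+= lam[l] .* (P * (Diagonal(d) * (P' * U)))
    end
    return G
end

This is the result I expect G to hold after the kernel finishes.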
function adjoint_v1_custom_kernel_call!(G, flat_P, flat_D, offsets_P, offsets_D, S, λ, U)
    m, n, r = size(S, 1), size(U, 1), size(U, 2)
    exp_T = eltype(G)
    V = CUDA.zeros(exp_T, Array(S)[1], r)
    W = CUDA.zeros(exp_T, Array(S)[1], r)
    threads = min(256, m)
    blocks = ceil(Int, m / threads)
    @cuda threads = threads blocks = blocks adjoint_v1_custom_kernel!(G,
        flat_P, flat_D, offsets_P, offsets_D, S, V, W, λ, U, n, r, m)
    CUDA.synchronize()
    @show G
    readline()
end
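For context, this is roughly how I set up the inputs and invoke the wrapper (tiny hypothetical data with a single block, m = 1); the @cuda launch inside it is what triggers the InvalidIRError above:

using CUDA

n, r = 6, 2
s = Int32(3)
S = CuArray(Int32[s])
offsets_P = CuArray(Int32[1])   # 1-based start of P_1 in flat_P
offsets_D = CuArray(Int32[0])   # 0-based offset of D_1 in flat_D
flat_P = CUDA.rand(Float64, n * Int(s))
flat_D = CUDA.rand(Float64, Int(s))
λ = CuArray([1.0])
U = CUDA.rand(Float64, n, r)
G = CUDA.zeros(Float64, n, r)

adjoint_v1_custom_kernel_call!(G, flat_P, flat_D, offsets_P, offsets_D, S, λ, U)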
Error from Cthulhu:
Stacktrace:
[1] descend_code_typed(args::Any; kwargs::@Kwargs{interp::GPUCompiler.GPUInterpreter}) (repeats 3188 times)
@ Cthulhu ~/.julia/packages/Cthulhu/kMTSr/src/Cthulhu.jl:149
[2] code_typed(job::GPUCompiler.CompilerJob; interactive::Bool, kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/sy6sk/src/reflection.jl:134
[3] code_typed
@ ~/.julia/packages/GPUCompiler/sy6sk/src/reflection.jl:126 [inlined]
[4] #code_typed#177
@ ~/.julia/packages/GPUCompiler/sy6sk/src/reflection.jl:160 [inlined]
[5] adjoint_v1_custom_kernel_call!(G::CuArray{…}, flat_P::CuArray{…}, flat_D::CuArray{…}, offsets_P::CuArray{…}, offsets_D::CuArray{…}, S::CuArray{…}, λ::CuArray{…}, U::CuArray{…}, sdpa::LowRankSDPA{…})
@ Main /path/benchmark_lowrank.jl:240
[6] run_experiments()
@ Main /path/benchmark_lowrank.jl:582
[7] top-level scope
@ /path//benchmark_lowrank.jl:923
[8] include(fname::String)
@ Main ./sysimg.jl:38
[9] top-level scope
@ REPL[3]:1
in expression starting at /path/benchmark_lowrank.jl:911