CUDA.jl unsupported call to an unknown function, unsupported dynamic function invocation

I am trying to translate the following function into the GPU version. But faced with the following error:

unsupported call to an unknown function
Reason: unsupported dynamic function invocation (call to libcuda)
begin
  using Pkg
  Pkg.activate(".")
  using CUDA
  using Test
  using BenchmarkTools
  using Cthulhu
  using Random
  # CUDA.versioninfo()
end

function func!(y::AbstractVector{T}, w::AbstractVector{T2}, lambda::Float64,
  z::AbstractVector{T}) where {T<:Real,T2<:Real}

  m = length(y)
  c = zeros(Float32, m) |> cu
  d = zeros(Float32, m) |> cu
  e = zeros(Float32, m) |> cu

  d[1] = w[1] + lambda
  c[1] = -2 * lambda / d[1]
  e[1] = lambda / d[1]
  z[1] = w[1] * y[1]
  d[2] = w[2] + 5 * lambda - d[1] * c[1] * c[1]
  c[2] = (-4 * lambda - d[1] * c[1] * e[1]) / d[2]
  e[2] = lambda / d[2]
  z[2] = w[2] * y[2] - c[1] * z[1]

  # for (i = 2; i < m - 1; i++) 
  m = length(y)
  @inbounds for i = 3:(m-1)
    i1 = i - 1
    i2 = i - 2
    d[i] = w[i] + 6 * lambda - c[i1] * c[i1] * d[i1] - e[i2] * e[i2] * d[i2]
    c[i] = (-4 * lambda - d[i1] * c[i1] * e[i1]) / d[i]
    e[i] = lambda / d[i]
    z[i] = w[i] * y[i] - c[i1] * z[i1] - e[i2] * z[i2]
  end

  i = m - 1
  i1 = i - 1
  i2 = i - 2
  d[m-1] = w[m-1] + 5 * lambda - c[i1] * c[i1] * d[i1] - e[i2] * e[i2] * d[i2]
  c[m-1] = (-2 * lambda - d[i1] * c[i1] * e[i1]) / d[m-1]
  z[m-1] = w[m-1] * y[m-1] - c[i1] * z[i1] - e[i2] * z[i2]

  i = m
  i1 = i - 1
  i2 = i - 2
  d[m] = w[m] + lambda - c[i1] * c[i1] * d[i1] - e[i2] * e[i2] * d[i2]
  z[m] = (w[m] * y[m] - c[i1] * z[i1] - e[i2] * z[i2]) / d[m]
  z[m-1] = z[m-1] / d[m-1] - c[m-1] * z[m]

  @inbounds for i in (m-2):-1:1
    z[i] = z[i] / d[i] - c[i] * z[i+1] - e[i] * z[i+2]
  end
end


begin
  Random.seed!(1)
  n = Int(1e4)
  y = rand(100, n)
  w = rand(100, n)

  # gpu version
  function kernel(y, w, z)
    n = size(y, 2)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x

    @cuprintln("thread $(threadIdx().x), block $(blockIdx().x)")
    if i <= n
      func!(y[:, i], w[:, i], 2.0, z[:, i])
    end
    return
  end

  function main_gpu()
    y2 = cu(y)
    w2 = cu(w)
    res = cu(zeros(size(y)))

    N = size(y, 2)
    nthreads = 256 * 2
    nblocks = ceil(Int, N / nthreads)
    @show N nthreads nblocks
    CUDA.@sync begin
      # @device_code_warntype 
      @cuda threads = nthreads blocks = nblocks kernel(y2, w2, res)
    end
    return
  end

  @time main_gpu()
end

The error message:

Stacktrace:
 [1] unsafe_cuCtxGetDevice
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\libcuda.jl:150
 [2] current_device
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\devices.jl:23
 [3] device
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\context.jl:287
 [4] check_exceptions
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\src\compiler\exceptions.jl:33
 [5] #synchronize#134
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\stream.jl:134
 [6] multiple call sites
   @ unknown:0
Reason: unsupported call to an unknown function (call to ijl_lazy_load_and_lookup)
Stacktrace:
 [1] unsafe_cuCtxGetDevice
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\libcuda.jl:150
 [2] current_device
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\devices.jl:23
 [3] device
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\context.jl:287
 [4] check_exceptions
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\src\compiler\exceptions.jl:33
 [5] #synchronize#134
   @ C:\Users\kong\.julia\packages\CUDA\tTK8Y\lib\cudadrv\stream.jl:134
 [6] multiple call sites
   @ unknown:0
Reason: unsupported dynamic function invocation (call to libcuda)
1 Like

You’re using CuArray within a kernel (by calling cu). That’s not allowed within kernels, where you can only perform relatively simple scalar operations.

Thank you @maleadt . Could you tell me where can I find more examples about CUDA.jl?

Its documentation, e.g., the introductory tutorial: Introduction · CUDA.jl