Hi there, I have just started thinking about moving some of my JAX code to Julia and wanted to have a go using Enzyme for my AD. I am looking to make use of CUDA GPUs.
I have started with a simple program to verify things are working. My code works okay on CPU and gives the correct results.
On the GPU, however, I am getting an error regarding the "nvcuda.dll" file:
ERROR: SystemError: opening file "nvcuda": No such file or directory
I have checked that this file exists in System32 (I'm working on Windows), so it is on PATH, and I have also placed a copy in the bin folder of my CUDA toolkit installation. I am using a local installation of CUDA that I have linked with CUDA.jl.
I have tried hunting through the stacktrace to see if I can find where the .dll is called from but didn’t get very far.
I am working in VSCode with the Julia extension on Julia 1.8.2
Stacktrace:
[1] systemerror(p::String, errno::Int32; extrainfo::Nothing)
@ Base .\error.jl:176
[2] #systemerror#80
@ .\error.jl:175 [inlined]
[3] systemerror
@ .\error.jl:175 [inlined]
[4] open(fname::String; lock::Bool, read::Bool, write::Nothing, create::Nothing, truncate::Nothing, append::Nothing)
@ Base .\iostream.jl:293
[5] open(fname::String, mode::String; lock::Bool)
@ Base .\iostream.jl:356
[6] open(fname::String, mode::String)
@ Base .\iostream.jl:355
[7] open(::Enzyme.Compiler.var"#120#125", ::String, ::Vararg{String}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Base .\io.jl:382
[8] open
@ .\io.jl:381 [inlined]
[9] check_ir!(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(test3), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Vector{Float32}}}}, errors::Vector{Tuple{String, Vector{Base.StackTraces.StackFrame}, Any}}, imported::Set{String}, inst::LLVM.CallInst, calls::Vector{Any})
@ Enzyme.Compiler C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler\validation.jl:299
[10] check_ir!(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(test3), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Vector{Float32}}}}, errors::Vector{Tuple{String, Vector{Base.StackTraces.StackFrame}, Any}}, imported::Set{String}, f::LLVM.Function)
@ Enzyme.Compiler C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler\validation.jl:189
[11] check_ir!(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(test3), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Vector{Float32}}}}, errors::Vector{Tuple{String, Vector{Base.StackTraces.StackFrame}, Any}}, mod::LLVM.Module)
@ Enzyme.Compiler C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler\validation.jl:162
[12] check_ir
@ C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler\validation.jl:140 [inlined]
[13] codegen(output::Symbol, job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(test3), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Vector{Float32}}}}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, ctx::LLVM.Context, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
@ Enzyme.Compiler C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler.jl:6096
[14] _thunk
@ C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler.jl:6870 [inlined]
[15] _thunk(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(test3), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Vector{Float32}}}})
@ Enzyme.Compiler C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler.jl:6864
[16] cached_compilation(job::GPUCompiler.CompilerJob, key::UInt64, specid::UInt64)
@ Enzyme.Compiler C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler.jl:6908
[17] #s883#169
@ C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler.jl:6968 [inlined]
[18] var"#s883#169"(F::Any, Fn::Any, DF::Any, A::Any, TT::Any, Mode::Any, ModifiedBetween::Any, width::Any, specid::Any, ReturnPrimal::Any, ShadowInit::Any, ::Any, #unused#::Type, f::Any, df::Any, #unused#::Type, tt::Any, #unused#::Type, #unused#::Type, #unused#::Type, #unused#::Type, #unused#::Type, #unused#::Any)
@ Enzyme.Compiler .\none:0
[19] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any})
@ Core .\boot.jl:582
[20] thunk
@ C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler.jl:7001 [inlined]
[21] thunk (repeats 3 times)
@ C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\compiler.jl:6994 [inlined]
[22] autodiff(::EnzymeCore.ReverseMode, ::typeof(test3), ::Type{Const{Nothing}}, ::Duplicated{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, ::Vararg{Any})
@ Enzyme C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\Enzyme.jl:211
[23] autodiff(::EnzymeCore.ReverseMode, ::typeof(test3), ::Duplicated{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, ::Duplicated{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, ::Vararg{Any})
@ Enzyme C:\Users\hugom\.julia\packages\Enzyme\DIkTv\src\Enzyme.jl:248
[24] top-level scope
@ REPL[5]:1
CPU code (works fine):
using Enzyme
using CUDA
"""
    test3(x, y, z)

Write the sum of all elements of `x + 2.0f0*y` into `z[1]`.

The result is stored in place in `z`; the function itself returns `nothing`.
"""
function test3(x, y, z)
    combined = x + 2.0f0 * y
    z[1] = sum(combined)
    return nothing
end
# Host-side inputs `a`, `b` and the one-element output slot `c`.
a = ones(480, 800)
b = 5.0f0 * ones(480, 800)
c = [0f0]

# Companion arrays passed next to each value through `Duplicated`:
# `da` and `db` start as zeros, `dc` starts at one.
da = zero(a)
db = zero(b)
dc = [1.0f0]

# Reverse-mode call on `test3`, with every argument wrapped in `Duplicated`.
Enzyme.autodiff(
    Reverse,
    test3,
    Duplicated(a, da),
    Duplicated(b, db),
    Duplicated(c, dc),
)
GPU code:
# Inputs created with `CUDA.ones`, so `a_gpu` and `b_gpu` are CuArrays.
a_gpu = CUDA.ones(480, 800)
b_gpu = 5.0f0 * CUDA.ones(480, 800)

# NOTE: despite the `_gpu` suffix, `c_gpu` and `dc_gpu` are plain host Vectors.
c_gpu = [0f0]
dc_gpu = [1.0f0]

# Zero-filled companions for the two CuArray inputs.
da_gpu = zero(a_gpu)
db_gpu = zero(b_gpu)

# Same reverse-mode call as the CPU version, now with the CuArray inputs.
Enzyme.autodiff(
    Reverse,
    test3,
    Duplicated(a_gpu, da_gpu),
    Duplicated(b_gpu, db_gpu),
    Duplicated(c_gpu, dc_gpu),
)
I have tried using autodiff_deferred, and that also produced a similar error. Any suggestions for where I can begin troubleshooting the error would be most appreciated!
Thanks in advance!