I have some code that is similar to an example from the CUDA.jl Introduction. Specifically, this one:
using CUDA

# Problem size and device-resident inputs for the add example.
N = 2^20
x_d = CUDA.fill(1.0f0, N)  # GPU vector of N Float32 ones
y_d = CUDA.fill(2.0f0, N)  # GPU vector of N Float32 twos
"""
    gpu_add2!(y, x)

CUDA kernel: accumulate `x` into `y` element-wise (`y .+= x`).
Launched with a single block; each thread starts at its own thread
index and advances by the block's thread count until `y` is covered.
"""
function gpu_add2!(y, x)
    start = threadIdx().x   # linear indexing suffices, so only `.x` is used
    step = blockDim().x     # one stride per full warp of block threads
    i = start
    while i <= length(y)
        @inbounds y[i] += x[i]
        i += step
    end
    return nothing
end
using Test  # `@test` lives in the Test stdlib; without this the snippet fails with UndefVarError

fill!(y_d, 2)  # reset the output vector so the check below is meaningful on re-runs
# Launch the kernel with a single block of 256 threads; the kernel's
# stride loop covers all N elements despite N >> 256.
@cuda threads=256 gpu_add2!(y_d, x_d)
# Copy back to the host and verify every element is 1.0f0 + 2.0f0.
@test all(Array(y_d) .== 3.0f0)
Running this example seems to cause an error on my computer:
ERROR: Format of __nvvm__reflect function not recognized
Stacktrace:
[1] error(s::String)
@ Base .\error.jl:35
[2] macro expansion
@ C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\ptx.jl:439 [inlined]
[3] macro expansion
@ C:\Users\Mark Lau\.julia\packages\TimerOutputs\LHjFw\src\TimerOutput.jl:253 [inlined]
[4] nvvm_reflect!(fun::LLVM.Function)
@ GPUCompiler C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\ptx.jl:413
[5] function_pass_callback(ptr::Ptr{Nothing}, data::Ptr{Nothing})
@ LLVM C:\Users\Mark Lau\.julia\packages\LLVM\X1AeZ\src\pass.jl:49
[6] LLVMRunPassManager
@ C:\Users\Mark Lau\.julia\packages\LLVM\X1AeZ\lib\13\libLLVM_h.jl:4898 [inlined]
[7] run!
@ C:\Users\Mark Lau\.julia\packages\LLVM\X1AeZ\src\passmanager.jl:39 [inlined]
[8] macro expansion
@ C:\Users\Mark Lau\.julia\packages\LLVM\X1AeZ\src\base.jl:102 [inlined]
[9] optimize_module!(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget}, mod::LLVM.Module)
@ GPUCompiler C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\ptx.jl:149
[10] optimize!(job::GPUCompiler.CompilerJob, mod::LLVM.Module)
@ GPUCompiler C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\optim.jl:245
[11] macro expansion
@ C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\driver.jl:342 [inlined]
[12] macro expansion
@ C:\Users\Mark Lau\.julia\packages\TimerOutputs\LHjFw\src\TimerOutput.jl:253 [inlined]
[13] macro expansion
@ C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\driver.jl:341 [inlined]
[14] macro expansion
@ C:\Users\Mark Lau\.julia\packages\TimerOutputs\LHjFw\src\TimerOutput.jl:253 [inlined]
[15] macro expansion
@ C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\driver.jl:331 [inlined]
[16] emit_llvm(job::GPUCompiler.CompilerJob, method_instance::Any; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool, ctx::LLVM.Context)
@ GPUCompiler C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\utils.jl:83
[17] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
@ CUDA C:\Users\Mark Lau\.julia\packages\CUDA\ZdCxS\src\compiler\execution.jl:360
[18] #221
@ C:\Users\Mark Lau\.julia\packages\CUDA\ZdCxS\src\compiler\execution.jl:354 [inlined]
[19] JuliaContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_add2!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}}})
@ GPUCompiler C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\driver.jl:76
[20] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA C:\Users\Mark Lau\.julia\packages\CUDA\ZdCxS\src\compiler\execution.jl:353
[21] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler C:\Users\Mark Lau\.julia\packages\GPUCompiler\S3TWf\src\cache.jl:90
[22] cufunction(f::typeof(gpu_add2!), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}; name::Nothing, always_inline::Bool, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA C:\Users\Mark Lau\.julia\packages\CUDA\ZdCxS\src\compiler\execution.jl:306
[23] cufunction(f::typeof(gpu_add2!), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}})
@ CUDA C:\Users\Mark Lau\.julia\packages\CUDA\ZdCxS\src\compiler\execution.jl:299
[24] top-level scope
@ C:\Users\Mark Lau\.julia\packages\CUDA\ZdCxS\src\compiler\execution.jl:102
I used to be able to run code like this, but after recently updating my packages through the package manager I started running into this problem, so maybe that has something to do with it.
Here’s my CUDA.versioninfo()
output:
CUDA.versioninfo()
CUDA runtime 11.8, artifact installation
CUDA driver 12.0
Unknown NVIDIA driver
Libraries:
- CUBLAS: 11.11.3
- CURAND: 10.3.0
- CUFFT: 10.9.0
- CUSOLVER: 11.4.1
- CUSPARSE: 11.7.5
- CUPTI: 18.0.0
- NVML: missing
Toolchain:
- Julia: 1.8.5
- LLVM: 13.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0, 7.1, 7.2
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86
1 device:
0: NVIDIA GeForce GTX 1070 (sm_61, 7.030 GiB / 8.000 GiB available)
Has anyone seen this error before? Any ideas what might be causing this?