My shortest MWE was:
using CUDA

function test1(u, X)
    X1 = similar(X)
    X1 .= rem.(X, u)
    return X1
end

function test2(u, X)
    X2 = similar(X)
    map!(x -> rem(x, u), X2, X)
    return X2
end

function main()
    t = 0
    for i in 1:10000
        X = CUDA.rand(Float32, Int(1e6)) .- 0.5
        a = test1(0.1, X)
        b = test2(0.1, X)
        t += sum(a .!== b)
    end
    return t
end

main()
Which produced the error:
ERROR: LoadError: Failed to compile PTX code (ptxas exited with code 1)
If you think this is a bug, please file an issue and attach C:\Users\James\AppData\Local\Temp\jl_3S0cNQ349S.ptx
Stacktrace:
[1] error(s::String)
@ Base .\error.jl:33
[2] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:399
[3] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler C:\Users\James\.julia\packages\GPUCompiler\7APUC\src\cache.jl:89
[4] cufunction(f::GPUArrays.var"#map_kernel#18"{Int64}, tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, var"#1#2"{Float64},
Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float64, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:297
[5] cufunction
@ C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:291 [inlined]
[6] macro expansion
@ C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:102 [inlined]
[7] #launch_heuristic#234
@ C:\Users\James\.julia\packages\CUDA\O0mym\src\gpuarrays.jl:17 [inlined]
[8] map!(f::Function, dest::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, xs::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer})
@ GPUArrays C:\Users\James\.julia\packages\GPUArrays\0vqbc\src\host\broadcast.jl:130
[9] test2(u::Float64, X::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer})
@ Main C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:13
[10] main()
@ Main C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:22
[11] top-level scope
@ C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:28
[12] include(fname::String)
@ Base.MainInclude .\client.jl:444
[13] top-level scope
@ REPL[7]:1
[14] top-level scope
@ C:\Users\James\.julia\packages\CUDA\O0mym\src\initialization.jl:52
in expression starting at C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:28
The offending line doesn’t always trigger the error, so the outer loop is there to force it.
I’m running this from the REPL with debug level 2, no check-bounds, etc., or via Pkg.test, on unmodified Julia 1.6.2 on both devices, with totally standard add CUDA installs.
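For what it’s worth, the offending call can be pulled out on its own. Note that CUDA.rand(Float32, n) .- 0.5 promotes the element type to Float64 (the stack trace shows CuDeviceVector{Float64, 1}), so the failing kernel is the map! whose closure captures a Float64 scalar. A stripped-down sketch of just that call, assuming the captured Float64 is the relevant part, looks like this, although per the flakiness above a single run doesn’t always hit the ptxas failure:

using CUDA

X = CUDA.rand(Float32, Int(1e6)) .- 0.5   # broadcast with a Float64 literal: X is CuArray{Float64}
X2 = similar(X)
let u = 0.1                               # Float64 scalar captured by the closure,
    map!(x -> rem(x, u), X2, X)           # matching var"#1#2"{Float64} in the stack trace
end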