Out-of-bounds error when calling map!

My shortest MWE was:

using CUDA


# Reference path: element-wise `rem(x, u)` via broadcasting, with `u`
# passed as a broadcast argument (not captured in a closure).
# Returns a newly allocated array of the same type/shape as `X`.
function test1(u, X)
    result = similar(X)
    broadcast!(rem, result, X, u)   # equivalent to result .= rem.(X, u)
    return result
end

# Problematic path from the report: same element-wise `rem(x, u)`, but
# routed through `map!` with a partially-applied function, so `u` travels
# inside the mapped callable rather than as a broadcast argument.
# Returns a newly allocated array of the same type/shape as `X`.
function test2(u, X)
    result = similar(X)
    map!(Base.Fix2(rem, u), result, X)   # Fix2(rem, u) ≡ x -> rem(x, u)
    return result
end

# Repro driver: repeatedly compares the broadcast path (test1) against the
# map! path (test2) and tallies element-wise disagreements. The failure is
# intermittent, so many iterations are needed to trigger it.
function main()
    mismatches = 0
    for _ in 1:10000
        # NOTE: `.- 0.5` promotes the Float32 array to Float64 (the literal
        # 0.5 is Float64), which matches the CuDeviceVector{Float64} seen in
        # the stack trace below.
        X = CUDA.rand(Float32, Int(1e6)) .- 0.5
        via_broadcast = test1(0.1, X)
        via_map = test2(0.1, X)
        mismatches += sum(via_broadcast .!== via_map)
    end
    return mismatches
end

main()

Which produced the error:

ERROR: LoadError: Failed to compile PTX code (ptxas exited with code 1)
Error in get_form_value default
If you think this is a bug, please file an issue and attach C:\Users\James\AppData\Local\Temp\jl_3S0cNQ349S.ptx
Stacktrace:
  [1] error(s::String)
    @ Base .\error.jl:33
  [2] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:399
  [3] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler C:\Users\James\.julia\packages\GPUCompiler\7APUC\src\cache.jl:89
  [4] cufunction(f::GPUArrays.var"#map_kernel#18"{Int64}, tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, var"#1#2"{Float64}, 
Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float64, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:297
  [5] cufunction
    @ C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:291 [inlined]
  [6] macro expansion
    @ C:\Users\James\.julia\packages\CUDA\O0mym\src\compiler\execution.jl:102 [inlined]
  [7] #launch_heuristic#234
    @ C:\Users\James\.julia\packages\CUDA\O0mym\src\gpuarrays.jl:17 [inlined]
  [8] map!(f::Function, dest::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, xs::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer})
    @ GPUArrays C:\Users\James\.julia\packages\GPUArrays\0vqbc\src\host\broadcast.jl:130
  [9] test2(u::Float64, X::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer})
    @ Main C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:13
 [10] main()
    @ Main C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:22
 [11] top-level scope
    @ C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:28
 [12] include(fname::String)
    @ Base.MainInclude .\client.jl:444
 [13] top-level scope
    @ REPL[7]:1
 [14] top-level scope
    @ C:\Users\James\.julia\packages\CUDA\O0mym\src\initialization.jl:52
in expression starting at C:\Users\James\.julia\dev\DiffusionSimulator\test\modrem.jl:28

The offending line doesn’t always trigger the error, so the outer loop helps force it to occur.

I’m running this from the REPL at debug level 2 (it also reproduces without check-bounds flags and via Pkg.test), on unmodified Julia 1.6.2 on both devices, with completely standard `add CUDA` installs.