CUDA Warning about freeing DeviceMemory

I’m using Flux.jl for machine learning. When I run training, I get this error (it appears to originate from code compiled by Zygote):

ERROR: LoadError: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
  [2] check
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuMemAllocFromPoolAsync
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
  [4] alloc(::Type{CUDA.DeviceMemory}, bytesize::Int64; async::Bool, stream::CUDA.CuStream, pool::CUDA.CuMemoryPool)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:71
  [5] alloc
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:61 [inlined]
  [6] (::CUDA.var"#_pool_alloc##0#_pool_alloc##1"{Int64, CUDA.CuMemoryPool, @NamedTuple{device::CUDA.CuDevice, context::CUDA.CuContext, stream::CUDA.CuStream, math_mode::CUDA.MathMode, math_precision::Symbol}})()
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:645
  [7] retry_reclaim
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:434 [inlined]
  [8] _pool_alloc
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:640 [inlined]
  [9] macro expansion
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:623 [inlined]
 [10] macro expansion
    @ ./timing.jl:461 [inlined]
 [11] pool_alloc
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:622 [inlined]
 [12] (::CUDA.var"#650#651"{CUDA.DeviceMemory, Int64})()
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/array.jl:92
 [13] cached_alloc(f::CUDA.var"#650#651"{CUDA.DeviceMemory, Int64}, key::Tuple{UnionAll, CUDA.CuDevice, DataType, Int64})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/alloc_cache.jl:36
 [14] CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}(::UndefInitializer, dims::Tuple{Int64, Int64})
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/array.jl:91
 [15] _pullback(::Zygote.Context{false}, ::typeof(match_constant_score), ::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, ::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory})
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:81
 [16] tactic_loss
    @ ~/data/script/gnn-tactic.jl:974 [inlined]
 [17] _pullback(::Zygote.Context{false}, ::typeof(tactic_loss), ::Model, ::@NamedTuple{common::@NamedTuple{hole::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, locus::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, literal::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, sort::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}}, kind::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, revert_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, apply_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, apply_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, motivated_apply_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, generalize_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, cases_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, induction_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, induction_recursor::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_pos::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}, rewrite_dir::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, generalize_at_pos::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}}, ::@NamedTuple{score::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}, graph::GNNHeteroGraph{Tuple{CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, Nothing}}, free_mask::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, target_mask::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, goal_mask::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, kind::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, revert_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, apply_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, apply_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, motivated_apply_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, generalize_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, cases_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, induction_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, induction_recursor::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_pos::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, rewrite_dir::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, generalize_at::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, generalize_at_pos::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}})
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:0
 [18] #190
    @ ~/data/script/gnn-tactic.jl:1382 [inlined]
 [19] _pullback(ctx::Zygote.Context{false}, f::var"#190#191"{@NamedTuple{score_mean::Vector{Float32}, score_std::Vector{Float32}, loss::Vector{Float32}, loss_tactic::Vector{Float32}, loss_fp::Vector{Float32}}}, args::Model)
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:0
 [20] pullback(f::Function, cx::Zygote.Context{false}, args::Model)
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:96
 [21] pullback
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:94 [inlined]
 [22] withgradient(f::Function, args::Model)
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:211
 [23] #withgradient#5
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Flux/src/gradient.jl:182 [inlined]
 [24] withgradient
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Flux/src/gradient.jl:169 [inlined]
...

This is followed by a very large number of warnings that look like this:

WARNING: Error while freeing DeviceMemory(1 byte at 0x0000000302ad3600):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))

Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
  [2] check
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuMemFreeAsync
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
  [4] #free#491
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:87 [inlined]
  [5] free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:82 [inlined]
  [6] #_pool_free##0
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:714 [inlined]
  [7] #context!#546
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:168 [inlined]
  [8] context!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:163 [inlined]
  [9] _pool_free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:713 [inlined]
 [10] macro expansion
    @ ./timing.jl:461 [inlined]
 [11] pool_free(managed::CUDA.Managed{CUDA.DeviceMemory})
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:695
 [12] release(::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:42
 [13] unsafe_free!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:100 [inlined]
 [14] unsafe_free!(x::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:115

This repeats hundreds of times with different byte sizes, but always with error code 0x2bc:

...
WARNING: Error while freeing DeviceMemory(480 bytes at 0x0000000302bcb800):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))

Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
  [2] check
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuMemFreeAsync
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
  [4] #free#491
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:87 [inlined]
  [5] free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:82 [inlined]
  [6] #_pool_free##0
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:714 [inlined]
  [7] #context!#546
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:168 [inlined]
  [8] context!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:163 [inlined]
  [9] _pool_free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:713 [inlined]
 [10] macro expansion
    @ ./timing.jl:461 [inlined]
 [11] pool_free(managed::CUDA.Managed{CUDA.DeviceMemory})
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:695
 [12] release(::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:42
 [13] unsafe_free!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:100 [inlined]
 [14] unsafe_free!(x::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:115

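For what it’s worth, 0x2bc is just the 700 from the first error written in hex; a quick REPL check (plain Julia, nothing CUDA-specific assumed):

julia> Int(0x2bc)
700

So every failed free appears to report the same ERROR_ILLEGAL_ADDRESS that the original allocation ran into.
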
Output of CUDA.versioninfo():

CUDA toolchain:
- runtime 12.6, artifact installation
- driver 580.105.8 for 13.1
- compiler 12.9

CUDA libraries:
- CUBLAS: 12.6.4
- CURAND: 10.3.7
- CUFFT: 11.3.0
- CUSOLVER: 11.7.1
- CUSPARSE: 12.5.4
- CUPTI: 2024.3.2 (API 12.6.0)
- NVML: 13.0.0+580.105.8

Julia packages:
- CUDA: 5.9.6
- GPUArrays: 11.3.3
- GPUCompiler: 1.7.5
- KernelAbstractions: 0.9.39
- CUDA_Driver_jll: 13.1.0+0
- CUDA_Compiler_jll: 0.4.1+0
- CUDA_Runtime_jll: 0.19.2+0

Toolchain:
- Julia: 1.12.1
- LLVM: 18.1.7

Preferences:
- CUDA_Runtime_jll.version: 12.6

1 device:
  0: Tesla T4 (sm_75, 14.560 GiB / 15.000 GiB available)

However, the same issue does not occur on smaller toy examples. What could be causing this problem?

The last line in my script before all the CUDA calls is an invocation of this function:

# Build CartesianIndex pairs (k, i[k]); max guards against zero entries.
to_ordered_index(i::AbstractArray) = CartesianIndex.(1:length(i), max.(1, i))
function match_score(a::AbstractArray, i::AbstractArray, target::AbstractArray)
    # No column has a target index: return a zero score on the GPU.
    iszero(i) && return zeros(eltype(a)) |> gpu

    mask = i .!= 0                                      # columns that do have a target
    j = Zygote.ignore(() -> to_ordered_index(i[mask]))  # build indices outside the AD graph
    return a[:, mask]'[j] - target[mask]                # a[i[n], n] - target[n] for each kept column n
end
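
A minimal CPU sketch of what the indexing does (shapes and values are made up, and only to_ordered_index from above is assumed to be defined):

a      = reshape(Float32.(1:15), 3, 5)   # 3 features × 5 columns
i      = [2, 0, 1, 3, 0]                 # 0 means "no target for this column"
target = Float32.(1:5)

mask = i .!= 0                           # keep columns 1, 3 and 4
j    = to_ordered_index(i[mask])         # [CartesianIndex(1,2), CartesianIndex(2,1), CartesianIndex(3,3)]
a[:, mask]'[j] - target[mask]            # a[i[n], n] - target[n] for each kept column n

In the real code the same thing runs on CuArrays, with Zygote.ignore keeping the index construction out of the pullback.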