CUDA Warning about freeing DeviceMemory

I’m using Flux.jl for machine learning. When I run training, I get this error (it appears to originate from code compiled by Zygote):

ERROR: LoadError: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
  [2] check
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuMemAllocFromPoolAsync
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
  [4] alloc(::Type{CUDA.DeviceMemory}, bytesize::Int64; async::Bool, stream::CUDA.CuStream, pool::CUDA.CuMemoryPool)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:71
  [5] alloc
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:61 [inlined]
  [6] (::CUDA.var"#_pool_alloc##0#_pool_alloc##1"{Int64, CUDA.CuMemoryPool, @NamedTuple{device::CUDA.CuDevice, context::CUDA.CuContext, stream::CUDA.CuStream, math_mode::CUDA.MathMode, math_precision::Symbol}})()
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:645
  [7] retry_reclaim
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:434 [inlined]
  [8] _pool_alloc
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:640 [inlined]
  [9] macro expansion
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:623 [inlined]
 [10] macro expansion
    @ ./timing.jl:461 [inlined]
 [11] pool_alloc
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:622 [inlined]
 [12] (::CUDA.var"#650#651"{CUDA.DeviceMemory, Int64})()
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/array.jl:92
 [13] cached_alloc(f::CUDA.var"#650#651"{CUDA.DeviceMemory, Int64}, key::Tuple{UnionAll, CUDA.CuDevice, DataType, Int64})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/alloc_cache.jl:36
 [14] CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}(::UndefInitializer, dims::Tuple{Int64, Int64})
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/array.jl:91
 [15] _pullback(::Zygote.Context{false}, ::typeof(match_constant_score), ::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, ::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory})
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:81
 [16] tactic_loss
    @ ~/data/script/gnn-tactic.jl:974 [inlined]
 [17] _pullback(::Zygote.Context{false}, ::typeof(tactic_loss), ::Model, ::@NamedTuple{common::@NamedTuple{hole::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, locus::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, literal::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, sort::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}}, kind::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, revert_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, apply_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, apply_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, motivated_apply_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, generalize_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, cases_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, induction_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, induction_recursor::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_pos::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}, rewrite_dir::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, generalize_at_pos::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}}, ::@NamedTuple{score::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}, graph::GNNHeteroGraph{Tuple{CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, Nothing}}, free_mask::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, target_mask::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, goal_mask::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, kind::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, revert_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, apply_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, apply_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, motivated_apply_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, generalize_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, cases_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, induction_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, induction_recursor::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_pos::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, rewrite_dir::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, generalize_at::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, generalize_at_pos::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}})
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:0
 [18] #190
    @ ~/data/script/gnn-tactic.jl:1382 [inlined]
 [19] _pullback(ctx::Zygote.Context{false}, f::var"#190#191"{@NamedTuple{score_mean::Vector{Float32}, score_std::Vector{Float32}, loss::Vector{Float32}, loss_tactic::Vector{Float32}, loss_fp::Vector{Float32}}}, args::Model)
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:0
 [20] pullback(f::Function, cx::Zygote.Context{false}, args::Model)
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:96
 [21] pullback
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:94 [inlined]
 [22] withgradient(f::Function, args::Model)
    @ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:211
 [23] #withgradient#5
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Flux/src/gradient.jl:182 [inlined]
 [24] withgradient
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Flux/src/gradient.jl:169 [inlined]
...

This is followed by a very large number of warnings that look like this:

WARNING: Error while freeing DeviceMemory(1 byte at 0x0000000302ad3600):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))

Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
  [2] check
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuMemFreeAsync
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
  [4] #free#491
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:87 [inlined]
  [5] free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:82 [inlined]
  [6] #_pool_free##0
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:714 [inlined]
  [7] #context!#546
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:168 [inlined]
  [8] context!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:163 [inlined]
  [9] _pool_free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:713 [inlined]
 [10] macro expansion
    @ ./timing.jl:461 [inlined]
 [11] pool_free(managed::CUDA.Managed{CUDA.DeviceMemory})
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:695
 [12] release(::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:42
 [13] unsafe_free!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:100 [inlined]
 [14] unsafe_free!(x::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:115

This repeats hundreds of times with different byte sizes, but always with error code 0x2bc:

...
WARNING: Error while freeing DeviceMemory(480 bytes at 0x0000000302bcb800):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))

Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
  [2] check
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuMemFreeAsync
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
  [4] #free#491
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:87 [inlined]
  [5] free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:82 [inlined]
  [6] #_pool_free##0
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:714 [inlined]
  [7] #context!#546
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:168 [inlined]
  [8] context!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:163 [inlined]
  [9] _pool_free
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:713 [inlined]
 [10] macro expansion
    @ ./timing.jl:461 [inlined]
 [11] pool_free(managed::CUDA.Managed{CUDA.DeviceMemory})
    @ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:695
 [12] release(::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:42
 [13] unsafe_free!
    @ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:100 [inlined]
 [14] unsafe_free!(x::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory})
    @ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:115

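For what it’s worth, 0x2bc is just the 700 from the first error written in hex; a quick REPL check (plain Julia, nothing CUDA-specific assumed):

julia> Int(0x2bc)
700

So every failed free appears to report the same ERROR_ILLEGAL_ADDRESS that the original allocation ran into.
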
Output of CUDA.versioninfo():

CUDA toolchain:
- runtime 12.6, artifact installation
- driver 580.105.8 for 13.1
- compiler 12.9

CUDA libraries:
- CUBLAS: 12.6.4
- CURAND: 10.3.7
- CUFFT: 11.3.0
- CUSOLVER: 11.7.1
- CUSPARSE: 12.5.4
- CUPTI: 2024.3.2 (API 12.6.0)
- NVML: 13.0.0+580.105.8

Julia packages:
- CUDA: 5.9.6
- GPUArrays: 11.3.3
- GPUCompiler: 1.7.5
- KernelAbstractions: 0.9.39
- CUDA_Driver_jll: 13.1.0+0
- CUDA_Compiler_jll: 0.4.1+0
- CUDA_Runtime_jll: 0.19.2+0

Toolchain:
- Julia: 1.12.1
- LLVM: 18.1.7

Preferences:
- CUDA_Runtime_jll.version: 12.6

1 device:
  0: Tesla T4 (sm_75, 14.560 GiB / 15.000 GiB available)

However, the same issue does not occur on smaller toy examples. What could be causing this problem?

The last line in my script before all the CUDA calls is an invocation of this function:

# Build CartesianIndex pairs (k, i[k]); max guards against zero entries.
to_ordered_index(i::AbstractArray) = CartesianIndex.(1:length(i), max.(1, i))
function match_score(a::AbstractArray, i::AbstractArray, target::AbstractArray)
    # No column has a target index: return a zero score on the GPU.
    iszero(i) && return zeros(eltype(a)) |> gpu

    mask = i .!= 0                                      # columns that do have a target
    j = Zygote.ignore(() -> to_ordered_index(i[mask]))  # build indices outside the AD graph
    return a[:, mask]'[j] - target[mask]                # a[i[n], n] - target[n] for each kept column n
end
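
A minimal CPU sketch of what the indexing does (shapes and values are made up, and only to_ordered_index from above is assumed to be defined):

a      = reshape(Float32.(1:15), 3, 5)   # 3 features × 5 columns
i      = [2, 0, 1, 3, 0]                 # 0 means "no target for this column"
target = Float32.(1:5)

mask = i .!= 0                           # keep columns 1, 3 and 4
j    = to_ordered_index(i[mask])         # [CartesianIndex(1,2), CartesianIndex(2,1), CartesianIndex(3,3)]
a[:, mask]'[j] - target[mask]            # a[i[n], n] - target[n] for each kept column n

In the real code the same thing runs on CuArrays, with Zygote.ignore keeping the index construction out of the pullback.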