I’m using Flux.jl for machine learning. When I run training, I get this error (it appears to originate from compiled code in Zygote):
ERROR: LoadError: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
[2] check
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
[3] cuMemAllocFromPoolAsync
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
[4] alloc(::Type{CUDA.DeviceMemory}, bytesize::Int64; async::Bool, stream::CUDA.CuStream, pool::CUDA.CuMemoryPool)
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:71
[5] alloc
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:61 [inlined]
[6] (::CUDA.var"#_pool_alloc##0#_pool_alloc##1"{Int64, CUDA.CuMemoryPool, @NamedTuple{device::CUDA.CuDevice, context::CUDA.CuContext, stream::CUDA.CuStream, math_mode::CUDA.MathMode, math_precision::Symbol}})()
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:645
[7] retry_reclaim
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:434 [inlined]
[8] _pool_alloc
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:640 [inlined]
[9] macro expansion
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:623 [inlined]
[10] macro expansion
@ ./timing.jl:461 [inlined]
[11] pool_alloc
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:622 [inlined]
[12] (::CUDA.var"#650#651"{CUDA.DeviceMemory, Int64})()
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/array.jl:92
[13] cached_alloc(f::CUDA.var"#650#651"{CUDA.DeviceMemory, Int64}, key::Tuple{UnionAll, CUDA.CuDevice, DataType, Int64})
@ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/alloc_cache.jl:36
[14] CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}(::UndefInitializer, dims::Tuple{Int64, Int64})
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/array.jl:91
[15] _pullback(::Zygote.Context{false}, ::typeof(match_constant_score), ::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, ::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory})
@ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:81
[16] tactic_loss
@ ~/data/script/gnn-tactic.jl:974 [inlined]
[17] _pullback(::Zygote.Context{false}, ::typeof(tactic_loss), ::Model, ::@NamedTuple{common::@NamedTuple{hole::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, locus::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, literal::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, sort::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}}, kind::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, revert_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, apply_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, apply_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, motivated_apply_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, generalize_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, cases_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, induction_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, induction_recursor::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_constant::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_free::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, rewrite_pos::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}, rewrite_dir::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory}, generalize_at_pos::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}}, ::@NamedTuple{score::CUDA.CuArray{Float32, 1, CUDA.DeviceMemory}, graph::GNNHeteroGraph{Tuple{CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, Nothing}}, free_mask::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, target_mask::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, goal_mask::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, kind::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, revert_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, apply_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, apply_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, motivated_apply_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, generalize_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, cases_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, induction_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, induction_recursor::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_constant::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_free::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, rewrite_pos::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, rewrite_dir::CUDA.CuArray{Int64, 1, CUDA.DeviceMemory}, generalize_at::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}, generalize_at_pos::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory}})
@ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:0
[18] #190
@ ~/data/script/gnn-tactic.jl:1382 [inlined]
[19] _pullback(ctx::Zygote.Context{false}, f::var"#190#191"{@NamedTuple{score_mean::Vector{Float32}, score_std::Vector{Float32}, loss::Vector{Float32}, loss_tactic::Vector{Float32}, loss_fp::Vector{Float32}}}, args::Model)
@ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface2.jl:0
[20] pullback(f::Function, cx::Zygote.Context{false}, args::Model)
@ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:96
[21] pullback
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:94 [inlined]
[22] withgradient(f::Function, args::Model)
@ Zygote ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Zygote/src/compiler/interface.jl:211
[23] #withgradient#5
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Flux/src/gradient.jl:182 [inlined]
[24] withgradient
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/Flux/src/gradient.jl:169 [inlined]
...
followed by a very large number of warnings that look like this one:
WARNING: Error while freeing DeviceMemory(1 byte at 0x0000000302ad3600):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
[2] check
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
[3] cuMemFreeAsync
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
[4] #free#491
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:87 [inlined]
[5] free
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:82 [inlined]
[6] #_pool_free##0
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:714 [inlined]
[7] #context!#546
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:168 [inlined]
[8] context!
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:163 [inlined]
[9] _pool_free
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:713 [inlined]
[10] macro expansion
@ ./timing.jl:461 [inlined]
[11] pool_free(managed::CUDA.Managed{CUDA.DeviceMemory})
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:695
[12] release(::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
@ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:42
[13] unsafe_free!
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:100 [inlined]
[14] unsafe_free!(x::CUDA.CuArray{Bool, 1, CUDA.DeviceMemory})
@ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:115
This warning repeated hundreds of times with different byte sizes, but always with error code 0x2bc (700 in decimal, i.e. the same illegal-address error):
...
WARNING: Error while freeing DeviceMemory(480 bytes at 0x0000000302bcb800):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:30
[2] check
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/libcuda.jl:37 [inlined]
[3] cuMemFreeAsync
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUToolbox/src/ccalls.jl:33 [inlined]
[4] #free#491
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:87 [inlined]
[5] free
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/memory.jl:82 [inlined]
[6] #_pool_free##0
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:714 [inlined]
[7] #context!#546
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:168 [inlined]
[8] context!
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/lib/cudadrv/state.jl:163 [inlined]
[9] _pool_free
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:713 [inlined]
[10] macro expansion
@ ./timing.jl:461 [inlined]
[11] pool_free(managed::CUDA.Managed{CUDA.DeviceMemory})
@ CUDA ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/CUDA/src/memory.jl:695
[12] release(::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
@ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:42
[13] unsafe_free!
@ ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:100 [inlined]
[14] unsafe_free!(x::CUDA.CuArray{Float32, 2, CUDA.DeviceMemory})
@ GPUArrays ~/nix/store/20dwbx3shyln9jqhf845yr3r88h5qxz7-trexp-cuda-load-path/GPUArrays/src/host/abstractarray.jl:115
CUDA.versioninfo():
CUDA toolchain:
- runtime 12.6, artifact installation
- driver 580.105.8 for 13.1
- compiler 12.9
CUDA libraries:
- CUBLAS: 12.6.4
- CURAND: 10.3.7
- CUFFT: 11.3.0
- CUSOLVER: 11.7.1
- CUSPARSE: 12.5.4
- CUPTI: 2024.3.2 (API 12.6.0)
- NVML: 13.0.0+580.105.8
Julia packages:
- CUDA: 5.9.6
- GPUArrays: 11.3.3
- GPUCompiler: 1.7.5
- KernelAbstractions: 0.9.39
- CUDA_Driver_jll: 13.1.0+0
- CUDA_Compiler_jll: 0.4.1+0
- CUDA_Runtime_jll: 0.19.2+0
Toolchain:
- Julia: 1.12.1
- LLVM: 18.1.7
Preferences:
- CUDA_Runtime_jll.version: 12.6
1 device:
0: Tesla T4 (sm_75, 14.560 GiB / 15.000 GiB available)
However, the same issue does not occur on smaller toy examples. What could be causing this problem?
The last line in my script before all the CUDA calls is an invocation of this function:
to_ordered_index(i::AbstractArray) = CartesianIndex.(1:length(i), max.(1, i))

function match_score(a::AbstractArray, i::AbstractArray, target::AbstractArray)
    # If every index is zero (nothing to match), return a zero array on the GPU.
    iszero(i) && return zeros(eltype(a)) |> gpu
    mask = i .!= 0
    # Build the gather indices outside of differentiation so Zygote does not trace them.
    j = Zygote.ignore(() -> to_ordered_index(i[mask]))
    return a[:, mask]'[j] - target[mask]
end
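For reference, this is what that indexing pattern computes on plain CPU arrays (a minimal sketch with made-up shapes and values, not my actual data):
a = rand(Float32, 4, 6)                               # 4 candidate scores per item × 6 items
i = [2, 0, 1, 4, 0, 3]                                # chosen score index per item, 0 = no match
target = rand(Float32, 6)                             # expected score per item
mask = i .!= 0                                        # keep only the matched items
j = CartesianIndex.(1:count(mask), max.(1, i[mask]))  # one (item, score) pair per kept item
a[:, mask]'[j] - target[mask]                         # gathered scores minus targets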