Hard error using dice loss

Hello,

Regardless of the model, the data, or any other condition, I have never been able to use the built-in Flux.dice_coeff_loss() function. It always ends in a very long error dump, apparently tied to CUDA and an illegal memory access.

For the error dump below, trimmed to roughly a tenth of the full output, my model is a simple CNN, the labels are binary, and the predictions are squashed into (0, 1) by a sigmoid.
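
For reference, here is a hypothetical minimal sketch of the kind of setup that triggers it for me; the layer sizes and names are illustrative, not my exact code:

using Flux, CUDA

# tiny CNN with a sigmoid head, so predictions live in (0, 1)
model = Chain(
    Conv((3, 3), 1 => 8, relu; pad=1),
    Conv((3, 3), 8 => 1, sigmoid; pad=1),
) |> gpu

X = rand(Float32, 32, 32, 1, 4) |> gpu   # dummy image batch (WHCN)
y = rand(Bool, 32, 32, 1, 4) |> gpu      # binary masks as labels

# the forward pass runs fine; taking the gradient is what blows up
loss(m) = Flux.dice_coeff_loss(m(X), y)
Flux.withgradient(loss, model)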

Any advice is greatly appreciated. Thanks.

ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
ERROR: LoadError: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/.julia/packages/CUDA/htRwP/lib/cudadrv/libcuda.jl:27
[2] nonblocking_synchronize(val::CuContext)
@ CUDA ~/.julia/packages/CUDA/htRwP/lib/cudadrv/synchronization.jl:163
[3] device_synchronize(; blocking::Bool, spin::Bool)
@ CUDA ~/.julia/packages/CUDA/htRwP/lib/cudadrv/synchronization.jl:174
[4] device_synchronize
@ ~/.julia/packages/CUDA/htRwP/lib/cudadrv/synchronization.jl:169 [inlined]
[5] CuModule(data::Vector{UInt8}, options::Dict{CUDA.CUjit_option_enum, Any})
@ CUDA ~/.julia/packages/CUDA/htRwP/lib/cudadrv/module.jl:40
[6] CuModule
@ CUDA ~/.julia/packages/CUDA/htRwP/lib/cudadrv/module.jl:23 [inlined]
[7] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String, external_gvars::Vector{String}})
@ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/compilation.jl:409
[8] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:132
[9] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:103
[10] macro expansion
@ ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:367 [inlined]
[11] macro expansion
@ ./lock.jl:267 [inlined]
[12] cufunction(f::GPUArrays.var"#broadcast_kernel#38", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceArray{Int64, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, NTuple{4, Base.OneTo{Int64}}, typeof(), Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, Nothing, typeof(), Tuple{Base.Broadcast.Extruded{CuDeviceArray{Bool, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, Nothing, typeof(conj), Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, Nothing, typeof(^), Tuple{Base.Broadcast.Extruded{CuDeviceArray{Bool, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Int64}}}}}}, Int64}}; kwargs::@Kwargs{})
@ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:362
[13] cufunction(f::GPUArrays.var"#broadcast_kernel#38", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceArray{Int64, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, NTuple{4, Base.OneTo{Int64}}, typeof(), Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, Nothing, typeof(), Tuple{Base.Broadcast.Extruded{CuDeviceArray{Bool, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, Nothing, typeof(conj), Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4, CUDA.Mem.DeviceBuffer}, Nothing, typeof(^), Tuple{Base.Broadcast.Extruded{CuDeviceArray{Bool, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Int64}}}}}}, Int64}})
@ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:359
[14] macro expansion
@ ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:112 [inlined]
[15] #launch_heuristic#1122
@ ~/.julia/packages/CUDA/htRwP/src/gpuarrays.jl:17 [inlined]
[16] launch_heuristic
@ ~/.julia/packages/CUDA/htRwP/src/gpuarrays.jl:15 [inlined]
[17] _copyto!
@ ~/.julia/packages/GPUArrays/Hd5Sk/src/host/broadcast.jl:56 [inlined]
[18] copyto!
@ ~/.julia/packages/GPUArrays/Hd5Sk/src/host/broadcast.jl:37 [inlined]
[19] copy
@ ~/.julia/packages/GPUArrays/Hd5Sk/src/host/broadcast.jl:28 [inlined]
[20] materialize
@ ./broadcast.jl:903 [inlined]
[21] (::Zygote.var"#1237#1240"{2, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}})(ȳ::CuArray{Bool, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/jxHJc/src/lib/broadcast.jl:108
[22] #3876#back
@ ~/.julia/packages/ZygoteRules/M4xmc/src/adjoint.jl:72 [inlined]
[23] #dice_coeff_loss#26
@ ~/.julia/packages/Flux/ljuc2/src/losses/functions.jl:519 [inlined]
[24] (::Zygote.Pullback{Tuple{Flux.Losses.var"##dice_coeff_loss#26", Int64, typeof(Flux.Losses.dice_coeff_loss), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#/_pullback#1317"{Float32, Float32, ChainRulesCore.ProjectTo{Float32, @NamedTuple{}}, ChainRulesCore.ProjectTo{Float32, @NamedTuple{}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1316"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, @NamedTuple{}}, ChainRulesCore.ProjectTo{Float64, @NamedTuple{}}}}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.var"#4191#back#1443"{Zygote.var"#1439#1442"{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#4191#back#1443"{Zygote.var"#1439#1442"{CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#3876#back#1241"{Zygote.var"#1237#1240"{2, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.ZBack{Flux.Losses.var"#_check_sizes_pullback#12"}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Float32, Float32}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1327"{Int64, Float32}}, Zygote.var"#1922#back#161"{Zygote.var"#157#160"}, Zygote.var"#4191#back#1443"{Zygote.var"#1439#1442"{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#3796#back#1207"{Zygote.var"#1203#1206"{CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Float32, Float32}}}, Zygote.Pullback{Tuple{typeof(Flux.ofeltype), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Int64}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.Pullback{Tuple{typeof(eltype), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eltype_pullback#385"}, Zygote.ZBack{ChainRules.var"#typeof_pullback#45"}}}, Zygote.Pullback{Tuple{typeof(float), Type{Float32}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.var"#1922#back#161"{Zygote.var"#157#160"}, Zygote.var"#3876#back#1241"{Zygote.var"#1237#1240"{2, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}}}})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/jxHJc/src/compiler/interface2.jl:0
[25] dice_coeff_loss
@ ~/.julia/packages/Flux/ljuc2/src/losses/functions.jl:515 [inlined]
[26] (::Zygote.Pullback{Tuple{typeof(Core.kwcall), @NamedTuple{smooth::Int64}, typeof(Flux.Losses.dice_coeff_loss), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Any})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/jxHJc/src/compiler/interface2.jl:0
[27] lossFunction
@ ~/projects/pascalvoc-segmentation/unet2dice.jl:155 [inlined]
[28] (::Zygote.Pullback{Tuple{typeof(lossFunction), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#2013#back#204"{typeof(identity)}, Zygote.Pullback{Tuple{typeof(Core.kwcall), @NamedTuple{smooth::Int64}, typeof(Flux.Losses.dice_coeff_loss), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Any}, Zygote.Pullback{Tuple{Type{NamedTuple{(:smooth,)}}, Tuple{Int64}}, Tuple{Zygote.var"#2220#back#315"{Zygote.Jnew{@NamedTuple{smooth::Int64}, Nothing, true}}}}}})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/jxHJc/src/compiler/interface2.jl:0
[29] #24
@ ~/.julia/packages/LibML/VPz3X/src/training.jl:6 [inlined]
[30] (::Zygote.Pullback{Tuple{LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, UNet2}, Tuple{Zygote.var"#2180#back#303"{Zygote.var"#back#302"{:X, Zygote.Context{false}, LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(lossFunction), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#2013#back#204"{typeof(identity)}, Zygote.Pullback{Tuple{typeof(Core.kwcall), @NamedTuple{smooth::Int64}, typeof(Flux.Losses.dice_coeff_loss), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Any}, Zygote.Pullback{Tuple{Type{NamedTuple{(:smooth,)}}, Tuple{Int64}}, Tuple{Zygote.var"#2220#back#315"{Zygote.Jnew{@NamedTuple{smooth::Int64}, Nothing, true}}}}}}, Zygote.Pullback{Tuple{UNet2, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, Any}, Zygote.var"#2180#back#303"{Zygote.var"#back#302"{:y, Zygote.Context{false}, LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2180#back#303"{Zygote.var"#back#302"{:lossfn, Zygote.Context{false}, LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, typeof(lossFunction)}}}})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/jxHJc/src/compiler/interface2.jl:0
[31] (::Zygote.var"#75#76"{Zygote.Pullback{Tuple{LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, UNet2}, Tuple{Zygote.var"#2180#back#303"{Zygote.var"#back#302"{:X, Zygote.Context{false}, LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(lossFunction), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#2013#back#204"{typeof(identity)}, Zygote.Pullback{Tuple{typeof(Core.kwcall), @NamedTuple{smooth::Int64}, typeof(Flux.Losses.dice_coeff_loss), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}, Any}, Zygote.Pullback{Tuple{Type{NamedTuple{(:smooth,)}}, Tuple{Int64}}, Tuple{Zygote.var"#2220#back#315"{Zygote.Jnew{@NamedTuple{smooth::Int64}, Nothing, true}}}}}}, Zygote.Pullback{Tuple{UNet2, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, Any}, Zygote.var"#2180#back#303"{Zygote.var"#back#302"{:y, Zygote.Context{false}, LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2180#back#303"{Zygote.var"#back#302"{:lossfn, Zygote.Context{false}, LibML.var"#24#25"{typeof(lossFunction), CuArray{Bool, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}}, typeof(lossFunction)}}}}})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/jxHJc/src/compiler/interface.jl:91
[32] withgradient(f::Function, args::UNet2)
@ Zygote ~/.julia/packages/Zygote/jxHJc/src/compiler/interface.jl:213
[33] trainModel!(model::UNet2, data::MLUtils.DataLoader{MLUtils.MappedData{:auto, typeof(gpu), Tuple{Array{Float32, 4}, Array{Bool, 4}}}, Random._GLOBAL_RNG, Val{nothing}}, optstate::@NamedTuple{enc::@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}}}, @NamedTuple{layers::Tuple{Tuple{}, @NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}}}}}, dec::@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, 
Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::NTuple{4, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}}}, @NamedTuple{p::Tuple{}, dims::Tuple{}, active::Tuple{}, rng::Tuple{}}}}, @NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::NTuple{4, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, Tuple{}}}, verbose::Tuple{}}, lossfn::typeof(lossFunction); verbose::Bool)
@ LibML ~/.julia/packages/LibML/VPz3X/src/training.jl:5
[34] trainModel!(model::UNet2, data::MLUtils.DataLoader{MLUtils.MappedData{:auto, typeof(gpu), Tuple{Array{Float32, 4}, Array{Bool, 4}}}, Random._GLOBAL_RNG, Val{nothing}}, optstate::@NamedTuple{enc::@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}}}, @NamedTuple{layers::Tuple{Tuple{}, @NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}}}}}, dec::@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, 
Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::NTuple{4, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}}}, @NamedTuple{p::Tuple{}, dims::Tuple{}, active::Tuple{}, rng::Tuple{}}}}, @NamedTuple{layers::Tuple{@NamedTuple{layers::Tuple{@NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::Tuple{Tuple{}, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, @NamedTuple{λ::Tuple{}, β::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, γ::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, μ::Tuple{}, σ²::Tuple{}, ϵ::Tuple{}, momentum::Tuple{}, affine::Tuple{}, track_stats::Tuple{}, active::Tuple{}, chs::Tuple{}}, @NamedTuple{σ::Tuple{}, weight::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Optimisers.Adam, Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}}}, stride::Tuple{Tuple{}, Tuple{}}, pad::NTuple{4, Tuple{}}, dilation::Tuple{Tuple{}, Tuple{}}, groups::Tuple{}}}}, Tuple{}}}, verbose::Tuple{}}, lossfn::Function)

Any thoughts…?

This looks like a bug; please open an issue on Flux.jl (I guess nobody saw it here, sorry). I think I can reproduce it:

julia> using Flux, CUDA

julia> let x = randn(3,5) |> cu
           y = Flux.onehotbatch("abcab", 'a':'c') |> cu
           Flux.dice_coeff_loss(x, y)  # works forward
       end
1.1841338f0

julia> let x = randn(3,5) |> cu
           y = Flux.onehotbatch("abcab", 'a':'c') |> cu
           gradient(Flux.mse, x, y)  # some gradients work
       end
(Float32[-0.16939788 -0.19461282 … -0.30000073 -0.017194644; 0.07464689 -0.15628384 … -0.17090265 -0.007114268; -0.22359066 -0.06903434 … 0.1566836 -0.022250716], nothing)

julia> let x = randn(3,5) |> cu
           y = Flux.onehotbatch("abcab", 'a':'c') |> cu
           gradient(Flux.dice_coeff_loss, x, y)
       end
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
...
ERROR: KernelException: exception thrown during kernel execution on device Tesla V100-PCIE-16GB
Stacktrace:
  [1] check_exceptions()
    @ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/exceptions.jl:34
  [2] device_synchronize(; blocking::Bool, spin::Bool)
    @ CUDA ~/.julia/packages/CUDA/htRwP/lib/cudadrv/synchronization.jl:180

(@v1.10) pkg> st Flux CUDA
Status `~/.julia/environments/v1.10/Project.toml`
  [052768ef] CUDA v5.2.0
  [587475ba] Flux v0.14.11
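
Until that's fixed, a possible workaround, assuming the Bool label eltype is what trips the broken y .^ 2 pullback visible in the trace (the loss is roughly 1 - (2*sum(ŷ .* y) + smooth) / (sum(ŷ .^ 2) + sum(y .^ 2) + smooth)), would be to convert the labels to Float32 before the loss. Untested sketch:

julia> let x = randn(3,5) |> cu
           y = Flux.onehotbatch("abcab", 'a':'c') |> cu
           # Float32 labels should take the ordinary float broadcast path
           gradient(Flux.dice_coeff_loss, x, Float32.(y))
       end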