Null gradients with GANs

While trying to replicate a paper that uses GANs with unconventional loss calculations, I'm running into null gradients. The basic steps are the following:

  1. genOutput = gen(genInput)
  2. MSE(genOutput, label)
  3. MAE(genOutput |> volFrac, label |> volFrac)
  4. discOutFake = cat( genInput, condition, genOutput ) |> disc
  5. genLoss = logit_binary_crossentropy(discOutFake, ones) + 1e4 * MSE + MAE
  6. discOutReal = cat( genInput, condition, label ) |> disc
  7. discLoss = logit_binary_crossentropy(discOutReal, ones) + logit_binary_crossentropy(discOutFake, zeros)

And the MWE:

using Statistics, Flux, MLUtils, Zygote, LinearAlgebra

const genInput_ = rand(Float32, (10, 10, 3, 5))
const condition_ = rand(Float32, (10, 10, 3, 5))
const label_ = rand(Float32, (10, 10, 1, 5))

function models()
    gen = Chain(Conv((5, 5), 3 => 1, pad = SamePad()),
        ConvTranspose((5, 5), 1 => 1, pad = SamePad()),
    )
    disc = Chain(Conv((5, 5), 7 => 1, pad = SamePad()), flatten,
        Dense(100 => 1, leakyrelu)
    )
    return gen |> gpu, disc |> gpu
end

volFrac(x) = [mean(x[:, :, :, sample]) for sample in axes(x, 4)] # per-sample mean ("volume fraction")
reshapeDiscOut(x) = dropdims(x |> transpose |> Array; dims = 2) # 1×N discriminator output -> length-N vector

function GANgradsMWE(gen, disc, genInput, condition, label)
  discOutFake, discInputFake = 0.0, 0.0 # initialize for scope purposes
  function genLoss(genOutput) # generator loss. Defined here for scope purposes
    mse = (genOutput .- label) .^ 2 |> mean
    absError = abs.(volFrac(genOutput) .- volFrac(label)) |> mean
    discInputFake = cat(genInput, condition, genOutput; dims = 3) |> gpu
    discOutFake = discInputFake |> disc |> cpu |> reshapeDiscOut
    return Flux.Losses.logitbinarycrossentropy(
      discOutFake, ones(size(discOutFake))
    ) + 10_000 * mse + 1 * absError
  end
  function discLoss(discOutReal, discOutFake) # discriminator loss
    return Flux.Losses.logitbinarycrossentropy(
      discOutReal, ones(discOutReal |> size)
    ) + Flux.Losses.logitbinarycrossentropy(
      discOutFake, zeros(discOutFake |> size)
    )
  end
  genInputGPU = genInput |> gpu
  discInputReal = cat(genInput, condition, label; dims = 3) |> gpu
  genLossVal_, genGrads_ = withgradient(
    gen -> genLoss(gen(genInputGPU) |> cpu), gen
  )
  discLossVal_, discGrads_ = withgradient(
    disc -> discLoss(
        disc(discInputReal) |> cpu |> reshapeDiscOut,
        disc(discInputFake) |> cpu |> reshapeDiscOut
    ),
    disc
  )
  return genGrads_, genLossVal_, discGrads_, discLossVal_
end
genGrads, genLossVal, discGrads, discLossVal = GANgradsMWE(
    models()..., genInput_, condition_, label_
)
@show norm(genGrads); @show norm(discGrads);

And the stacktrace:

ERROR: MethodError: no method matching iterate(::Nothing)
Closest candidates are:
  iterate(::Union{LinRange, StepRangeLen}) at range.jl:872
  iterate(::Union{LinRange, StepRangeLen}, ::Integer) at range.jl:872
  iterate(::T) where T<:Union{Base.KeySet{<:Any, <:Dict}, Base.ValueIterator{<:Dict}} at dict.jl:712
  ...
Stacktrace:
  [1] isempty(itr::Nothing)
    @ Base .\essentials.jl:788
  [2] norm(itr::Nothing, p::Int64) (repeats 2 times)
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:591
  [3] (::Base.MappingRF{typeof(norm), Base.BottomRF{typeof(max)}})(acc::Base._InitialValue, x::Nothing)
    @ Base .\reduce.jl:95
  [4] _foldl_impl(op::Base.MappingRF{typeof(norm), Base.BottomRF{typeof(max)}}, init::Base._InitialValue, itr::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ Base .\reduce.jl:58
  [5] foldl_impl(op::Base.MappingRF{typeof(norm), Base.BottomRF{typeof(max)}}, nt::Base._InitialValue, itr::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ Base .\reduce.jl:48
  [6] mapfoldl_impl(f::typeof(norm), op::typeof(max), nt::Base._InitialValue, itr::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ Base .\reduce.jl:44
  [7] mapfoldl(f::Function, op::Function, itr::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}; init::Base._InitialValue)
    @ Base .\reduce.jl:162
  [8] mapfoldl
    @ .\reduce.jl:162 [inlined]
  [9] #mapreduce#262
    @ .\reduce.jl:294 [inlined]
 [10] mapreduce(f::Function, op::Function, itr::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ Base .\reduce.jl:294
 [11] generic_normInf(x::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:453
 [12] normInf(x::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:522
 [13] generic_norm2(x::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:463
 [14] norm2(x::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:524
 [15] norm(itr::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, p::Int64)
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:593
 [16] norm(itr::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:591
 [17] MappingRF
    @ .\reduce.jl:95 [inlined]
 [18] afoldl(::Base.MappingRF{typeof(norm), Base.BottomRF{typeof(max)}}, ::Base._InitialValue, ::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, ::NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}})
    @ Base .\operators.jl:548
 [19] _foldl_impl(op::Base.MappingRF{typeof(norm), Base.BottomRF{typeof(max)}}, init::Base._InitialValue, itr::Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}})
    @ Base .\tuple.jl:277
 [20] foldl_impl(op::Base.MappingRF{typeof(norm), Base.BottomRF{typeof(max)}}, nt::Base._InitialValue, itr::Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}})
    @ Base .\reduce.jl:48
 [21] mapfoldl_impl(f::typeof(norm), op::typeof(max), nt::Base._InitialValue, itr::Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}})
    @ Base .\reduce.jl:44
 [22] mapfoldl(f::Function, op::Function, itr::Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}; init::Base._InitialValue)
    @ Base .\reduce.jl:162
 [23] mapfoldl
    @ .\reduce.jl:162 [inlined]
 [24] mapreduce(f::Function, op::Function, itr::Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}; kw::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Base .\reduce.jl:294
 [25] mapreduce
    @ .\reduce.jl:294 [inlined]
 [26] generic_normInf(x::Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:453
 [27] normInf
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:522 [inlined]
 [28] generic_norm2(x::Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:463
 [29] norm2
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:524 [inlined]
 [30] norm
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:593 [inlined]
 [31] norm
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:591 [inlined]
 [32] MappingRF
    @ .\reduce.jl:95 [inlined]
 [33] _foldl_impl
    @ .\reduce.jl:58 [inlined]
 [34] foldl_impl
    @ .\reduce.jl:48 [inlined]
 [35] mapfoldl_impl(f::typeof(norm), op::typeof(max), nt::Base._InitialValue, itr::NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}})
    @ Base .\reduce.jl:44
 [36] mapfoldl(f::Function, op::Function, itr::NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}; init::Base._InitialValue)
    @ Base .\reduce.jl:162
 [37] mapfoldl
    @ .\reduce.jl:162 [inlined]
 [38] mapreduce(f::Function, op::Function, itr::NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}; kw::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Base .\reduce.jl:294
 [39] mapreduce
    @ .\reduce.jl:294 [inlined]
 [40] generic_normInf(x::NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:453
 [41] normInf
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:522 [inlined]
 [42] generic_norm2(x::NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:463
 [43] norm2
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:524 [inlined]
 [44] norm(itr::NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}, p::Int64)
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:593
 [45] norm
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:591 [inlined]
 [46] MappingRF
    @ .\reduce.jl:95 [inlined]
 [47] afoldl
    @ .\operators.jl:548 [inlined]
 [48] _foldl_impl
    @ .\tuple.jl:277 [inlined]
 [49] foldl_impl
    @ .\reduce.jl:48 [inlined]
 [50] mapfoldl_impl(f::typeof(norm), op::typeof(max), nt::Base._InitialValue, itr::Tuple{NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}})
    @ Base .\reduce.jl:44
 [51] mapfoldl(f::Function, op::Function, itr::Tuple{NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}}; init::Base._InitialValue)
    @ Base .\reduce.jl:162
 [52] mapfoldl
    @ .\reduce.jl:162 [inlined]
 [53] mapreduce(f::Function, op::Function, itr::Tuple{NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}}; kw::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Base .\reduce.jl:294
 [54] mapreduce
    @ .\reduce.jl:294 [inlined]
 [55] generic_normInf(x::Tuple{NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:453
 [56] normInf
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:522 [inlined]
 [57] generic_norm2(x::Tuple{NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:463
 [58] norm2
    @ C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:524 [inlined]
 [59] norm(itr::Tuple{NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}}, p::Int64)
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:593
 [60] norm(itr::Tuple{NamedTuple{(:layers,), Tuple{Tuple{NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}, NamedTuple{(:σ, :weight, :bias, :stride, :pad, :dilation, :groups), Tuple{Nothing, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Nothing, Nothing, Nothing, Nothing}}}}}})
    @ LinearAlgebra C:\Users\LucasKaoid\AppData\Local\Programs\Julia-1.8.0\share\julia\stdlib\v1.8\LinearAlgebra\src\generic.jl:591

From this preview docs page, I understand that the basic rule for correct (explicit) gradients in Flux is that both the loss calculation and the model execution must happen inside the gradient call. I have already tried a handful of versions of these loss calculations, and the gradients never come out right. Even when training appears to work, closer inspection always reveals that some intermediate values are being ignored in the gradient calculation. How should I implement this in Julia?
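For reference, here is a minimal sketch of the pattern I understand the docs to require; the model, data, and loss here are placeholders, not the setup above:

using Flux, Zygote

x = rand(Float32, 10, 5) # dummy input: 10 features, 5 samples
y = rand(Float32, 1, 5) # dummy target
model = Dense(10 => 1)
# Both the model execution and the loss calculation happen inside the
# closure passed to withgradient, so every intermediate value is tracked.
lossVal, grads = withgradient(m -> Flux.Losses.mse(m(x), y), model)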

The gradients aren’t null. LinearAlgebra.norm() can’t take nothing as an argument, and Flux returns nothing as the gradient of non-trainable fields (σ, stride, pad, dilation, and groups in the NamedTuples above). After defining LinearAlgebra.norm(::Nothing, p::Real = 2) = false, the norms of the gradients are calculated as expected.
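A minimal sketch of that workaround, run after the MWE above:

using LinearAlgebra

# Treat absent (nothing) gradients as zero so norm can recurse through
# the gradient NamedTuples; false acts as a zero here.
LinearAlgebra.norm(::Nothing, p::Real = 2) = false

@show norm(genGrads);
@show norm(discGrads);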