Flux.jl training is not reducing the loss

using Flux, LinearAlgebra
function MWE()
    η = 0.01
    model = Conv((64, 64), 1 => 1, relu, stride=30)
    loss(x, y) = Flux.mse(model(x), y)
    dataset = [(rand(100, 100, 1, 1), rand(2, 2, 1, 1))]
    @show model(dataset[1][1])
    @show loss(dataset[1]...)
    for i in 1:10
        Flux.train!(loss, params(model), dataset, ADAM(η))
        @show i, loss(dataset[1]...)
    end
end

MWE();
model((dataset[1])[1]) = [0.0 2.86116; 0.0 0.0] (tracked)
loss(dataset[1]...) = 1.4128901344841402 (tracked)
(i, loss(dataset[1]...)) = (1, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (2, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (3, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (4, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (5, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (6, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (7, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (8, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (9, 0.37165153102154674 (tracked))
(i, loss(dataset[1]...)) = (10, 0.37165153102154674 (tracked))

For some weird reason I'm not able to fix this behavior. Is there a remedy?

Why do you allocate a new ADAM optimizer on every loop iteration? Try pulling it out of the loop (so you allocate it once), although I doubt that's the root of the issue. Everything else looks correct to me.


Same outcome with the optimizer pulled out of the loop:

using Flux, LinearAlgebra
function MWE()
    η = 0.01
    model = Conv((64, 64), 1 => 1, relu, stride=30)
    loss(x, y) = Flux.mse(model(x), y)
    dataset = [(rand(100, 100, 1, 1), rand(2, 2, 1, 1))]
    @show model(dataset[1][1])
    @show loss(dataset[1]...)
    opt = ADAM(η)
    for i in 1:10
        Flux.train!(loss, params(model), dataset, opt)
        @show i, loss(dataset[1]...)
    end
end

MWE();
model((dataset[1])[1]) = [0.0 0.0; 1.46359 0.0] (tracked)
loss(dataset[1]...) = 0.2001447729236289 (tracked)
(i, loss(dataset[1]...)) = (1, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (2, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (3, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (4, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (5, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (6, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (7, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (8, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (9, 0.29602579662457984 (tracked))
(i, loss(dataset[1]...)) = (10, 0.29602579662457984 (tracked))

I think this is simply because you're using relu at the final output with only a single layer: whenever the convolution's output goes negative, relu clamps it to zero, and its derivative there is also zero, so no gradient flows back and the parameters never update.
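
To make that concrete, here is a tiny plain-Julia sketch (relu and its derivative written out by hand, purely for illustration, not Flux internals):

myrelu(x) = max(zero(x), x)           # relu: negative inputs are clamped to zero
drelu(x)  = x > 0 ? one(x) : zero(x)  # its derivative: exactly zero on the clamped side

drelu(-2.0)  # 0.0 — a negative pre-activation passes no gradient back
drelu(2.0)   # 1.0 — only positive outputs let the gradient through

With a single Conv layer followed by relu there is nothing downstream to re-route the signal, so any output that gets clamped stays "dead" for the rest of training.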


Thank you! Unfortunately I wasn't familiar with what ReLU actually does; I thought it was a linear map.
Now it works.
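
For reference, a sketch of what the fix might look like (the same MWE with the relu dropped, so Conv's default identity activation is used; same older Tracker-based Flux API as above):

using Flux
function MWE_fixed()
    η = 0.01
    # no activation on the only layer: Conv defaults to identity,
    # so the gradient does not vanish when the convolution output is negative
    model = Conv((64, 64), 1 => 1, stride=30)
    loss(x, y) = Flux.mse(model(x), y)
    dataset = [(rand(100, 100, 1, 1), rand(2, 2, 1, 1))]
    opt = ADAM(η)
    for i in 1:10
        Flux.train!(loss, params(model), dataset, opt)
        @show i, loss(dataset[1]...)
    end
end

MWE_fixed();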