Logging training loss in Flux

Hi all,

I had a quick question – seemingly trivial but haven’t managed to figure it out after an hour of trying…

What’s the best way to log the training loss and output it, say, to a simple text file for visualization when using Flux? Consider for instance a template example from NeuralOperators.jl, slightly modified:

using DataDeps, MAT, NeuralOperators, Flux, CUDA

function get_data_don(; n=2048, Δsamples=2^3, grid_size=div(2^13, Δsamples))
    file = matopen(joinpath(datadep"Burgers", "burgers_data_R10.mat"))
    x_data = collect(read(file, "a")[1:n, 1:Δsamples:end])
    y_data = collect(read(file, "u")[1:n, 1:Δsamples:end])
    close(file)

    return x_data, y_data
end

function train_don(; n=300, cuda=true, learning_rate=0.001, epochs=400)
    if cuda && has_cuda()
        @info "Training on GPU"
        device = gpu
    else
        @info "Training on CPU"
        device = cpu
    end

    x, y = get_data_don(n=n)

    xtrain = x[1:280, :]'
    ytrain = y[1:280, :]

    xval = x[end-19:end, :]' |> device
    yval = y[end-19:end, :] |> device

    grid = collect(range(0, 1, length=1024)') |> device

    opt = ADAM(learning_rate)

    m = DeepONet((1024,1024,1024), (1,1024,1024), gelu, gelu) |> device

    loss(X, y, sensor) = Flux.Losses.mse(m(X, sensor), y)
    evalcb() = @show(loss(xval, yval, grid))

    data = [(xtrain, ytrain, grid)] |> device

    # Flux's default training loop
    Flux.@epochs epochs Flux.train!(loss, params(m), data, opt, cb=evalcb)
    
    # What we need instead is a custom training loop
    global loss_vector = Vector{Float32}();
    my_custom_train!(loss, params(m), data, opt)          # BUT THIS DOESN'T WORK

    ỹ = m(xval |> device, grid |> device)

    diffvec = vec(abs.(cpu(yval) .- cpu(ỹ)))
    mean_diff = sum(diffvec)/length(diffvec)
    return mean_diff

    
end

which is along the lines of what the Flux docs indicate for a custom training loop, namely:


function logging_callback(loss)
    global loss_vector;
    push!(loss_vector, loss);
end

function my_custom_train!(loss, ps, data, opt)
    global loss_vector
    local training_loss                                                            
    ps = Flux.Params(ps)    # Simply using Params throws an error
    for d ∈ data
      gs = gradient(ps) do
        training_loss = loss(d...)
        return training_loss
      end

      logging_callback(training_loss)
      update!(opt, ps, gs)
    end
end
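
For the file part, I was picturing something as simple as the rough sketch below (collect the losses into the global vector during training, then dump them to disk afterwards), but I’m not sure how to wire it into the loop:

open("losses.txt", "w") do io        # placeholder file name
    for l in loss_vector             # loss_vector filled by logging_callback
        println(io, l)
    end
end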

This is in essence similar to Flux: Custom Training + Logging - #4 by contradict, but I couldn’t work out how to simply append the loss to a vector and then write it to a plain txt file for visualization downstream… Any pointers?

Thanks,

That should work fine. Did you try it?

Thanks, Chris. I did try, but didn’t manage to make it work yet. It’s pretty much the same code as in NeuralOperators.jl (specifically Burgers_deeponet.jl), along with the pointers in the linked post…

using DataDeps, MAT, MLUtils
using NeuralOperators, Flux
using CUDA, FluxTraining, BSON
# using Plots
function get_data_don(; n=2048, Δsamples=2^3, grid_size=div(2^13, Δsamples))
    file = matopen(joinpath(pwd(), "burgers_data_R10.mat"))
    x_data = collect(read(file, "a")[1:n, 1:Δsamples:end])
    y_data = collect(read(file, "u")[1:n, 1:Δsamples:end])
    close(file)

    return x_data, y_data
end

function logging_callback(loss)
    global loss_vector;
    push!(loss_vector, loss);
end

function my_custom_train!(loss, ps, data, opt)
    global loss_vector
    local training_loss                                                            
    ps = Flux.Params(ps)    # Simply using Params throws an error
    for d in data
      gs = gradient(ps) do
        training_loss = loss(d...)
        return training_loss
      end

      logging_callback(training_loss)
      update!(opt, ps, gs)
    end
end


function train_don(; n=300, cuda=true, epochs=80000, η₀=1f-3, λ=1f-4)
    if cuda && has_cuda()
        CUDA.allowscalar(false)
        @info "Training on GPU"
        device = gpu
    else
        @info "Training on CPU"
        device = cpu
    end

    x, y = get_data_don(n=n)

    xtrain = x[1:280, :]' 
    ytrain = y[1:280, :] 
    # return xtrain, ytrain;
    xval = x[end-19:end, :]' |> device
    yval = y[end-19:end, :] |> device

    widths = 1024;
    grid = collect(range(0, 1, length=widths)') |> device

    # opt = ADAM(learning_rate)
    opt = Flux.Optimiser(WeightDecay(λ), Flux.ADAM(η₀))

    m = DeepONet((widths, widths, widths), (1, widths, widths), gelu, gelu) |> device

    loss(X, y, sensor) = Flux.Losses.mse(m(X, sensor), y)
    global loss_vector = Vector{Float32}();
    # store_loss_cb() = CUDA.@allowscalar push!(loss_vector, loss(xtrain, ytrain, grid));
    # evalcb() = @show(loss(xtrain, ytrain, grid));

    data = [(xtrain, ytrain, grid)] |> device
    # my_custom_train!(loss, params(m), data, opt)
    # Flux.@epochs epochs Flux.train!(loss, params(m), data, opt, cb=evalcb)
    # custom training loop
    my_custom_train!(loss, params(m), data, opt)
    
    
    BSON.@save "deeponet_burgers_$widths.bson" m;
    # ỹ = m(xval |> device, grid |> device)

    # diffvec = vec(abs.(cpu(yval) .- cpu(ỹ)))
    # mean_diff = sum(diffvec)/length(diffvec)
    # io = open("losses_$widths.txt", "w") do io
    #     for x in loss_vector
    #       println(io, x)
    #     end
    # end
    # return mean_diff
end

train_don()

which throws

[ Info: Training on GPU
ERROR: MethodError: no method matching update!(::Flux.Optimise.Optimiser, ::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}}, ::Zygote.Grads)
Closest candidates are:
  update!(::HMAC_CTX, ::Any, ::Any) at C:\Users\bshri\AppData\Local\Programs\Julia-1.7.1\share\julia\stdlib\v1.7\SHA\src\hmac.jl:25
  update!(::T, ::U, ::Any) where {T<:SHA.SHA_CTX, U<:Union{Tuple{Vararg{UInt8, N}} where N, AbstractVector{UInt8}}} at C:\Users\bshri\AppData\Local\Programs\Julia-1.7.1\share\julia\stdlib\v1.7\SHA\src\common.jl:21
  update!(::HMAC_CTX, ::Any) at C:\Users\bshri\AppData\Local\Programs\Julia-1.7.1\share\julia\stdlib\v1.7\SHA\src\hmac.jl:25

I think it’s Flux.Optimise.update! that you want: the unqualified update! in my_custom_train! isn’t resolving to Flux’s update! (the candidate list only shows SHA methods), so qualify the call as Flux.Optimise.update! (or import it explicitly).
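
For reference, a rough (untested) sketch of the loop with the qualified call, keeping the rest of your script (loss_vector, logging_callback, etc.) as it is:

function my_custom_train!(loss, ps, data, opt)
    global loss_vector
    local training_loss
    ps = Flux.Params(ps)
    for d in data
        gs = gradient(ps) do
            training_loss = loss(d...)
            return training_loss
        end
        logging_callback(training_loss)          # push the loss onto loss_vector
        Flux.Optimise.update!(opt, ps, gs)       # qualified, so the right update! is called
    end
end

and then, at the end of train_don (where widths is still in scope), write the collected losses out, essentially by uncommenting what you already have:

open("losses_$widths.txt", "w") do io
    for l in loss_vector
        println(io, l)
    end
end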