How to train the LSTM model for more than one epoch

Here is an example that trains for one epoch. How can I train the model for 10 epochs instead?

using Flux
using CUDA

X_train = rand(60000, 4)
Y_train = rand(60000, 1)

seq_len = 600
batch_size = 10

no_features = 4
N = size(X_train,1)
num_batches = Int(floor(N/(batch_size*seq_len)))

# mini batching
X_batched = [[hcat([Float32.(X_train[j+i*seq_len+(k*(batch_size)*seq_len), :]) for i in 0:batch_size-1]...) for j in 1:seq_len] for k in 0:num_batches-1]
Y_batched = [vcat([Float32.(Y_train[seq_len * i + 1 + (k*(batch_size)*seq_len) : (i+1) * seq_len + (k*(batch_size)*seq_len)]') for i in 0:batch_size-1]...) for k in 0:num_batches-1]

gpu_or_cpu = cpu

if gpu_or_cpu == gpu
    CUDA.allowscalar(false)
end

# convert to cpu or gpu
X_batched = X_batched |> gpu_or_cpu
Y_batched = Y_batched |> gpu_or_cpu
data_train = (X_batched, Y_batched)

# select optimizer
opt = ADAM(0.001, (0.9, 0.999))

# definition of the loss function
function loss(X, Y)
    mse_val = 0.0
    for i in 1:length(X)
        Flux.reset!(model) # reset the hidden state before each minibatch of sequences
        mse_val += sum(abs2.(vcat(model.(X[i])...)' .- Y[i])) # squared error over the whole sequence
    end
    return mse_val
end

# ini of the model
model = Chain(LSTM(4, 70), LSTM(70, 70), LSTM(70, 70), Dense(70, 1, relu)) |> gpu_or_cpu
ps = Flux.params(model)
Flux.reset!(model)

# train 1 epoch

@time Flux.train!(loss, ps, [data_train], opt)

Thanks very much!

Documentation of the @epochs macro can be found here. Alternatively, you can simply put your last line, the training line, in a for loop that executes as many times as you would like; that is the core functionality of the @epochs macro anyway.

https://fluxml.ai/Flux.jl/stable/training/training/#Flux.Optimise.@epochs
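For example, either of the following trains for 10 epochs (a sketch reusing the names from your script; @epochs simply expands to the explicit loop):

using Flux: @epochs

# Option 1: the @epochs macro repeats the training call 10 times
@epochs 10 Flux.train!(loss, ps, [data_train], opt)

# Option 2: an explicit epoch loop, equivalent to the macro
for epoch in 1:10
    Flux.train!(loss, ps, [data_train], opt)
end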

I tried to use Zygote.pullback to access the training loss and the gradient, changing the last line. The whole code is:

using JLD2
using Flux
using CUDA, Zygote

X_train = rand(60000, 4)
Y_train = rand(60000, 1)

seq_len = 600
batch_size = 10

no_features = 4
N = size(X_train,1)
num_batches = Int(floor(N/(batch_size*seq_len)))

# mini batching
X_batched = [[hcat([Float32.(X_train[j+i*seq_len+(k*(batch_size)*seq_len), :]) for i in 0:batch_size-1]...) for j in 1:seq_len] for k in 0:num_batches-1]
Y_batched = [vcat([Float32.(Y_train[seq_len * i + 1 + (k*(batch_size)*seq_len) : (i+1) * seq_len + (k*(batch_size)*seq_len)]') for i in 0:batch_size-1]...) for k in 0:num_batches-1]

gpu_or_cpu = cpu

if gpu_or_cpu == gpu
    CUDA.allowscalar(false)
end

# convert to cpu or gpu
X_batched = X_batched |> gpu_or_cpu
Y_batched = Y_batched |> gpu_or_cpu
data_train = (X_batched, Y_batched)

# select optimizer
opt = ADAM(0.001, (0.9, 0.999))

# definition of the loss function
function loss(X, Y)
    mse_val = 0.0
    for i in 1:length(X)
        Flux.reset!(model) # reset the hidden state before each minibatch of sequences
        mse_val += sum(abs2.(vcat(model.(X[i])...)' .- Y[i])) # squared error over the whole sequence
    end
    return mse_val
end

# ini of the model
model = Chain(LSTM(4, 70), LSTM(70, 70), LSTM(70, 70), Dense(70, 1, relu)) |> gpu_or_cpu
ps = Flux.params(model)
Flux.reset!(model)
## use Zygote.pullback to access the training loss and the gradient
# The following code errors:
for d in data_train
    train_loss, back = Zygote.pullback(() -> loss(d...), ps)
    gs = back(one(train_loss))
    Flux.update!(opt, ps, gs)
end

How do I solve this error?
MethodError: no method matching loss(::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1}, ::Array{Array{Float32,2},1})
Closest candidates are:
loss(::Any, ::Any) at In line 34
The error occurs at line 49.

When you create data_train, you create it as the tuple (X_batched, Y_batched); hence, when you do

for d in data_train
  # ...
end

the loop runs twice: once with d = X_batched and once with d = Y_batched. This is not what you are trying to achieve.

Instead, to iterate over each of the 10 batches, you could use:

for d in zip(data_train...)
  # ...
end
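To see the difference, here is a toy illustration (values assumed purely for exposition):

a, b = [1, 2, 3], [4, 5, 6]
for d in (a, b)
    @show d # two iterations: d = [1, 2, 3], then d = [4, 5, 6]
end
for d in zip((a, b)...)
    @show d # three iterations: (1, 4), (2, 5), (3, 6)
end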

Now, I believe there are a few other things that are problematic in your MWE.

  • In your mini-batch training with 10 batches, each Y_batched[i] is 10 × 600, which means you expect your model to produce 10 outputs, while it currently produces only 1. I believe your batching approach is not doing what you intend.
  • In your loss, you call Flux.reset!() inside the loop. You should reset before/after each sequence, but not during it (see the sketch after this list).
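A minimal sketch of the intended reset placement (sequence is a hypothetical name for one minibatch of seq_len input matrices):

Flux.reset!(model) # clear the hidden state once, before the sequence
ŷ = [model(xt) for xt in sequence] # step through; the state carries over between steps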

Here is a modification of your MWE to make it work.

using Flux
using CUDA, Zygote

X_train = rand(Float32, 60000, 4)
Y_train = rand(Float32, 60000, 1)

seq_len = 600
batch_size = 10

no_features = 4
N = size(X_train,1)
num_batches = Int(floor(N/(batch_size*seq_len)))

# Create batches of a time series `X` by splitting the series into
# sequences of length `s`. Each new sequence is shifted by `r` steps.
# When s == r,  the series is split into non-overlapping batches.
function batch_timeseries(X, s::Int, r::Int)
    @assert r > 0 "r must be positive"
    # If X is passed in format T×1, reshape it
    if isa(X, AbstractVector)       
        X = permutedims(X)
    end
    T = size(X, 2)
    @assert s ≤ T "s cannot be longer than the total series"
    # Ensure uniform sequence lengths by dropping the first observations until
    # the total sequence length matches a multiple of the batchsize
    X = X[:, ((T - s) % r)+1:end]   
    [X[:, t:r:end-s+t] for t ∈ 1:s] # Output
end
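# Example (toy values, for illustration only):
# batch_timeseries(reshape(collect(1:12), 1, 12), 3, 3) returns
# [[1 4 7 10], [2 5 8 11], [3 6 9 12]]: element t holds step t of
# all four non-overlapping length-3 sequences.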

# mini batching
X_batched = [batch_timeseries(permutedims(X_train[(1 + (k - 1) * seq_len * batch_size):(k * seq_len * batch_size), :]), seq_len, seq_len) for k ∈ 1:num_batches]
Y_batched = [batch_timeseries(permutedims(Y_train[(1 + (k - 1) * seq_len * batch_size):(k * seq_len * batch_size), :]), seq_len, seq_len) for k ∈ 1:num_batches]

gpu_or_cpu = cpu

if gpu_or_cpu == gpu
    CUDA.allowscalar(false)
end

# convert to cpu or gpu (applied element-wise)
X_batched = gpu_or_cpu.(X_batched)
Y_batched = gpu_or_cpu.(Y_batched)
data_train = (X_batched, Y_batched)

# select optimizer
opt = ADAM(0.001, (0.9, 0.999))

# definition of the loss function
function loss(m, X, Y)
    m(X[1]) # Warm up the model on the first observation (output discarded)
    sum(sum(abs2, m(xi) - yi) for (xi, yi) in zip(X[2:end], Y[2:end]))
end

# ini of the model
model = Chain(LSTM(4, 70), LSTM(70, 70), LSTM(70, 70), Dense(70, 1, relu)) |> gpu_or_cpu
ps = Flux.params(model)
Flux.reset!(model)
## use Zygote.pullback to access the training loss and the gradient
for d in zip(data_train...)
    Flux.reset!(model) # Reset the model before each minibatch
    train_loss, back = Zygote.pullback(() -> loss(model, d...), ps)
    gs = back(one(train_loss))
    Flux.update!(opt, ps, gs)
end
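To sanity-check the loss on a single minibatch before training, one could run (a sketch reusing the names above):

Flux.reset!(model)
d1 = first(zip(data_train...)) # the first (X, Y) minibatch
@show loss(model, d1...)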

Yes, you are right. But I am still confused about how to plot the loss during each epoch.
Another question: if I want to increase the number of epochs, for example to 50, how should I modify it?

Then you could simply embed your training loop in an “epoch loop”, e.g.,

using Plots # assumed for the plot below

epochs = 100
train_losses = Vector{Float32}(undef, epochs) # Keep track of losses
for epoch ∈ 1:epochs
    for d in zip(data_train...)
        Flux.reset!(model) # Reset the model before each minibatch
        train_loss, back = Zygote.pullback(() -> loss(model, d...), ps)
        train_losses[epoch] = train_loss # Store the loss of the last minibatch in this epoch
        gs = back(one(train_loss))
        Flux.update!(opt, ps, gs)
    end
end
plot(train_losses, xlab="Epoch", ylab="Loss", lab="")
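Note that this stores only the loss of the last minibatch in each epoch. If you want the total loss over all minibatches instead, a variant (reusing the same names) is:

for epoch ∈ 1:epochs
    epoch_loss = 0f0
    for d in zip(data_train...)
        Flux.reset!(model)
        train_loss, back = Zygote.pullback(() -> loss(model, d...), ps)
        epoch_loss += train_loss # accumulate over minibatches
        gs = back(one(train_loss))
        Flux.update!(opt, ps, gs)
    end
    train_losses[epoch] = epoch_loss
end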

It works! Thank you very much!
Here is a simple example. Do you know how to plot the loss?

using Flux

Nt = 100       # time steps
Nin,Nout = 5,3 # input size, output size
Nh = 28        # hidden dim
lstm = Chain(LSTM(Nin,Nh),Dense(Nh,Nout)) # simple lstm

# generate some fake data
X,Y = [randn(Float32,Nin,Nt) for i=1:10],[randn(Float32,Nout,Nt) for i=1:10] 

data = Flux.Data.DataLoader((X, Y), batchsize=2)
# loss uses broadcasting 
loss(x, y) = sum(Flux.Losses.mse.(lstm.(x), y))
ps = Flux.params(lstm)

Flux.train!(loss, ps, data, ADAM())

Check the example above: I save the loss in a vector called train_losses and plot it after the training is done. Let me know if something is unclear.

I have supplemented the above code to plot the loss.

using Flux
using Plots # needed for the plot below

Nt = 100       # time steps
Nin,Nout = 5,3 # input size, output size
Nh = 28        # hidden dim
lstm = Chain(LSTM(Nin,Nh),Dense(Nh,Nout)) # simple lstm

# generate some fake data
X,Y = [randn(Float32,Nin,Nt) for i=1:10],[randn(Float32,Nout,Nt) for i=1:10] 

data = Flux.Data.DataLoader((X, Y), batchsize=2)
# loss uses broadcasting 
loss(x, y) = sum(Flux.Losses.mse.(lstm.(x), y))
ps = Flux.params(lstm)

opt = ADAM() # create the optimizer once so its state persists across epochs
epochs = 100
train_losses = Vector{Float32}(undef, epochs) # Keep track of losses
for epoch ∈ 1:epochs
    Flux.train!(loss, ps, data, opt)
    # The following line errors:
    train_losses[epoch] = loss(data...)
end
plot(train_losses, xlab="Epoch", ylab="Loss", lab="")

The error shows:
MethodError: no method matching loss(::Tuple{Array{Array{Float32,2},1},Array{Array{Float32,2},1}}, ::Tuple{Array{Array{Float32,2},1},Array{Array{Float32,2},1}}, ::Tuple{Array{Array{Float32,2},1},Array{Array{Float32,2},1}}, ::Tuple{Array{Array{Float32,2},1},Array{Array{Float32,2},1}}, ::Tuple{Array{Array{Float32,2},1},Array{Array{Float32,2},1}})

So how should the line train_losses[epoch] = loss(data...) be modified?

Here you are not iterating over the mini-batches in the loss. You could do:

train_losses[epoch] = sum(loss(d...) for d in data) 
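Putting it together, the full loop becomes (a sketch reusing the names above):

using Flux, Plots

opt = ADAM() # created once so optimizer state persists
epochs = 100
train_losses = Vector{Float32}(undef, epochs)
for epoch ∈ 1:epochs
    Flux.train!(loss, ps, data, opt)
    train_losses[epoch] = sum(loss(d...) for d in data) # total loss over all minibatches
end
plot(train_losses, xlab="Epoch", ylab="Loss", lab="")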

GreatYou are amazing!
