Problems training an LSTM

I’m trying to use an LSTM to forecast the trend of Covid-19. The code is below:

using MAT
using Plots
using Measures
using Flux
using DifferentialEquations
using DiffEqFlux
using LaTeXStrings
#data importing
vars = matread("")
Infected = vars["Hubei_Infected_All"]
Recovered = vars["Hubei_Recovered_All"]
Dead = vars["Hubei_Dead_All"]
Time = vars["Hubei_Time"]
#build network
n = 2
rnn = Chain(LSTM(4, n), LSTM(n, 1))
p1, re = Flux.destructure(rnn)   # p1: flat weight vector, re rebuilds the model from such a vector
p2 = Float64[1, 0.023]           # initial guesses for β and γ
p3 = [p1; p2]                    # full parameter vector: network weights followed by β, γ
ps = Flux.params(p3)
m = length(p1)                   # number of network weights
#Solve the diffequations
function QSIR(du, u, p, t)
    β = abs(p[m+1])    # infection rate
    γ = abs(p[m+2])    # recovery rate
    du[1] = -β*u[1]*u[2]/u0[1]
    du[2] = β*u[1]*u[2]/u0[1] - γ*u[2] - abs(re(p[1:m])(u)[1])*u[2]/u0[1]
    du[3] = γ*u[2]
    du[4] = abs(re(p[1:m])(u)[1])*u[2]/u0[1]   # quarantine term driven by the network
end


u0 = Float64[11000000.0, 500.0, 10.0, 10.0]   # initial state [S, I, R, T]
tspan = (0.0, 39.0)
datasize = 39

prob = ODEProblem(QSIR, u0, tspan, p3)
t = range(tspan[1],tspan[2],length=datasize)

sol = Array(concrete_solve(prob, Rosenbrock23(autodiff = false), u0, p3, saveat = t))
# re-solve the ODE with the current parameters; used inside the loss
function predict_adjoint()
    Array(concrete_solve(prob, Rosenbrock23(autodiff = false), u0, p3, saveat = t))
end

I = Infected[1, :];
R = Recovered[1,:];
D = Dead[1, :];
#Loss Function
function loss_adjoint()
    prediction = predict_adjoint()
    sum(abs2, log.(abs.(I[1:39])) .- log.(abs.(prediction[2, :]))) +
        sum(abs2, log.(abs.(R[1:39] + D[1:39]) .+ 1) .- log.(abs.(prediction[3, :] .+ 1)))
end

Loss = Float64[]
P1 = Float64[]
P2 = Float64[]
P3 = Float64[]

datan = Iterators.repeated((), 1000)
opt = ADAM(0.01)
# record the loss and the parameters at every training step
cb = function ()
    push!(Loss, loss_adjoint())
    push!(P1, p3[m+1])
    push!(P2, p3[m+2])
    append!(P3, p3)            # P3 stores the full flattened parameter vector each step
end
Flux.train!(loss_adjoint, ps, datan, opt, cb = cb)
#min of the Loss
L = findmin(Loss)
idx = L[2]
idx1 = (idx - 1)*(m + 2) + 1
idx2 = idx*(m + 2)
global p3 = P3[idx1:idx2]    # parameters recorded at the lowest-loss step
prediction = Array(concrete_solve(prob, Rosenbrock23(autodiff = false), u0, p3, saveat = t))
S_NN_all_loss = prediction[1, :]
I_NN_all_loss = prediction[2, :]
R_NN_all_loss = prediction[3, :]
T_NN_all_loss = prediction[4, :]
#Q(t)
Q_parameter = zeros(Float64, length(S_NN_all_loss))   # network output along the fitted solution

for i in 1:length(S_NN_all_loss)
    Q_parameter[i] = abs(re(p3[1:m])([S_NN_all_loss[i], I_NN_all_loss[i], R_NN_all_loss[i], T_NN_all_loss[i]])[1])
end
#Plot Q
scatter(t, Q_parameter/u0[1], xlabel = "Days post 500 infected", ylabel = "Q(t)",
    label = "Quarantine Strength", color = :black, framestyle = :box, grid = :off,
    legend = :topleft, left_margin = 5mm, bottom_margin = 5mm,
    foreground_color_legend = nothing, background_color_legend = nothing,
    ylims = (0.0, 1.2), yguidefontsize = 14, xguidefontsize = 14,
    xtickfont = font(12, "TimesNewRoman"), ytickfont = font(12, "TimesNewRoman"),
    legendfontsize = 12)

When I run the code, Q_parameter (the output of the network) comes out as a constant array very close to zero, which suggests the LSTM isn’t learning anything at all.
But when I use a plain dense network instead, like this:

Chain(Dense(4, 2, relu), Dense(2, 1))

it trains well and I get the result I want. There must be some error in my code. How can I fix it?
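
One way to probe the network in isolation (a rough sketch, the input values are arbitrary) is to rebuild it from the trained weights and check whether the output varies with the input at all:

lstm = re(p3[1:m])
Flux.reset!(lstm)    # clear any carried-over recurrent state
for u in ([1.1e7, 500.0, 10.0, 10.0], [9.0e6, 5.0e4, 1.0e3, 1.0e3], [5.0e6, 1.0e5, 1.0e4, 1.0e4])
    @show abs(lstm(u)[1])
end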


I still can’t figure out what’s wrong with the code :sob: :sob:

There is a lot going on in the example. It would help if you could make a minimal working example so we can separate design issues (e.g. network size, learning rate, sequence length for training) from code issues (e.g. not feeding in the data properly). Whenever I work with a new DL framework I tend to use sine waves, or a classification of MNIST with the data fed in one bit at a time, as a baseline for something I know works; maybe one of those could help? I have a notebook which might point you in the right direction. The docs and the model zoo are also good places to see how to do things.
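
For the sine-wave baseline, something like this is the kind of thing I mean (a rough sketch, untested; the layer sizes, learning rate and number of steps are arbitrary): a small LSTM trained to predict the next value of a sine wave from the current one.

using Flux

# toy data: pairs (sin(t), sin(t + Δ))
xs = [Float32[sin(t)] for t in 0:0.1:20]
ys = xs[2:end]
xs = xs[1:end-1]

model = Chain(LSTM(1, 8), Dense(8, 1))
ps = Flux.params(model)
opt = ADAM(0.01)

function seq_loss(xs, ys)
    Flux.reset!(model)                      # start each pass from a clean recurrent state
    sum(sum(abs2, model(x) .- y) for (x, y) in zip(xs, ys))
end

for epoch in 1:200
    gs = Flux.gradient(() -> seq_loss(xs, ys), ps)
    Flux.update!(opt, ps, gs)
end

If that trains, the next step is usually to check how the data is being fed to the recurrent layers in the bigger model.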
