Dear All,
I am trying to solve a system of two ordinary differential equations:
𝝫’(x) = (λ-1/v(x))f(x)
v’(x) = x^3(λf(x)/(λf(x)x-2𝝫(x)))^2
With boundary conditions
𝝫(x_low) = 0
𝝫(x_high) = 0
v(x) and 𝝫(x) are the unknown functions, f(x) is a known function (the density of a known distribution), and λ is a known constant. I am trying to solve the system using neural networks trained with ADAM gradient descent. Unfortunately, it fails to converge: for the first few hundred iterations the loss decreases, but then it starts to diverge…
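To train the networks I minimize the sum of squared residuals of both equations over a batch of grid points x_1,…,x_n:
Loss = Σ_i [𝝫’(x_i) - (λ-1/v(x_i))f(x_i)]^2 + Σ_i [v’(x_i) - x_i^3(λf(x_i)/(λf(x_i)x_i-2𝝫(x_i)))^2]^2
v is approximated directly by one network, while 𝝫 is a second network multiplied by (x-x_low)(x_high-x), so the boundary conditions hold by construction.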
Here is a minimal working example:
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
#*-*-* Static Mirrleesian planning problem solved with Deep Learning *-*-*
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
#(1) Install and initialize packages
using Pkg
Pkg.add("Plots")
Pkg.add("Parameters")
Pkg.add("LinearAlgebra")
Pkg.add("Flux")
Pkg.build("Flux")
Pkg.add("Random")
Pkg.add("Distributions")
Pkg.add("ForwardDiff")
Pkg.add("BenchmarkTools")
using Plots
using Parameters
using LinearAlgebra
using Flux
using Random
using Distributions
using ForwardDiff
using BenchmarkTools
Random.seed!(100)
Pkg.add("ZygoteRules")
using ZygoteRules
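#The adjoints below let Zygote backpropagate through the ForwardDiff.derivative calls used for d𝓥 and d𝝫 further down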
ZygoteRules.@adjoint function ForwardDiff.Dual{T}(x, ẋ::Tuple) where T
    @assert length(ẋ) == 1
    ForwardDiff.Dual{T}(x, ẋ), ḋ -> (ḋ.partials[1], (ḋ.value,))
end
ZygoteRules.@adjoint ZygoteRules.literal_getproperty(d::ForwardDiff.Dual{T}, ::Val{:partials}) where T =
    d.partials, ṗ -> (ForwardDiff.Dual{T}(ṗ[1], 0),)
ZygoteRules.@adjoint ZygoteRules.literal_getproperty(d::ForwardDiff.Dual{T}, ::Val{:value}) where T =
    d.value, ẋ -> (ForwardDiff.Dual{T}(0, ẋ),)
#(2) Calibrate parameter values
τ = Float32(0.3)
k = Float32(4)
yl = Float32(2)
F_high = Float32(1-10^(-3))
θl = Float32(sqrt(yl/(1-τ)))
#(3) Algorithm choices
@with_kw struct ADAM
    ϰ::Int64 = 2500       #Number of grid points
    ϑ::Int64 = 150        #Minibatch size
    T::Int64 = 16         #Width of the hidden layers
    Γ::Float64 = 0.01     #Learning rate
    β1::Float64 = 0.9     #First-moment decay rate
    β2::Float64 = 0.99    #Second-moment decay rate
    ϵ::Float64 = 1e-8     #Numerical stabilizer
    ν::Float32 = 1f-5
    𝚰::Int64 = 500000     #Maximum number of iterations
    𝛀::Float64 = 0.01     #Loss tolerance for convergence
    𝒯::Int64 = 2000       #Patience for the secondary convergence check
    𝓇::Float64 = 0.9      #Required improvement factor for the secondary check
end
AD = ADAM(Γ=0.00001,T=8,ϰ=5000)
@unpack T,ϰ = AD
#(4) Define skill distribution
f(θ) = 2*k*θ^(-2*k-1)*θl^(2*k)
F(θ) = 1 - θ^(-2*k)*θl^(2*k)
#f(θ) = 1/(θh-θl)
#(5) Solve for upper bound
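#θh is chosen so that F(θh) = F_high, i.e. the grid stops at the F_high quantile of the skill distribution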
θh = ((1-F_high)/(θl^(2*k)))^(-1/(2*k))
θh = Float32(θh)
#(6) Generate grid
Grid = Vector{Float32}(range(θl,θh,length=ϰ))
Grid = reshape(Grid,1,ϰ)
#(7) Build neural networks
bent(x) = (sqrt(x^2+1)-1)/2 + x #Bent identity activation (not used below)
𝕍 = Flux.Chain(Dense(1,T,swish),Dense(T,T,swish),Dense(T,1,swish))
Φ = Flux.Chain(Dense(1,T,swish),Dense(T,T,swish),Dense(T,1,swish))
#(8) Set-up boundary conditions
Φ_low = Float32(0.0)
Φ_high = Float32(0.0)
#(9) Define approximators
𝓥(t) = 𝕍([t])[1]
𝝫(t) = (t-θl)*(θh-t)*Φ([t])[1]
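#The factor (t-θl)*(θh-t) forces 𝝫(θl) = 𝝫(θh) = 0, so the boundary conditions hold by construction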
#(10) Create derivatives
d𝓥(t) = ForwardDiff.derivative(𝓥,t)
d𝝫(t) = ForwardDiff.derivative(𝝫,t)
#(11) Build residual function
λ = Float32(1.0)
AD = ADAM(Γ=0.0001,T=8,ϰ=5000)
function ℝ(x)
    #Squared residual of the 𝝫 equation
    𝕄 = sum((d𝝫.(x) .- (λ .- 1 ./ 𝓥.(x)).*f.(x)).^2)
    #Squared residual of the v equation (note the elementwise ./ in the ratio)
    𝔼 = sum((d𝓥.(x) .- ((λ.*f.(x))./(λ.*f.(x).*x .- 2 .* 𝝫.(x))).^2 .*(x.^3)).^2)
    return 𝕄 + 𝔼
end
𝝝 = Flux.params(𝕍,Φ)
@time Adam(ℝ,𝝝,Grid,AD,"YES","NO")
I am using my own implementation of ADAM. I have tested it on other problems (including differential equations) and it works well, so I don’t think the optimizer itself is the issue. Any advice on this problem? Should I use a different optimizer? @ChrisRackauckas

For reference, here is my ADAM implementation:
function Adam(𝕱,𝚯,Data,Par,show,loc="NO")
    let
        #Initialize momentum estimates
        @unpack ϑ,Γ,β1,β2,ϵ,𝚰,𝛀,𝒯,𝓇 = Par
        n = length(𝚯)
        m = Array{Array{Float32}}(undef,n)
        v = Array{Array{Float32}}(undef,n)
        ∇ = Array{Array{Float32}}(undef,n)
        hm = Array{Array{Float32}}(undef,n)
        hv = Array{Array{Float32}}(undef,n)
        u = Array{Array{Float32}}(undef,n)
        if(loc=="YES")
            𝚴 = Array{Array{Float32}}(undef,n)
            𝕻 = [Array{Array{Float32}}(undef,n)]
            𝕸 = Array{Float32}(undef,1)
            𝖈 = Array{Int64}(undef,1)
            𝕸[1] = 𝕱(Data)
            𝖈[1] = 0
        end
        for j in 1:n
            m[j] = zeros(Float32,length(𝚯[j]))
            v[j] = zeros(Float32,length(𝚯[j]))
            ∇[j] = zeros(Float32,length(𝚯[j]))
        end
        #Main training loop
        for i in 1:𝚰
            #Sample data for stochastic gradient
            𝓓,𝓜 = size(Data)
            if(𝓓==1)
                xg = reshape(sample(Data,ϑ),1,ϑ)
            else
                𝓘 = sample(1:𝓜,ϑ)
                xg = Data[:,𝓘]
            end
            #Take gradient
            ∇𝕱 = Flux.gradient(()->𝕱(xg),𝚯)
            #Update gradient
            for j in 1:n
                #Check duality: gradients may come back as ForwardDiff duals because of the nested differentiation
                𝖙 = ForwardDiff.partials(∇𝕱[𝚯[j]][1])
                if(length(𝖙)==1)
                    λ(t) = ForwardDiff.value(ForwardDiff.partials(∇𝕱[𝚯[j]][t]))[1]
                    ∇[j] = λ.(1:length(𝚯[j]))
                else
                    ∇[j] = Array{Float32,1}(∇𝕱[𝚯[j]][:])
                end
                #Update momentum estimates
                m[j] = β1.*m[j] .+ (1-β1).*∇[j]
                v[j] = β2.*v[j] .+ (1-β2).*∇[j].^2
                hm[j] = m[j]./(1-β1^(i))
                hv[j] = v[j]./(1-β2^(i))
                #Update parameters
                u[j] = hm[j]./(sqrt.(hv[j]) .+ ϵ)
                𝚯[j] .= 𝚯[j] .- reshape(Γ.*u[j],size(𝚯[j]))
                if(loc=="YES")
                    𝚴[j] = copy(𝚯[j]) #Snapshot of the updated parameters
                end
            end
            #Compute loss
            𝕷 = 𝕱(Data)
            if(show=="YES")
                #println(i)
                println(𝕷)
            end
            #Secondary convergence check
            if(loc=="YES")
                if(𝕷<𝓇*𝕸[1])
                    𝕸[1] = 𝕷
                    𝕻[1] = 𝚴
                    𝖈[1] = 0
                else
                    𝖈[1] = 𝖈[1] + 1
                end
                if(𝖈[1]>=𝒯)
                    for j in 1:length(𝚯)
                        𝚯[j] .= 𝚴[j]
                    end
                    println("Secondary convergence")
                    return 𝚴
                end
            end
            if(𝕷<𝛀)
                println("Convergence ",i)
                break
            end
        end
        return 𝚯
    end
end