Dear All,
sorry for bothering with a possibly stupid question.
I would like to ask for some advice about solving the Hamilton-Jacobi-Bellman equation using deep learning. I am trying to solve an HJB equation of a simple neoclassical growth model from macroeconomics. Its HJB equation is
\rho V(k_t) = \frac{c_t^{1-\gamma}}{1-\gamma} + \frac{\partial V}{\partial k} \{A k_t^\alpha - \delta k_t - c_t\}
Control variable in this problem is c_t \geq 0, characterized by following first-order condition.
c_t^{-\gamma} = \frac{\partial V}{\partial k}
Following paper by Fernandéz-Villaverde et. al (2020), I am trying to solve this HJB problem using deep learning. I parameterize the value function V(k_t) and consumption function C(k_t) using feed-forward networks designed to guarantee non-negativity of consumption, and use ADAM to minimize a loss function that includes squared error in HJB equation and FOC over a reasonable interval of state-space.
\mathcal{E}_{HJB} = \rho V(k_t) - \frac{c_t^{1-\gamma}}{1-\gamma} - \frac{\partial V}{\partial k} \{A k_t^\alpha - \delta k_t - c_t\}
\mathcal{E}_{FOC} = c_t^{-\gamma} - \frac{\partial V}{\partial k}
\mathcal{L} = \sum_i {\mathcal{E}_{HJB}}_i + \sum_i {\mathcal{E}_{FOC}}_i
While the minimization procedure converges to \approx 0 loss, instead of getting concave and increasing value function, I got something totally weird.
I checked the result for many different choices of activation functions in my networks, but the problem persists, in all these cases I still got this shape of the value function.
In the paper (page 19), the authors show, that minimization of this loss function composed of HJB and FOC error (should?) converges to a solution of accurate upwind finite-difference scheme. Unfortunately, I am getting something totally different. Should I use some boundary conditions? Unfortunately, I think that there aren’t natural boundary conditions in this problem, other than transversality conditions.
I would be really glad for any insight, especially from
@Tamas_Papp and @jlperla
Best,
Honza
Here is a MWE of my code.
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
#*-*-* Solve non-stochastic neoclassical growth model in continuous time *-*
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
#(1) Import libraries
using Pkg
Pkg.add("Plots")
Pkg.add("Parameters")
Pkg.add("LinearAlgebra")
Pkg.add("Flux")
Pkg.add("Random")
Pkg.add("Distributions")
Pkg.add("ForwardDiff")
using Plots
using Parameters
using LinearAlgebra
using Flux
using Random
using Distributions
using ForwardDiff
include("FastADAM.jl")
Pkg.add("ZygoteRules")
using ZygoteRules
ZygoteRules.@adjoint function ForwardDiff.Dual{T}(x, ẋ::Tuple) where T
@assert length(ẋ) == 1
ForwardDiff.Dual{T}(x, ẋ), ḋ -> (ḋ.partials[1], (ḋ.value,))
end
ZygoteRules.@adjoint ZygoteRules.literal_getproperty(d::ForwardDiff.Dual{T}, ::Val{:partials}) where T =
d.partials, ṗ -> (ForwardDiff.Dual{T}(ṗ[1], 0),)
ZygoteRules.@adjoint ZygoteRules.literal_getproperty(d::ForwardDiff.Dual{T}, ::Val{:value}) where T =
d.value, ẋ -> (ForwardDiff.Dual{T}(0, ẋ),)
#(2) Build model structure
function uCRRA(c,γ)
if(γ==1.0)
return log(c)
else
return (c^(1-γ))/(1-γ)
end
end
@with_kw struct NCG
#(A) Structural parameters
Α::Float32 = 0.5
α::Float32 = 0.36
β::Float32 = 1/(1+0.04)
ρ::Float32 = 1/β-1
γ::Float32 = 2.0
δ::Float32 = 0.05
#(B) Steady-state
kss::Float32 = (α/(1/β-(1-δ)))^(1/(1-α))
kl::Float32 = 10^(-4)
ku::Float32 = 10.0
#(C) Grid
ϰ::Int64 = 2500
𝓚::Array{Float32} = collect(range(kl,ku,length=ϰ))'
#(D) Approximator
T::Int64 = 8
g::Function = swish
end
@with_kw struct Optimizer
#(C) Networks and ADAM setting
S1::Float32 = 0.0
S2::Float32 = 1.0
ϰ::Int64 = 2500
ϑ::Int64 = 150
T::Int64 = 32
Γ::Float32 = 0.001
β1::Float32 = 0.9
β2::Float32 = 0.99
ϵ::Float32 = 10^(-8)
ν::Float32 = 10^(-5)
𝚰::Int64 = 5000000
𝛀::Float32 = 0.1
𝒯::Int64 = 1500
𝓇::Float32 = 0.9
end
#Try swish,gelu,elu,mish,leakyrelu => all leads to the more or less same result
NC1 = NCG(g=swish)
@unpack Α,T,α,β,ρ,γ,δ,kl,ku,𝓚,g = NC1
#(3) Define approximation networks
#(3.1) Define policy network
𝓒 = Chain(Dense(1,T,g),Dense(T,T,g),
Dense(T,T,g),Dense(T,1,softplus))
#(3.3) Value network
𝓥 = Chain(Dense(1,T,g),Dense(T,T,g),
Dense(T,T,g),Dense(T,T,g),Dense(T,1,identity))
d𝓥(t) = ForwardDiff.derivative(x->𝓥([x])[1],t)
𝚹 = Flux.params(𝓒,𝓥)
#(4) Build production function
f(x) = Α*x^α - δ*x
#(4) Build a loss function
function 𝓡(x)
#HJB Error
ϵ_hjb = sum((ρ*𝓥(x) - uCRRA.(𝓒(x),γ) - d𝓥.(x).*(f.(x)-𝓒(x))).^2)
#FOC Error
ϵ_foc = sum((𝓒(x).^(-γ) - d𝓥.(x)).^2)
return ϵ_hjb+ϵ_foc
end
Ad1 = Optimizer(Γ=0.001)
@time 𝝝 = Adam(𝓡,𝚹,𝓚,Ad1,1,"YES","NO")
plot(𝓚',𝓥(𝓚)',title="Value Function",xlabel="K_t",ylabel="V(K),",legend=false)
plot(𝓚',d𝓥.(𝓚)',title="V_k(K)",xlabel="K_t",ylabel="V(K)",legend=false)
plot(𝓚',𝓒(𝓚)',title="Policy Function",xlabel="K_t",ylabel="C(K),",legend=false)
For replication, here is my implementation of the ADAM optimizer (FastADAM.jl file). There should’t be any problem, I used it sucessfully for many other problems.
function Adam(𝕱,𝚯,Data,Par,per,show,loc="NO")
let
#Initialize momentum estimates
@unpack ϑ,Γ,β1,β2,ϵ,𝚰,𝛀,𝒯,𝓇 = Par
n = length(𝚯)
m = Array{Array{Float32}}(undef,n)
v = Array{Array{Float32}}(undef,n)
∇ = Array{Array{Float32}}(undef,n)
hm = Array{Array{Float32}}(undef,n)
hv = Array{Array{Float32}}(undef,n)
u = Array{Array{Float32}}(undef,n)
if(loc=="YES")
𝚴 = Array{Array{Float32}}(undef,n)
𝕻 = [Array{Array{Float32}}(undef,n)]
𝕸 = Array{Float32}(undef,1)
𝖈 = Array{Int64}(undef,1)
𝕸[1] = 𝕱(Data)
𝖈[1] = 0
end
for j in 1:n
m[j] = zeros(Float32,length(𝚯[j]))
v[j] = zeros(Float32,length(𝚯[j]))
∇[j] = zeros(Float32,length(𝚯[j]))
end
#Main training loop
for i in 1:𝚰
#Sample data for stochastic gradient
𝓓,𝓜 = size(Data)
if(𝓓==1)
xg = reshape(sample(Data,ϑ),1,ϑ)
else
𝓘 = sample(1:𝓜,ϑ)
xg = Data[:,𝓘]
end
#Take gradient
∇𝕱 = Flux.gradient(()->𝕱(xg),𝚯)
#Update gradient
for j in 1:n
#Check duality
𝖙 = ForwardDiff.partials(∇𝕱[𝚯[j]][1])
if(length(𝖙)==1)
λ(t) = ForwardDiff.value(ForwardDiff.partials(∇𝕱[𝚯[j]][t]))[1]
∇[j] = λ.(1:length(𝚯[j]))
else
∇[j] = Array{Float32,1}(∇𝕱[𝚯[j]][:])
end
#Update momentum estimates
m[j] = β1.*m[j] .+ (1-β1).*∇[j]
v[j] = β2.*v[j] .+ (1-β2).*∇[j].^2
hm[j] = m[j]./(1-β1^(i))
hv[j] = v[j]./(1-β2^(i))
#Update parameters
u[j] = hm[j]./(sqrt.(hv[j]) .+ ϵ)
𝚯[j] .= 𝚯[j] .- reshape(Γ.*u[j],size(𝚯[j]))
if(loc=="YES")
𝚴[j] = copy(𝚯[j] .- reshape(Γ.*u[j],size(𝚯[j])))
end
end
#Compute loss
if(i%per==0)
𝕷 = 𝕱(Data)
if(show=="YES")
#println(i)
println(𝕷)
end
#Secundary convergence check
if(loc=="YES")
if(𝕷<𝓇*𝕸[1])
𝕸[1] = 𝕷
𝕻[1] = 𝚴
𝖈[1] = 0
else
𝖈[1] = 𝖈[1] + 1
end
if(𝖈[1]>=𝒯)
for j in 1:length(𝚯)
𝚯[j] .= 𝚴[j]
end
println("Secundary convergence")
return 𝚴
break
end
end
if(𝕷<𝛀)
println("Convergence ",i)
break
end
end
end
#
return 𝚯
end
end