Neural network training for DSGE model

Hi,
I have a DSGE model for which I need to approximate the decision functions with a neural network. I have already written most of the code, but I don't know how to train the network. There is a training loop at the bottom of my code, but I suspect it is wrong. Could someone help me improve the training part of my code?

using Statistics
using Flux, Plots
using Random, Distributions, Flux.Optimise, ProgressMeter
#Parameters
betta = 0.99;
h = 0.7;
varphi = 1.5;
epsilon = 11;
theta = 0.75;
kappa_p = (epsilon - 1)*theta/((1 - theta)*(1 - betta*theta));
rho_r = 0.6;
mu_pie = 1.5;
mu_y = 0.125;
rho_a = 0.7;
rho_z = 0.7;
sigma_a = 0.05;
sigma_z = 0.03;
sigma_r = 0.07;
mc_ss = (epsilon - 1)/epsilon;
wss = mc_ss;
css = (wss*(1 - h))^(varphi - 1);
nss = css;
piess = 1.01;
Rss = piess/betta;

# Standard deviations for ergodic distributions of exogenous state variables
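# (for an AR(1) process x' = rho*x + sigma*eps with eps ~ N(0,1), the ergodic variance
#  is sigma^2/(1 - rho^2), hence the division by sqrt(1 - rho^2) below)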
sigma_e_a = sigma_a/(1-rho_a^2)^0.5
sigma_e_z = sigma_z/(1-rho_z^2)^0.5
sigma_e_r = sigma_r

# bounds for endogenous state variables
Rmin = 1
Rmax = 3*Rss
cmin = 0.1*css;
cmax = 3*css;
function dr(a::Vector, z::Vector, Mon_shock::Vector, R::Vector, C::Vector)
    
    # we normalize the exogenous state variables by two standard deviations,
    # so that they are typically between -1 and 1
    a = a/sigma_e_a/2
    z = z/sigma_e_z/2
    Mon_shock = Mon_shock/sigma_e_r/2

    # we normalize the interest rate and consumption to be between -1 and 1
    R = (R .- Rmin) ./ (Rmax - Rmin) .* 2.0 .- 1.0
    C = (C .- cmin) ./ (cmax - cmin) .* 2.0 .- 1.0
    
    # stack the states into a 5 x n Float32 matrix (one column per grid point)
    s = [a z Mon_shock R C]
    B = convert(Matrix{Float32}, transpose(s))

    x = model(B) # 2 x n matrix
    
    # Marginal utility of consumption is always positive
    lambda_decision = exp.(x[1, :])

    # There is no restriction on inflation
    pie_decision = x[2, :]

    return (lambda_decision, pie_decision)
end

function Residuals(e_r::Vector, e_z::Vector, e_a::Vector, Mon_shock::Vector, z::Vector, a::Vector,  Rinitial::Vector, Cinitial::Vector)

    # all inputs are expected to have the same size n

    # arguments correspond to the values of the states today
    lambda, inflation = dr(a, z, Mon_shock, Rinitial, Cinitial)
    Ccurrent = exp.(z)./lambda + h*Cinitial
    gdp = Ccurrent
    l = gdp./exp.(a)
    w = exp.(z).*l.^varphi./lambda
    mc = w./exp.(a)
    Rshadow = rho_r*Rinitial .+
              (1 - rho_r)*(Rss .+ mu_pie*(inflation .- piess) .+ mu_y*(gdp .- css)) .+
              Mon_shock
    Rcurrent = max.(1.0, Rshadow)   # the gross nominal rate cannot fall below 1
    # transitions of the exogenous processes
    anext = a*rho_a + e_a
    znext = z*rho_z + e_z
    Mon_shocknext = e_r
    
    # transition of endogenous states (next denotes variables at t+1)
    

    lambdanext, inflationnext = dr(anext, znext, Mon_shocknext, Rcurrent, Ccurrent)
    Cnext = exp.(znext)./lambdanext + h*Ccurrent
    gdpnext = Cnext
    

    R1 = lambda .- betta*Rcurrent.*lambdanext./inflationnext
    R2 = -inflation .+ betta*lambda./lambdanext.*(inflationnext .- piess).*gdpnext./gdp .+
         epsilon/kappa_p*(mc .- (epsilon - 1)/epsilon) .+ piess

    return (R1, R2)
end
function Objective(params) # objective function for DL training; params is not used directly (dr reaches the network through the global model)
    # residuals for n random grid points under 2 realizations of shocks
    
    R1_e1, R2_e1 = Residuals(e1_r, e1_z, e1_a, Mon_shock, z, a, Rcurrent, Ccurrent)
    R1_e2, R2_e2 = Residuals(e2_r, e2_z, e2_a, Mon_shock, z, a, Rcurrent, Ccurrent)
    
    # construct all-in-one expectation operator
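    # (with two independent shock draws e1 and e2, E[R(e1)*R(e2)] = E[R(e1)]*E[R(e2)] = (E[R])^2,
    #  so the product below targets the squared expected residuals rather than E[R^2])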
    R_squared = R1_e1.*R1_e2 + R2_e1.*R2_e2 
    
    # compute average across n random draws
    return mean(R_squared)
end
#Neural network construction
model = Chain(
    Dense(5 => 32, relu),   # activation function inside layer
    Dense(32 => 32, relu),
    Dense(32 => 2)) |> gpu  # linear output layer (dr applies exp to the first output); move model to GPU, if available
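
# To make sure the network and the dr wrapper agree on shapes, I run a quick check on a
# small batch of states (the batch size and the draws here are arbitrary placeholders)
ntest = 4
lam_test, pie_test = dr(sigma_e_a*randn(ntest), sigma_e_z*randn(ntest), sigma_e_r*randn(ntest),
                        rand(Uniform(Rmin, Rmax), ntest), rand(Uniform(cmin, cmax), ntest))
@assert length(lam_test) == ntest && length(pie_test) == ntest
@assert all(lam_test .> 0)   # the marginal-utility decision is exp(...) and should be positive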

n = 1000
# randomly draw inputs 
z = sigma_e_z*randn(n);
a = sigma_e_a*randn(n);
Mon_shock = sigma_e_r*randn(n);
Rcurrent = rand(Uniform(Rmin, Rmax), n);
Ccurrent = rand(Uniform(cmin, cmax), n);
e1_a = sigma_e_a*randn(n);
e1_z = sigma_e_z*randn(n);
e1_r = sigma_e_r*randn(n);
e2_a = sigma_e_a*randn(n);
e2_z = sigma_e_z*randn(n);
e2_r = sigma_e_r*randn(n);
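
# Quick sanity check before training: the objective should evaluate to a finite scalar
# on these draws (the value itself is not meaningful at initialization)
loss0 = Objective(Flux.params(model))
println("initial loss estimate: ", loss0)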
                    

optimizer = ADAM(0.01)
params = Flux.params(model)

# Train the neural network
losses = []
@showprogress for epoch in 1:400
    grads = gradient(() -> Objective(params), params)
    Flux.Optimise.update!(optimizer, params, grads)
    push!(losses, Objective(params))  # logging, outside gradient context
end
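
# One variant I have been experimenting with (not sure it is better): redraw the grid
# points and the two shock realizations every epoch, so the network trains on fresh
# states instead of a single fixed set of n points. It reuses Objective, params and
# optimizer exactly as defined above; `global` is needed because Objective reads these
# variables from global scope.
losses = Float64[]
@showprogress for epoch in 1:400
    global z = sigma_e_z*randn(n)
    global a = sigma_e_a*randn(n)
    global Mon_shock = sigma_e_r*randn(n)
    global Rcurrent = rand(Uniform(Rmin, Rmax), n)
    global Ccurrent = rand(Uniform(cmin, cmax), n)
    global e1_a = sigma_e_a*randn(n); global e1_z = sigma_e_z*randn(n); global e1_r = sigma_e_r*randn(n)
    global e2_a = sigma_e_a*randn(n); global e2_z = sigma_e_z*randn(n); global e2_r = sigma_e_r*randn(n)

    grads = gradient(() -> Objective(params), params)
    Flux.Optimise.update!(optimizer, params, grads)
    push!(losses, Objective(params))  # logging, outside gradient context
end

# the loss is a stochastic estimate of a squared expectation, so individual epochs can
# come out slightly negative; I plot it on a linear scale
plot(losses, xlabel = "epoch", ylabel = "loss", legend = false)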