Hi,
I’m trying to learn reinforcement learning using Julia. I tried to implement the REINFORCE algorithm (a simple policy-gradient method) using Julia and Flux, basically translating this Python code.
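If I understand the algorithm correctly, the objective being minimised is the usual REINFORCE loss, L(θ) = -Σ_t log π_θ(a_t | s_t) · R_t, where R_t = r_t + γ·R_{t+1} are the discounted returns (γ = 0.99 here), normalised to zero mean and unit variance before the gradient step. That is what policy_loss and finish_episode below are meant to compute.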
This is my attempt:
using PyCall
using Distributions
using DistributionsAD
using Flux
using Statistics   # for mean/std in the return normalisation
function run()
    # init environment
    gym = pyimport("gym")
    env = gym.make("CartPole-v1")
    state_dim = env.observation_space.shape[1]
    action_dim = env.action_space.n
    # policy network
    π_θ = policy(state_dim, action_dim)
    # optimizer
    opt = ADAM(1e-2)
    main(env, π_θ, opt)
end
function policy(input, output)
    Chain(Dense(input, 128),
          Dropout(0.6),
          x -> relu.(x),
          Dense(128, output),
          softmax)
end
# sample an action from the categorical distribution given by the policy
act(π, s) = Categorical(π(s)) |> rand
# log-probability of action a in state s under the current policy
log_prob(π, s, a) = logpdf(Categorical(π(s)), a)
function policy_loss(π, states, actions, returns)
    pl = 0.0
    for (s, a, R) in zip(states, actions, returns)
        pl -= log_prob(π, s, a) * R
    end
    pl
end
function finish_episode(π, rewards, actions, states, opt)
    # compute discounted returns, then normalise them
    R = 0.0
    returns = Float64[]
    for r in reverse(rewards)
        R = r + 0.99 * R
        prepend!(returns, R)
    end
    returns = (returns .- mean(returns)) ./ (std(returns) + eps())
    # policy-gradient step
    θ = Flux.params(π)
    gθ = gradient(() -> policy_loss(π, states, actions, returns), θ)
    Flux.Optimise.update!(opt, θ, gθ)
end
function main(env, π_θ, opt)
    running_reward = 10.0
    for i in 1:10000
        rewards = Float64[]
        states = Any[]
        actions = Any[]
        ep_reward = 0.0
        state = env.reset()
        for _ in 1:10000
            # env.render()
            action = act(π_θ, state)
            state, reward, done, info = env.step(action - 1)   # gym actions are 0-based
            push!(rewards, reward)
            push!(states, state)
            push!(actions, action)
            ep_reward += reward
            done && break
        end
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode(π_θ, rewards, actions, states, opt)
        if i % 10 == 0
            println("Episode $i,\t Last reward: $ep_reward\t Average reward: $running_reward")
        end
    end
end
After a couple of epochs the model outputs NaN. I’m guessing that the gradient is too big or something, but I can’t figure it out.
My question is: how do you debug Flux gradients? Or maybe I’m not using Flux right?
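For what it’s worth, the only thing I could think of was to scan the gradients for non-finite values right after computing them in finish_episode, roughly like this (I’m not sure this is the intended way to inspect a Zygote Grads object, so treat it as a sketch):

# rough sketch of what I mean by "debugging gradients": after computing gθ in
# finish_episode, check every parameter's gradient for NaN/Inf before the update
θ  = Flux.params(π_θ)
gθ = gradient(() -> policy_loss(π_θ, states, actions, returns), θ)
for p in θ
    g = gθ[p]                      # gradient w.r.t. this parameter array
    g === nothing && continue      # parameter did not participate in the loss
    if any(!isfinite, g)
        @warn "non-finite gradient" size(p) count(!isfinite, g)
    end
end

Even if that works, it only tells me that something blew up, not where it comes from, so any pointers on a better workflow would be appreciated.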