Debugging Flux NaN problem

Hi,
I’m trying to learn reinforcement learning using Julia. I tried to implement the REINFORCE algorithm (a simple policy gradient method) using Julia and Flux, basically translating this Python code.

Here is my attempt:

using PyCall
using Distributions
using DistributionsAD
using Flux

function run()
    # init environment
    gym = pyimport("gym")
    env = gym.make("CartPole-v1")
    state_dim = env.observation_space.shape[1]
    action_dim = env.action_space.n

    # policy
    π_θ = policy(state_dim, action_dim)
    # optimizer
    opt = ADAM(1e-2)
    main(env,π_θ,opt)
end

function policy(input,output)
    Chain(Dense(input,128),
          Dropout(0.6),
          x->relu.(x),
          Dense(128,output),
          softmax)
end

# sample an action from the categorical distribution given by the policy's softmax output
act(π,s) = Categorical(π(s))|>rand

# log-probability of taking action a in state s under the current policy
log_prob(π,s,a) = logpdf(Categorical(π(s)), a)

# REINFORCE loss: negative sum of log π(a|s) weighted by the (normalized) return
function policy_loss(π, states, actions, returns)
    pl = 0.0
    for (s,a,R) in zip(states, actions, returns)
        pl -= log_prob(π,s,a)*R
    end
    pl
end


function finish_episode(π, rewards, actions, states, opt)
    # compute discounted returns, walking backwards through the episode (γ = 0.99)
    R = 0.0
    returns = Float64[]
    for r in reverse(rewards)
        R = r+0.99*R
        pushfirst!(returns, R)
    end
    # normalize the returns to zero mean and unit variance
    returns = (returns .- mean(returns) )./ (std(returns) + eps())
    θ = Flux.params(π)
    gθ = gradient(()->policy_loss(π, states, actions, returns), θ)
    Flux.Optimise.update!(opt,θ,gθ)
end

function main(env,π_θ,opt)
    running_reward = 10.0
    for i in 1:10000
        rewards = Float64[]
        states = Any[]
        actions = Any[]
        ep_reward = 0.0
        state = env.reset()
        for _=1:10000
            #env.render()
            action = act(π_θ,state)
            state, reward, done, info = env.step(action-1)
            push!(rewards, reward)
            push!(states, state)
            push!(actions, action)
            ep_reward+=reward
            done&&break
        end
        running_reward = 0.05 * ep_reward + (1-0.05) * running_reward
        
        finish_episode(π_θ, rewards, actions, states ,opt)
        
        if i%10 == 0
            println("Episode $i,\t Last reward: $ep_reward\t Avarage reward: $running_reward")
        end
    end   
end

After a couple of epochs the model outputs NaN. I’m guessing that the gradient is too big or something, but I can’t figure it out.
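
For reference, this is the kind of check I had in mind, inspecting the gradients right before update! in finish_episode (just a sketch; check_grads is a helper name I made up, not something from Flux):

# Sketch: look through the Zygote gradients for NaN/Inf entries and print the
# largest magnitude, to see whether the gradient really is blowing up.
function check_grads(gθ, θ)
    for p in θ
        g = gθ[p]                     # Grads can be indexed by the parameter array
        g === nothing && continue     # parameter not touched by the loss
        if any(isnan, g) || any(isinf, g)
            println("NaN/Inf gradient for a parameter of size $(size(p))")
        end
        println("max |grad| = $(maximum(abs, g))")
    end
end

I would call it as check_grads(gθ, θ) right after the gradient(...) line, but I’m not sure this is the right way to go about it.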

My question is: how do you debug Flux gradients? Or maybe I’m not using Flux right?