Hello everyone, I’m new to Julia, and as a toy project to learn it I decided to implement a simple Deep Q-Network (DQN). I’ve been actively playing with (deep) reinforcement learning and have implemented various algorithms in Python, from table-based methods through Q-networks to DDPG/TD3, SAC, MAPPO and so on.
Now I’m trying to do the same thing in Julia and am failing miserably, to the point where I really need your help. Here’s what I did: I forked an old wrapper for the Python Gym package (formerly OpenAI Gym) that provides a simple interface to various toy environments for testing RL algorithms (this one). I’m trying to solve CartPole-v1, a really simple environment.
I then created table-based SARSA/Q-learning agents and they more or less learned (on my own discretization of the environment; the discretization isn’t needed for the neural net). Next I used Flux to implement a simple neural network with the absolutely minimal layout of Dense(4 => 8, gelu), Dense(8 => 2, identity). This tiny network has proven to work reliably in Python with TensorFlow: no matter the RL hyperparameters (e.g. the discount factor, epsilon as in the epsilon-greedy policy, etc.), within 1000 episodes I always reach an average return of 100+ (measured on an evaluation environment where the greedy action is always chosen, demonstrating the network’s capability without the exploration noise). Even 4 hidden neurons were enough.
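For reference, here is a minimal Flux sketch of that architecture together with greedy action selection (the 4 and 2 come from CartPole’s observation and action dimensions; everything else is purely illustrative):

using Flux

# The tiny architecture described above: 4 observations -> 8 hidden units (gelu) -> 2 action values.
model = Chain(
    Dense(4 => 8, gelu),
    Dense(8 => 2, identity),
)

# Greedy action for a single CartPole observation (a 4-element vector).
obs = rand(Float32, 4)
q_values = model(obs)              # 2-element vector of estimated action values
greedy_action = argmax(q_values)   # 1-based action index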
Just for reference, in Python I used the Adam optimizer with a 0.1 learning rate (yes, that high; it works reliably), a 0.99 discount factor, epsilon decaying linearly from 0.8 to 0.1 within 1000 episodes (sketched below), everything else at its default (e.g. weight initialization), and a main network plus a target network, with the target network updated after every episode. But it isn’t that sensitive to the hyperparameters in this simple environment: it learns even with epsilon (the exploration factor) fixed at 0.1, and with other target-network update frequencies too.
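The epsilon decay (used identically in the Julia version below) is just a linear interpolation; assuming the usual lerp definition (my actual helper lives in my own Projekt package), the schedule amounts to:

# Plain linear interpolation; t runs from 0 to 1 over the decay period.
lerp(a, b, t) = a + (b - a) * t

# Example: epsilon halfway through a 1000-episode decay from 0.8 down to 0.1.
eps = lerp(0.8, 0.1, 500 / 1000)   # 0.45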
Now, in Julia, I tried to copy this setup as closely as possible and have tried every combination of parameters I could think of, but none of it works. The only difference is that in Python I used gym version 0.26.1 and in Julia 0.11.0. Throughout all my testing, though, the Gym wrapper itself has worked properly and every method behaves as expected; for instance, a quick random-action rollout like the one below runs and terminates fine.
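Here is roughly what that sanity check looks like (it uses only the GymWrapper calls that also appear in my DQN code; the 0 and 1 actions are hard-coded because CartPole has exactly two actions on the Python side):

using GymWrapper

# Roll out one episode with a uniformly random policy and report the return.
function random_rollout(env)
    R = 0.0
    GymWrapper.reset!(env)
    while !GymWrapper.finished(env)
        r, _ = GymWrapper.step!(env, rand(0:1)) # CartPole's two actions are 0 and 1 on the Python side
        R += r
    end
    return R
end

env = GymEnv(:CartPole, :v1)
@info "Random policy return: $(random_rollout(env))"
close(env)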
The rest is my DQN code (I’m a newbie, so it really is a mess). First the DQN itself:
using Projekt
using GymWrapper
using Flux
using CUDA
using StatsBase
import Projekt.train!
export DQNetwork
struct DQNetwork
    network                      # the network itself
    device                       # device to run the network on
    optimiser                    # the optimiser state
    layers                       # the individual layers
    input_n::Int64               # currently no support for CNNs
    action_space::Vector{Int64}  # used for random action sampling
    action_n::Int64              # just a helper field for quick access
    discount_factor::Float64     # the discount factor as per value / policy iteration
    learning_rate::Float64       # training learning rate

    function DQNetwork(observation_n, action_space; hidden_layer_size::Int64 = 32, hidden_layer_num::Int64 = 1,
                       discount_factor::Float64 = 0.99, learning_rate::Float64 = 0.01, use_cuda::Bool = false)
        (discount_factor <= 1.0 && discount_factor > 0) || error("Discount factor must be a float from the range (0, 1]!")
        learning_rate > 0 || error("The learning rate must be a positive real number!")
        input_n = observation_n
        action_space_n = length(action_space)
        device = cpu
        if use_cuda && CUDA.functional()
            device = gpu
        end
        # Build the model: input layer, (hidden_layer_num - 1) hidden layers, and the output layer.
        layers = []
        push!(layers, Dense(input_n => hidden_layer_size, gelu; init=Flux.glorot_uniform) |> device)
        for i ∈ 1:hidden_layer_num
            if i == hidden_layer_num
                push!(layers, Dense(hidden_layer_size => action_space_n, identity; init=Flux.glorot_uniform) |> device)
            else
                push!(layers, Dense(hidden_layer_size => hidden_layer_size, gelu; init=Flux.glorot_uniform) |> device)
            end
        end
        #model(x) = foldl((in, out) -> out(in), layers, init = x)
        model = Chain(layers...) |> device
        optimiser = Flux.setup(Adam(learning_rate), model)
        new(model, device, optimiser, layers, input_n, action_space.items, action_space_n, discount_factor, learning_rate)
    end
end
# Predict Q-values; the batch is the last dimension of the output.
function predict_values(agent::DQNetwork, observations)
    agent.network(Flux.batch(observations) |> agent.device)
end

function predict_values(agent::DQNetwork, observation::Vector{Float64})
    agent.network(Flux.batch([observation]) |> agent.device)[:, 1]
end
# Choose an epsilon-greedy action: with probability epsilon pick a random action, otherwise the greedy one.
function choose_action(agent::DQNetwork, observation; epsilon::Float64 = 0.)
    if rand() < epsilon
        return rand(agent.action_space) + 1 # the action space comes from Python, so shift to Julia's 1-based indexing
    end
    predicted_values = predict_values(agent, observation)
    return argmax(predicted_values)
end
# Train the network on a batch of samples (the batch is the last dimension).
function train_on_sample!(agent::DQNetwork, observations::Vector{Vector{Float64}}, values::Vector{Vector{Float64}})
    observations_batch = Flux.batch(observations) |> agent.device
    values = Flux.batch(values) |> agent.device
    # Calculate the gradient of the objective with respect to the parameters of the model:
    grads = Flux.withgradient(agent.network) do net
        predicted_values = net(observations_batch)
        loss = mean((predicted_values - values) .^ 2)
    end
    # Update the parameters so as to reduce the objective, according to the chosen optimisation rule.
    # Since the optimiser state comes from Flux.setup, the explicit-style Flux.update! is the matching call.
    Flux.update!(agent.optimiser, agent.network, grads.grad[1])
    return grads.val
end
#unzip(a) = map(x->getfield.(a, x), fieldnames(eltype(a))) # proudly copy-pasted from https://stackoverflow.com/a/53645744
unzip(a) = [getindex.(a, i) for i in 1:length(a[1])]
function train!(agent::DQNetwork, target_agent::DQNetwork, env::E, eval_env::E; batch_size=32, episodes=100, target_update_frequency=20, should_render=false, render_each=100,
                epsilon_init=1., epsilon_final=.1, epsilon_final_at=1000, close_env=false, quiet=false, eval_episodes=100) where {E <: GymWrapper.AbstractEnvironment}
    (epsilon_init <= 1.0 && epsilon_init >= 0) && (epsilon_final <= 1.0 && epsilon_final >= 0) || error("Epsilon is a probability and thus must be from the range [0, 1]!")
    eps = epsilon_init
    replay_buffer = [] # tuples of state(Vector), action(Int), reward(Float), done(Bool), next_state(Vector)
    returns = zeros(100)
    avg_return = 0
    action_n = agent.action_n
    values = [zeros(action_n) for i in 1:batch_size]
    for i ∈ 1:episodes
        T = R = 0
        s = Array(GymWrapper.reset!(env))
        while !GymWrapper.finished(env)
            if should_render && i % render_each == 0
                GymWrapper.render(env)
            end
            action = choose_action(agent, s; epsilon=eps)
            r, s′ = GymWrapper.step!(env, action - 1) # convert back to Python's 0-based action indexing
            s′ = Array(s′)
            done = GymWrapper.finished(env)
            push!(replay_buffer, (s, action, r, done, s′))
            s = s′
            R += r
            T += 1
            if length(replay_buffer) >= batch_size
                batch = sample(replay_buffer, batch_size)
                states, actions, rewards, dones, next_states = unzip(batch)
                predicted_values = predict_values(target_agent, next_states) |> cpu
                # Bootstrapped target for the taken action: r + γ * max_a Q_target(s′, a), unless terminal.
                next_values = vec(rewards) .+ (1 .- vec(dones)) .* agent.discount_factor .* vec(maximum(predicted_values, dims=1))
                for b ∈ 1:batch_size
                    a_rb = actions[b]
                    for a ∈ 1:action_n
                        # The taken action gets the bootstrapped target; the other actions keep the
                        # target network's prediction for the next state as their regression target.
                        values[b][a] = ifelse(a_rb == a, next_values[b], predicted_values[a, b])
                    end
                end
                loss = train_on_sample!(agent, Vector{Vector{Float64}}(states), values)
            end
        end
        if i % target_update_frequency == 0
            Flux.loadmodel!(target_agent.network, agent.network)
        end
        if i <= epsilon_final_at
            # linearly anneal epsilon (lerp is the linear-interpolation helper sketched near the top of the post)
            eps = lerp(epsilon_init, epsilon_final, i / epsilon_final_at)
        end
        returns[(i - 1) % 100 + 1] = R
        if i % 100 == 0
            avg_return = mean(returns)
            if !quiet
                @info("Episode $i finished after $T steps. Avg. reward per last 100 episodes: $avg_return. Epsilon: $eps")
            end
            eval_returns = zeros(eval_episodes)
            for e ∈ 1:eval_episodes
                R = 0
                s = Array(GymWrapper.reset!(eval_env))
                while !GymWrapper.finished(eval_env)
                    r, s′ = GymWrapper.step!(eval_env, choose_action(agent, s) - 1) # greedy action (epsilon = 0)
                    R += r
                    s = Array(s′)
                end
                eval_returns[e] = R
            end
            @info("Avg. eval return: " * string(mean(eval_returns)))
        end
    end
    if close_env
        close(env)
    end
    return avg_return
end
Now the test script:
using Revise
using Projekt
using GymWrapper
using Flux
env = GymEnv(:CartPole, :v1)
eval_env = GymEnv(:CartPole, :v1)
action_space = actions(env)
observation_space = observations(env)
agent = DQNetwork(length(observation_space.lo), action_space; hidden_layer_size = 8, hidden_layer_num = 1, discount_factor = 0.99, learning_rate = 0.1, use_cuda = false)
target_agent = DQNetwork(length(observation_space.lo), action_space; hidden_layer_size = 8, hidden_layer_num = 1, discount_factor = 0.99, learning_rate = 0.1, use_cuda = false)
Flux.loadmodel!(target_agent.network, agent.network)
train!(agent, target_agent, env, eval_env; target_update_frequency=20, batch_size=32, episodes=10000, should_render=false, render_each=1000, epsilon_init=.8, epsilon_final=.1, epsilon_final_at=1000, close_env=true, quiet=false)