Hello. I’m relatively new to Julia; I’ve only been coding in it for the last three months. I’m trying to create a DQN RL agent that finds the optimal path while also choosing the maximum speed along it. However, it just won’t learn, and I really can’t see why. Can someone please help me? The script is below.
```julia
using ReinforcementLearning
using Flux #Needed for the neural network functionality
using Plots
using DelimitedFiles #Needed to read all the txt files
using PolygonOps
using Random
using Intervals
#GeoBoundariesManipulation
include(joinpath(pwd(),"GeoBoundariesManipulation.jl"));
using .GeoBoundariesManipulation
#My problem's parameters
struct ShippingEnvParams
gridworld_dims::Tuple{Int64,Int64} #Gridworld dimensions
velocities::Vector{Int64} #available velocities from 6 knots to 20 knots
acceleration::Vector{Int64} #available acceleration per step: -2, 0, 2
heading::Vector{CartesianIndex{2}} #all heading manoeuvres
punishment::Int64 #punishment per ordinary step
out_of_grid_punishment::Int64 #punishment for moving into an island or out of the grid bounds
StartingPoint::CartesianIndex{2}
GoalPoint::CartesianIndex{2}
all_polygons::Vector{Vector{Tuple{Float64,Float64}}} #all the boundaries
end
function ShippingEnvParams(;
gridworld_dims = (50,50),
velocities = Vector((6:2:20)),
acceleration = Vector((-2:2:2)),
heading = [CartesianIndex(0,1);CartesianIndex(0,-1);CartesianIndex(-1,0);CartesianIndex(-1,1);CartesianIndex(-1,-1);CartesianIndex(1,-1);CartesianIndex(1,1);CartesianIndex(1,0)],
punishment = -5,
out_of_grid_punishment = -10,
StartingPoint = GeoBoundariesManipulation.GoalPointToCartesianIndex((-6.733535,61.997345),gridworld_dims[1],gridworld_dims[2]),
EndingPoint = GeoBoundariesManipulation.GoalPointToCartesianIndex((-6.691500,61.535580),gridworld_dims[1],gridworld_dims[2]),
AllPolygons = GeoBoundariesManipulation.load_files("finalboundaries")
)
ShippingEnvParams(
gridworld_dims,
velocities,
acceleration,
heading,
punishment,
out_of_grid_punishment,
StartingPoint,
EndingPoint,
AllPolygons
)
end
###ENVIRONMENT CONSTRUCTION
#Instance
mutable struct ShippingEnv <: AbstractEnv
params::ShippingEnvParams
action_space::Base.OneTo{Int64}
#observation_space::Space{Vector{UnitRange{Int64}}} #state_space
observation_space::Space{Vector{Interval{Int64,Closed,Closed}}}
state::Vector{Int64} #state: (position,velocity)
action::Int64 #action: (heading_angle,acceleration)
done::Bool #checks if agent has reached its goal
position::CartesianIndex{2}
time::Float64
velocity::Int64
distance::Float64
reward::Union{Nothing,Float64}
end
function ShippingEnv()
params1 = ShippingEnvParams()
env = ShippingEnv(
params1,
Base.OneTo(length(params1.heading)*length(params1.acceleration)),
#Space((1:length(params1.heading),1:length(params1.acceleration))), #Space: (1-number of heading options, 1-number of acceleration options)
#Space([1..params.gridworld_dims[1]*params.gridworld_dims[2],minimum(params.velocities)..maximum(params.velocities)]),
Space([0..1,0..1]),
#Space([1:(params1.gridworld_dims[1]*params1.gridworld_dims[2]),(1:length(params1.velocities))]), #(1-number of grid tiles, 1-number of velocity options)
[LinearIndices((params1.gridworld_dims[1],params1.gridworld_dims[2]))[params1.StartingPoint],1],
rand(1:length(params1.heading)*length(params1.acceleration)), #initialize with a random action
false,
params1.StartingPoint,
0.0,
params1.velocities[1],
0.0,
0.0
)
reset!(env)
env
end
function state_normalization(m::ShippingEnv)
max_st_position = m.params.gridworld_dims[1]*m.params.gridworld_dims[2]
min_st_position = 1
max_st_velocity = length(m.params.velocities)
min_st_velocity = 1
position = (m.state[1] - min_st_position)/(max_st_position-min_st_position)
velocity = (m.state[2] - min_st_velocity)/(max_st_velocity-min_st_velocity)
return [position,velocity]
end
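#Quick worked example for the 50x50 grid and the 8 velocity levels:
#state = [1250,4] -> position ≈ (1250-1)/(2500-1) ≈ 0.4998, velocity = (4-1)/(8-1) ≈ 0.4286,
#so the network always receives inputs scaled to [0,1].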
#Minimal interfaces implemented
RLBase.action_space(m::ShippingEnv) = m.action_space
RLBase.state_space(m::ShippingEnv) = m.observation_space
RLBase.reward(m::ShippingEnv) = m.done ? 0.0 : m.reward
RLBase.is_terminated(m::ShippingEnv) = m.done
RLBase.state(m::ShippingEnv) = state_normalization(m)
#Random.seed!(m::ShippingEnv,seed) = Random.seed!(m.rng,seed)
function RLBase.reset!(m::ShippingEnv)
m.position = m.params.StartingPoint
m.velocity = m.params.velocities[1]
m.done = false
m.time = 0
m.distance = 0
m.reward = 0.0
m.state = [LinearIndices((m.params.gridworld_dims[1],m.params.gridworld_dims[2]))[m.params.StartingPoint],1] #also reset the observed state, otherwise the last state of the previous episode leaks into the new one
end
#Action Space Map Parameters: Object Construction
struct as_map_params
nheading::Int64
nacceleration::Int64
nvelocities::Int64
end
function as_map_params(;
shipping_env_params = ShippingEnvParams(),
nheading = length(shipping_env_params.heading),
nacceleration = length(shipping_env_params.acceleration),
nvelocities = length(shipping_env_params.velocities)
)
as_map_params(
nheading,
nacceleration,
nvelocities
)
end
function as_map(;map_params = as_map_params())
arr_heading = collect(1:map_params.nheading)
arr_acceleration = collect(1:map_params.nacceleration)
arr_velocities = collect(1:map_params.nvelocities)
arr = [arr_heading, arr_acceleration, arr_velocities]
temp_arr = collect(Base.product(arr[1],arr[2]))
i = 3
while i <= length(arr)
temp_arr = collect(Base.product(temp_arr,arr[i]))
i += 1
end
final_arr = vec(temp_arr)
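#Helper that flattens the nested ((heading,acceleration),velocity) tuples produced above into flat (heading,acceleration,velocity) triples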
function remove_internal_tuples(vect)
d = []
d_internal = []
while first(vect)[1] isa Tuple
d = []
for i in 1:length(vect)
empty!(d_internal)
for ii in 1:length(first(vect[i]))
push!(d_internal,first(vect[i])[ii])
end
for iii in 2:length(vect[i])
push!(d_internal, vect[i][iii])
end
append!(d,d_internal)
end
vect = []
push!(vect,d)
end
function number_to_tuples(vect)
t = []
for i in 1:3:length(vect)
push!(t,(vect[i],vect[i+1],vect[i+2]))
end
return t
end
return number_to_tuples(vect[1])
end
all_actions = remove_internal_tuples(final_arr)
return all_actions
end
global all_actions = as_map();
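#Sanity check (my own sketch): the first 24 entries of all_actions enumerate the (heading,acceleration)
#index pairs in column-major order, i.e. action a corresponds to CartesianIndices((8,3))[a]
#(8 headings x 3 accelerations); only these two entries are used when an action is applied below.
@assert all(all_actions[a][1:2] == Tuple(CartesianIndices((8,3))[a]) for a in 1:24)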
#Function defining what happens every time an action is made
function (m::ShippingEnv)(a::Int64)
nextstep(m,all_actions[a][1],all_actions[a][2])
end
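#Example of the decoding above: action a = 10 gives all_actions[10] == (2,2,1),
#i.e. heading index 2 (CartesianIndex(0,-1)) and acceleration index 2 (0 knots per step).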
function nextstep(m::ShippingEnv, head_action, acceleration)
heading = m.params.heading[head_action]
r = m.params.punishment #default punishment for an ordinary step
m.position += heading
dist_covered = sqrt(heading[1]^2 + heading[2]^2)
m.distance += dist_covered
next_state_norm = (m.position[1]/m.params.gridworld_dims[1],m.position[2]/m.params.gridworld_dims[2]) #normalized coordinates for the inanypolygon check
#Check if next state is out of bounds and assign appropriate punishment
if m.position[1]<1 || m.position[1]>m.params.gridworld_dims[1] || m.position[2]<1 || m.position[2]>m.params.gridworld_dims[2] || inanypolygon(next_state_norm, m.params.all_polygons)
r = m.params.out_of_grid_punishment #replace punishment
m.position -= heading
m.distance -= dist_covered
end
#Check whether velocity + acceleration stays within the velocity bounds
current_acceleration = m.params.acceleration[acceleration] #actual acceleration in knots per step
if (m.velocity + current_acceleration) >= minimum(m.params.velocities) && (m.velocity + current_acceleration) <= maximum(m.params.velocities) #>= and <= so the boundary speeds (6 and 20 knots) stay reachable
m.velocity += current_acceleration
end
m.time = dist_covered/m.velocity
#m.reward = r -m.time
m.reward = r
m.done = m.position == m.params.GoalPoint
m.state = [LinearIndices((m.params.gridworld_dims[1],m.params.gridworld_dims[2]))[m.position],first(findall(x->x==m.velocity,m.params.velocities))]
end
#DQN agent
function agent_construction(;
hidden1=40,
hidden2=50,
updhor = 2)
agent = Agent(
policy=QBasedPolicy(
#DQNLearner will be used because of the option for double dqn.
learner=DQNLearner(
approximator=NeuralNetworkApproximator(
model = Chain(
Dense(length(state(env)),hidden1,sigmoid),
Dense(hidden1,hidden2,sigmoid),
Dense(hidden2,length(action_space(env)),sigmoid)
),
optimizer = ADAM(0.001),
),
target_approximator = NeuralNetworkApproximator(
model = Chain(
Dense(length(state(env)),hidden1,sigmoid),
Dense(hidden1,hidden2,sigmoid),
Dense(hidden2,length(action_space(env)),sigmoid)
),
optimizer = ADAM(0.001),
),
loss_func = Flux.huber_loss,
γ = 0.99f0, #discount rate
batch_size = 50, #mini batch_size
update_horizon = updhor, #G = r .+ γ^n .* (1 .- t) .* q′
#---min_replay_history
#the number of transitions that must be collected in the replay buffer before the approximator starts updating (the replay_start_size)
min_replay_history = 25,
update_freq = 4, #the frequency of updating the approximator
#---target_update_freq
#how frequently we sync model weights from the main DQN network to the target DQN network
#(how many frames in between syncing)
target_update_freq = 100,
stack_size = nothing, #use the recent stack_size frames to form a stacked state
traces = SARTS, #current state, action, reward, terminal, next state
rng = Random.GLOBAL_RNG,
is_enable_double_DQN = true #enable double dqn, enabled by default
),
explorer = EpsilonGreedyExplorer(;
kind = :linear,
step = 1, #record the current step
ϵ_init = 0.99, #initial epsilon
warmup_steps = 0, #the number of steps to use ϵ_init
decay_steps = 0, #the number of steps for epsilon to decay from ϵ_init to ϵ_stable
ϵ_stable = 0.1, #the epsilon after warmup_steps + decay_steps
is_break_tie = true, #randomly select among actions that share the maximum value if set to true
rng = Random.GLOBAL_RNG, #set the internal rng
is_training = true #set to true during training; when false the explorer acts greedily (ϵ = 0) and the step counter is not updated
)
),
#A trajectory is the sequence of what has happened over a set of consecutive timesteps
trajectory=CircularArraySARTTrajectory(;
capacity = 200,
state = Vector{Float64} => (length(state(env)),),
# action = Int => (),
# reward = Float32 => (),
# terminal = Bool => (),
) #when using a NN you have to change from VectorSARTTrajectory to CircularArraySARTTrajectory
)
return agent
end
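#With this environment the approximator has length(state(env)) == 2 inputs (normalized position
#and velocity index) and length(action_space(env)) == 24 outputs (8 headings x 3 accelerations).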
#Customized hook
Base.@kwdef mutable struct customized_hook <: AbstractHook
velocity::Vector{Int64} = []
velocity_total::Vector{Vector{Int64}} = []
position:: Vector{CartesianIndex{2}} = []
position_total:: Vector{Vector{CartesianIndex{2}}} = []
reward::Vector{Float64} = []
reward_total::Vector{Vector{Float64}} =[]
end
(h::customized_hook)(::PostActStage,agent,env) =
begin
push!(h.velocity,env.velocity)
push!(h.position,env.position)
push!(h.reward,env.reward)
end
(h::customized_hook)(::PostEpisodeStage,agent,env) =
begin
h.velocity_total = vcat(h.velocity_total,[h.velocity])
h.position_total = vcat(h.position_total,[h.position])
h.reward_total = vcat(h.reward_total,[h.reward])
end
(h::customized_hook)(::PreEpisodeStage,agent,env) =
begin
h.velocity = []
h.position = []
h.reward = []
end
#Environment defining
env = ShippingEnv()
agent = agent_construction(;hidden1=40,hidden2=50,updhor=1)
#Run Experiment
hook = customized_hook();
stop_condition = StopAfterEpisode(50, is_show_progress = true)
ex = Experiment(agent, env, stop_condition, hook, "#Test")
RLBase.test_runnable!(env)
@time run(ex)
```
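For reference, this is roughly how I inspect the hook afterwards (just a sketch that uses the Plots import above; `reward_total` holds one reward vector per episode):

```julia
#Total reward collected in each episode, taken from the customized hook after the run.
episode_rewards = sum.(hook.reward_total)
plot(episode_rewards, xlabel = "episode", ylabel = "total reward", legend = false)
```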