Hello. I am trying to use reinforcement learning to solve a simple problem as a proof of concept. I am using DeepQLearning.jl and defining an AbstractEnv (the interface from CommonRLInterface). I am getting the following error and cannot figure out how to make it happy. I believe CommonRLInterface and DeepQLearning should work together, but maybe I misunderstood the documentation. Has anyone seen this error and figured out how to fix it?
LoadError: TypeError: in DQExperience, in A, expected A<:(AbstractArray{T<:Real, N} where N), got Type{SVector{2, Integer}}
I figured out that it is referring to this struct in DeepQLearning.jl:
struct DQExperience{N <: Real, T <: Real, A <: AbstractArray{T}}
    s::A
    a::N
    r::T
    sp::A
    done::Bool
end
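As far as I can tell, the constraint that fails is that A must be an AbstractArray whose element type equals the same parameter T used for the reward r. Below is a minimal sketch that reproduces the same kind of TypeError using only that struct and StaticArrays; the Float32 for T is my assumption about what the replay buffer uses for rewards, and the point is only to show the parameter clash:

using StaticArrays

struct DQExperience{N <: Real, T <: Real, A <: AbstractArray{T}}
    s::A
    a::N
    r::T
    sp::A
    done::Bool
end

# OK: the state arrays have eltype Float32, which matches the reward type T = Float32
DQExperience(SA[1.0f0, 2.0f0], 1, 0.5f0, SA[2.0f0, 2.0f0], false)

# TypeError: once T is fixed to Float32, A must be a subtype of AbstractArray{Float32},
# but SVector{2, Integer} has element type Integer
DQExperience{Int64, Float32, SVector{2, Integer}}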
Source code of my project:
# A simple grid world MDP
# All cells with reward are also terminal
using CommonRLInterface
using StaticArrays
using Compose
using DeepQLearning
using POMDPs
using Flux
using POMDPModels
using POMDPSimulators
using POMDPPolicies
import ColorSchemes
const RL = CommonRLInterface
mutable struct GridWorld <: AbstractEnv
    size::SVector{2, Integer}
    rewards::Dict{SVector{2, Integer}, Float64}
    state::SVector{2, Integer}
end
function GridWorld()
    rewards = Dict(SA[9,3] =>  10.0,
                   SA[8,8] =>   3.0,
                   SA[4,3] => -10.0,
                   SA[4,6] =>  -5.0)
    return GridWorld(SA[10, 10], rewards, SA[rand(1:10), rand(1:10)])
end
RL.reset!(env::GridWorld) = (env.state = SA[rand(1:env.size[1]), rand(1:env.size[2])])
RL.actions(env::GridWorld) = (SA[1,0], SA[-1,0], SA[0,1], SA[0,-1])
POMDPs.actions(env::GridWorld) = RL.actions(env)
RL.observe(env::GridWorld) = env.state
RL.terminated(env::GridWorld) = haskey(env.rewards, env.state)
function RL.act!(env::GridWorld, a)
    if rand() < 0.4 # 40% chance of going in a random direction (=30% chance of going in a wrong direction)
        a = rand(POMDPs.actions(env))
    end
    env.state = clamp.(env.state + a, SA[1,1], env.size)
    return get(env.rewards, env.state, 0.0)
end
# optional functions
@provide RL.observations(env::GridWorld) = [SA[x, y] for x in 1:env.size[1], y in 1:env.size[2]]
@provide RL.clone(env::GridWorld) = GridWorld(env.size, copy(env.rewards), env.state)
@provide RL.state(env::GridWorld) = env.state
@provide RL.setstate!(env::GridWorld, s) = (env.state = s)
@provide function RL.render(env::GridWorld)
    nx, ny = env.size
    cells = []
    for s in observations(env)
        r = get(env.rewards, s, 0.0)
        clr = get(ColorSchemes.redgreensplit, (r+10.0)/20.0)
        cell = context((s[1]-1)/nx, (ny-s[2])/ny, 1/nx, 1/ny)
        compose!(cell, rectangle(), fill(clr), stroke("gray"))
        push!(cells, cell)
    end
    grid = compose(context(), linewidth(0.5mm), cells...)
    outline = compose(context(), linewidth(1mm), rectangle(), stroke("gray"))
    s = env.state
    agent_ctx = context((s[1]-1)/nx, (ny-s[2])/ny, 1/nx, 1/ny)
    agent = compose(agent_ctx, circle(0.5, 0.5, 0.4), fill("orange"))
    w, h = 1.0, 1.0 # draw on the full (relative) canvas
    sz = min(w, h)
    return compose(context((w-sz)/2, (h-sz)/2, sz, sz), agent, grid, outline)
end
# create an instance of the gridworld environment defined above
env = GridWorld();
# Define the Q network (see the Flux.jl documentation).
# The gridworld state is represented by a 2-dimensional vector.
model = Chain(Dense(2, 32), Dense(32, length(POMDPs.actions(env))))
exploration = EpsGreedyPolicy(env, LinearDecaySchedule(start=1.0, stop=0.01, steps=10000/2))
solver = DeepQLearningSolver(qnetwork = model, max_steps = 10000,
                             exploration_policy = exploration,
                             learning_rate = 0.005, log_freq = 500,
                             recurrence = false, double_q = true,
                             dueling = true, prioritized_replay = true)
policy = solve(solver, env)
sim = RolloutSimulator(max_steps=30)
r_tot = simulate(sim, env, policy)
println("Total discounted reward for 1 simulation: $r_tot")