I recently started working with ReinforcementLearning.jl and went through the provided examples to familiarize myself with the syntax. I have had success with the built-in environments and with most of my own custom environments, except for one. This is the env I wrote:
# helpers: letters a-z are encoded as the integers 1-26
strs_by_size(arr::Array{String,1}, s::Int) = filter(x -> length(x) == s, arr)
ints_to_str(arr::Array{Int,1})::String = join(Char.(arr .+ 96))
str_to_ints(s::String)::Array{Int,1} = map(x -> Int(x) - 96, [i for i in s])
words_arr = load_words()
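load_words() is defined elsewhere in my script; it just returns my word list as a Vector{String} of lowercase a-z words. A minimal stand-in (assuming a hypothetical one-word-per-line file words.txt) would be:

# hypothetical loader, one lowercase word per line
load_words() = [lowercase(strip(line)) for line in eachline("words.txt") if !isempty(strip(line))]

With this encoding, str_to_ints("cab") returns [3, 1, 2] and ints_to_str([3, 1, 2]) returns "cab".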
mutable struct HangmanEnv{T} <: AbstractEnv   # T is the word length
    word_to_guess::Array{Int,1}     # target word, encoded as integers 1-26
    guessed_letters::Array{Int,1}   # letters guessed so far
    current_state::Array{Int,1}     # revealed word; -1 marks an unrevealed position
    remaining_guesses::Int
    HangmanEnv() = begin
        w = rand(words_arr)
        new{length(w)}(
            w |> str_to_ints,
            Int[],
            [-1 for i in 1:length(w)],
            6
        )
    end
    HangmanEnv(n::Int) = begin
        w = rand(strs_by_size(words_arr, n))
        new{length(w)}(
            w |> str_to_ints,
            Int[],
            [-1 for i in 1:length(w)],
            6
        )
    end
    HangmanEnv(w::String) = begin
        new{length(w)}(
            w |> str_to_ints,
            Int[],
            [-1 for i in 1:length(w)],
            6
        )
    end
end
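The three constructors let me create an env with a random word, a random word of a given length, or a fixed word:

HangmanEnv()          # random word from words_arr
HangmanEnv(5)         # random word with exactly 5 letters
HangmanEnv("julia")   # fixed word (useful for testing)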
RLBase.action_space(env::HangmanEnv) = begin
    Base.OneTo(26)
end
RLBase.legal_action_space(env::HangmanEnv) = begin
    filter(
        x -> x ∉ env.guessed_letters,
        Array(Base.OneTo(26))
    )
end
RLBase.legal_action_space_mask(env::HangmanEnv) = begin
    filter(
        x -> x ∉ env.guessed_letters,
        Array(Base.OneTo(26))
    )
end
RLBase.state(env::HangmanEnv) = begin
    env.current_state
end
RLBase.state_space(env::HangmanEnv) = begin
    Space([-1..26 for i in 1:length(env.word_to_guess)])
end
RLBase.reward(env::HangmanEnv) = begin
    if is_terminated(env)
        env.current_state == env.word_to_guess ? 1 : -1
    else
        0
    end
end
RLBase.is_terminated(env::HangmanEnv) = begin
    if env.remaining_guesses ≤ 0 || env.current_state == env.word_to_guess
        return true
    else
        return false
    end
end
RLBase.reset!(env::HangmanEnv) = begin
    w = rand(strs_by_size(words_arr, length(env.word_to_guess)))
    env.word_to_guess = w |> str_to_ints
    env.guessed_letters = Int[]
    env.current_state = [-1 for i in 1:length(w)]
    env.remaining_guesses = 6
end
(env::HangmanEnv)(a::Int) = begin
    if !is_terminated(env)
        push!(env.guessed_letters, a)
        # reveal every position where the guessed letter occurs
        inds = findall(x -> x == a, env.word_to_guess)
        env.current_state[inds] .= a
        # a wrong guess costs one of the 6 remaining guesses
        if length(inds) == 0
            env.remaining_guesses -= 1
        end
    end
end
RLBase.ActionStyle(::HangmanEnv) = FULL_ACTION_SET
RLBase.ChanceStyle(::HangmanEnv) = STOCHASTIC
RLBase.DynamicStyle(::HangmanEnv) = SEQUENTIAL
RLBase.InformationStyle(::HangmanEnv) = PERFECT_INFORMATION
RLBase.NumAgentStyle(::HangmanEnv) = SINGLE_AGENT
RLBase.RewardStyle(::HangmanEnv) = TERMINAL_REWARD
RLBase.StateStyle(::HangmanEnv) = Observation{Array{Int,1}}()
RLBase.UtilityStyle(::HangmanEnv) = GENERAL_SUM
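To illustrate what I mean by playing the env manually below, here is a short interaction with a fixed word (the commented values follow from the definitions above):

env = HangmanEnv("hello")
legal_action_space(env)           # [1, 2, ..., 26], since nothing has been guessed yet
env(8)                            # guess 'h' (= 8): correct, the first position is revealed
env(26)                           # guess 'z' (= 26): wrong, remaining_guesses drops to 5
state(env)                        # [8, -1, -1, -1, -1]
reward(env), is_terminated(env)   # (0, false)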
When I run RLBase.test_runnable!(HangmanEnv()), it completes with no errors. When I play the env manually (as in the snippet above), it also works as expected. However, when I try to apply the BasicDQN agent below, I get an error. (The code is essentially the CartPole example from the documentation, with a few modifications.)
word_len = 5
rng = StableRNG(3435)
env = HangmanEnv(word_len)
ns, na = length(state(env)), length(legal_action_space(env))
agent = Agent(
    policy = QBasedPolicy(
        learner = BasicDQNLearner(
            approximator = NeuralNetworkApproximator(
                model = Chain(
                    Dense(ns, 128, relu; init = glorot_uniform(rng)),
                    Dense(128, 128, relu; init = glorot_uniform(rng)),
                    Dense(128, na; init = glorot_uniform(rng)),
                ) |> cpu,
                optimizer = ADAM(),
            ),
            batch_size = 32,
            min_replay_history = 100,
            loss_func = huber_loss,
            rng = rng,
        ),
        explorer = EpsilonGreedyExplorer(
            kind = :exp,
            ϵ_stable = 0.01,
            decay_steps = 500,
            rng = rng,
        ),
    ),
    trajectory = CircularArraySARTTrajectory(
        capacity = 1000,
        state = Array{Int64,1} => (ns,),
    ),
)
stop_condition = StopAfterEpisode(10000; is_show_progress=false)
hook = TotalRewardPerEpisode()
run(agent, env, stop_condition, hook)
This is the error:
TypeError: non-boolean (Int64) used in boolean context
iterate@iterators.jl:451[inlined]
iterate@generator.jl:44[inlined]
grow_to!(::Vector{Int64}, ::Base.Generator{Base.Iterators.Filter{typeof(last), Base.Iterators.Pairs{Int64, Int64, LinearIndices{1, Tuple{Base.OneTo{Int64}}}, Vector{Int64}}}, typeof(first)})@array.jl:739
collect@array.jl:676[inlined]
findall@array.jl:2194[inlined]
(::ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG})(::Vector{Float32}, ::Vector{Int64})@epsilon_greedy_explorer.jl:132
(::ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}})(::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningBase.FullActionSet, ::Base.OneTo{Int64})@q_based_policy.jl:24
QBasedPolicy@q_based_policy.jl:21[inlined]
Agent@agent.jl:24[inlined]
_run(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, ::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningCore.StopAfterEpisode{Nothing}, ::ReinforcementLearningCore.TotalRewardPerEpisode)@run.jl:26
run(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, ::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningCore.StopAfterEpisode{Nothing}, ::ReinforcementLearningCore.TotalRewardPerEpisode)@run.jl:10
top-level scope@Local: 37[inlined]
I think the error has something to do with how the NeuralNetworkApproximator is choosing actions, but I don't know how to fix it. Could anyone help me identify where the error is occurring and suggest possible solutions?