Issue with ReinforcementLearning.jl BasicDQN with custom environment

I have recently started working with ReinforcementLearning.jl, and I went through the provided examples to familiarize myself with the syntax. I have had success with the built-in environments and with most of my own custom environments, except for one. I wrote the env below:

strs_by_size(arr::Array{String,1}, s::Int) = filter(x -> length(x) == s, arr)
ints_to_str(arr::Array{Int,1})::String = join(Char.(arr .+ 96))
str_to_ints(s::String)::Array{Int,1} = map(x -> Int(x) - 96, collect(s))
words_arr = load_words()  # defined elsewhere; returns a Vector{String}
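
For reference, letters 'a' through 'z' are encoded as the integers 1 through 26, so the helpers round-trip:

str_to_ints("cat") == [3, 1, 20]
ints_to_str([3, 1, 20]) == "cat"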

mutable struct HangmanEnv{T} <: AbstractEnv  # T is the word length, e.g. HangmanEnv{5}
	word_to_guess::Array{Int,1}    # the target word, encoded as ints
	guessed_letters::Array{Int,1}  # letters guessed so far
	current_state::Array{Int,1}    # revealed letters; -1 marks a hidden slot
	remaining_guesses::Int
	HangmanEnv() = HangmanEnv(rand(words_arr))
	HangmanEnv(n::Int) = HangmanEnv(rand(strs_by_size(words_arr, n)))
	HangmanEnv(w::String) = new{length(w)}(
		str_to_ints(w),
		Int[],
		fill(-1, length(w)),
		6,
	)
end

RLBase.action_space(env::HangmanEnv) = begin
	Base.OneTo(26)
end
RLBase.legal_action_space(env::HangmanEnv) = begin
	filter(
		x -> x ∉ env.guessed_letters, 
		Array(Base.OneTo(26))
	)
end
RLBase.legal_action_space_mask(env::HangmanEnv) = begin
	filter(
		x -> x ∉ env.guessed_letters, 
		Array(Base.OneTo(26))
	)
end
RLBase.state(env::HangmanEnv) = begin 
	env.current_state
end
RLBase.state_space(env::HangmanEnv) = begin 
	Space([-1..26 for i in 1:length(env.word_to_guess)])
end
RLBase.reward(env::HangmanEnv) = begin 
	if is_terminated(env)
		env.current_state == env.word_to_guess ? 1 : -1
	else
		0
	end
end
RLBase.is_terminated(env::HangmanEnv) = begin
	env.remaining_guesses ≤ 0 || env.current_state == env.word_to_guess
end
RLBase.reset!(env::HangmanEnv) = begin 
	w = rand(strs_by_size(words_arr, length(env.word_to_guess))) 
	env.word_to_guess = w |> str_to_ints
	env.guessed_letters = Int[]
	env.current_state = [-1 for i in 1:length(w)]
	env.remaining_guesses = 6
end
(env::HangmanEnv)(a::Int) = begin
	if !is_terminated(env)
		push!(env.guessed_letters, a)
		inds = findall(x -> x == a, env.word_to_guess)
		env.current_state[inds] .= a   # reveal every occurrence of the guessed letter
		if isempty(inds)               # a wrong guess costs one life
			env.remaining_guesses -= 1
		end
	end
end

RLBase.ActionStyle(::HangmanEnv) = FULL_ACTION_SET
RLBase.ChanceStyle(::HangmanEnv) = STOCHASTIC
RLBase.DynamicStyle(::HangmanEnv) = SEQUENTIAL
RLBase.InformationStyle(::HangmanEnv) = PERFECT_INFORMATION
RLBase.NumAgentStyle(::HangmanEnv) = SINGLE_AGENT
RLBase.RewardStyle(::HangmanEnv) = TERMINAL_REWARD
RLBase.StateStyle(::HangmanEnv) = Observation{Array{Int,1}}()
RLBase.UtilityStyle(::HangmanEnv) = GENERAL_SUM
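
Playing manually, for example with the HangmanEnv(w::String) constructor, everything behaves as I expect:

env = HangmanEnv("cat")  # word_to_guess == [3, 1, 20]
env(20)                  # guess 't': current_state becomes [-1, -1, 20]
env(2)                   # guess 'b': wrong, remaining_guesses drops from 6 to 5
reward(env)              # 0, since the game is not over yet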

When I run RLBase.test_runnable!(HangmanEnv()), it passes with no errors, and manual play (as above) works as expected. However, when I try to train the BasicDQN agent below, I get an error. (The code is essentially copied from the CartPole example in the documentation, with a few modifications.)

word_len = 5
rng = StableRNG(3435)
env = HangmanEnv(word_len)
ns, na = length(state(env)), length(legal_action_space(env))
agent = Agent(
	policy = QBasedPolicy(
		learner = BasicDQNLearner(
			approximator = NeuralNetworkApproximator(
				model = Chain(
					Dense(ns, 128, relu; init = glorot_uniform(rng)),
					Dense(128, 128, relu; init = glorot_uniform(rng)),
					Dense(128, na; init = glorot_uniform(rng)),
				) |> cpu,
				optimizer = ADAM(),
			),
			batch_size = 32,
			min_replay_history = 100,
			loss_func = huber_loss,
			rng = rng,
		),
		explorer = EpsilonGreedyExplorer(
			kind = :exp,
			ϵ_stable = 0.01,
			decay_steps = 500,
			rng = rng,
		),
	),
	trajectory = CircularArraySARTTrajectory(
		capacity = 1000,
		state = Array{Int64,1} => (ns,),
	),
)
stop_condition = StopAfterEpisode(10000; is_show_progress=false)
hook = TotalRewardPerEpisode()
run(agent, env, stop_condition, hook)

This is the error:

TypeError: non-boolean (Int64) used in boolean context
iterate@iterators.jl:451[inlined]
iterate@generator.jl:44[inlined]
grow_to!(::Vector{Int64}, ::Base.Generator{Base.Iterators.Filter{typeof(last), Base.Iterators.Pairs{Int64, Int64, LinearIndices{1, Tuple{Base.OneTo{Int64}}}, Vector{Int64}}}, typeof(first)})@array.jl:739
collect@array.jl:676[inlined]
findall@array.jl:2194[inlined]
(::ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG})(::Vector{Float32}, ::Vector{Int64})@epsilon_greedy_explorer.jl:132
(::ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}})(::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningBase.FullActionSet, ::Base.OneTo{Int64})@q_based_policy.jl:24
QBasedPolicy@q_based_policy.jl:21[inlined]
Agent@agent.jl:24[inlined]
_run(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, ::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningCore.StopAfterEpisode{Nothing}, ::ReinforcementLearningCore.TotalRewardPerEpisode)@run.jl:26
run(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, ::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningCore.StopAfterEpisode{Nothing}, ::ReinforcementLearningCore.TotalRewardPerEpisode)@run.jl:10
top-level scope@Local: 37[inlined]

Based on the stack trace, I think the error occurs when the EpsilonGreedyExplorer chooses an action from the approximator's output, but I don't know how to fix it. Could anyone help me identify where the error is occurring and suggest possible solutions?


Hi @SatvikDuddukuru ,

This minor fix should be enough:

RLBase.legal_action_space_mask(env::HangmanEnv) = begin
    map(x -> x ∉ env.guessed_letters, 1:26)
end

legal_action_space_mask is expected to return a Bool vector, with one entry per action.
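
A quick sanity check with your env: the true entries of the mask should recover legal_action_space:

env = HangmanEnv(5)
env(1)                                    # guess 'a'
mask = legal_action_space_mask(env)       # 26-element Vector{Bool}, with mask[1] == false
findall(mask) == legal_action_space(env)  # true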

I’ll add a check for this in RLBase.test_runnable! soon. (Actually, this case seems to already be covered in RLBase.test_interfaces!.)

Also note that BasicDQN does not support FULL_ACTION_SET style games yet. You may want to switch to DQN instead.
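
Roughly, the swap looks like this (a sketch based on the JuliaRL_DQN_CartPole experiment; keyword names may vary a bit across versions). DQN maintains a separate target network, and since your game is FULL_ACTION_SET, the trajectory should also record the legal action mask:

build_model() = Chain(
	Dense(ns, 128, relu; init = glorot_uniform(rng)),
	Dense(128, 128, relu; init = glorot_uniform(rng)),
	Dense(128, na; init = glorot_uniform(rng)),
) |> cpu

agent = Agent(
	policy = QBasedPolicy(
		learner = DQNLearner(
			approximator = NeuralNetworkApproximator(
				model = build_model(),
				optimizer = ADAM(),
			),
			target_approximator = NeuralNetworkApproximator(model = build_model()),
			loss_func = huber_loss,
			batch_size = 32,
			min_replay_history = 100,
			update_freq = 4,            # gradient step every 4 env steps
			target_update_freq = 100,   # sync the target network every 100 updates
			rng = rng,
		),
		explorer = EpsilonGreedyExplorer(
			kind = :exp,
			ϵ_stable = 0.01,
			decay_steps = 500,
			rng = rng,
		),
	),
	trajectory = CircularArraySLARTTrajectory(  # SLART = SART plus the legal actions mask
		capacity = 1000,
		state = Vector{Int} => (ns,),
		legal_actions_mask = Vector{Bool} => (na,),
	),
)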

By the way, this seems like an interesting game. Are there any references discussing this kind of game?

Thank you for your help. The model now runs, and I am eager to see how well it can be optimized. As for references to this type of game, I have seen a few GitHub repositories implementing it in Python with OpenAI Gym, but nothing very extensive. If I get it working well, do you think it could become an example Experiment? It is straightforward to understand and different from the other mechanics-based experiments.

That would be great! :heart: