Issue with ReinforcementLearning.jl BasicDQN with custom environment

I have recently started working with ReinforcementLearning.jl, and I went through the provided examples to familiarize myself with the syntax. I have had success with the built-in environments and with most of my own custom environments, except for one. I wrote the env below:

strs_by_size(arr::Array{String,1}, s::Int) = filter(x -> length(x) == s, arr)
ints_to_str(arr::Array{Int,1})::String = join(Char.(arr .+ 96))
str_to_ints(s::String)::Array{Int,1} = map(x -> Int(x) - 96, collect(s))
words_arr = load_words()  # defined elsewhere; returns a Vector{String}
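
For reference, letters 'a' through 'z' are encoded as the integers 1 through 26, so the helpers round-trip:

str_to_ints("cat") == [3, 1, 20]
ints_to_str([3, 1, 20]) == "cat"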

mutable struct HangmanEnv{T} <: AbstractEnv  # T is the word length, e.g. HangmanEnv{5}
	word_to_guess::Array{Int,1}    # the target word, encoded as ints
	guessed_letters::Array{Int,1}  # letters guessed so far
	current_state::Array{Int,1}    # revealed letters; -1 marks a hidden slot
	remaining_guesses::Int
	HangmanEnv() = HangmanEnv(rand(words_arr))
	HangmanEnv(n::Int) = HangmanEnv(rand(strs_by_size(words_arr, n)))
	HangmanEnv(w::String) = new{length(w)}(
		str_to_ints(w),
		Int[],
		fill(-1, length(w)),
		6,
	)
end

RLBase.action_space(env::HangmanEnv) = begin
	Base.OneTo(26)
end
RLBase.legal_action_space(env::HangmanEnv) = begin
	filter(
		x -> x ∉ env.guessed_letters, 
		Array(Base.OneTo(26))
	)
end
RLBase.legal_action_space_mask(env::HangmanEnv) = begin
	filter(
		x -> x ∉ env.guessed_letters, 
		Array(Base.OneTo(26))
	)
end
RLBase.state(env::HangmanEnv) = begin 
	env.current_state
end
RLBase.state_space(env::HangmanEnv) = begin 
	Space([-1..26 for i in 1:length(env.word_to_guess)])
end
RLBase.reward(env::HangmanEnv) = begin 
	if is_terminated(env)
		env.current_state == env.word_to_guess ? 1 : -1
	else
		0
	end
end
RLBase.is_terminated(env::HangmanEnv) = begin
	env.remaining_guesses ≤ 0 || env.current_state == env.word_to_guess
end
RLBase.reset!(env::HangmanEnv) = begin 
	w = rand(strs_by_size(words_arr, length(env.word_to_guess))) 
	env.word_to_guess = w |> str_to_ints
	env.guessed_letters = Int[]
	env.current_state = [-1 for i in 1:length(w)]
	env.remaining_guesses = 6
end
(env::HangmanEnv)(a::Int) = begin
	if !is_terminated(env)
		push!(env.guessed_letters, a)
		inds = findall(x -> x == a, env.word_to_guess)
		env.current_state[inds] .= a   # reveal every occurrence of the guessed letter
		if isempty(inds)               # a wrong guess costs one life
			env.remaining_guesses -= 1
		end
	end
end

RLBase.ActionStyle(::HangmanEnv) = FULL_ACTION_SET
RLBase.ChanceStyle(::HangmanEnv) = STOCHASTIC
RLBase.DynamicStyle(::HangmanEnv) = SEQUENTIAL
RLBase.InformationStyle(::HangmanEnv) = PERFECT_INFORMATION
RLBase.NumAgentStyle(::HangmanEnv) = SINGLE_AGENT
RLBase.RewardStyle(::HangmanEnv) = TERMINAL_REWARD
RLBase.StateStyle(::HangmanEnv) = Observation{Array{Int,1}}()
RLBase.UtilityStyle(::HangmanEnv) = GENERAL_SUM
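
Playing manually, for example with the HangmanEnv(w::String) constructor, everything behaves as I expect:

env = HangmanEnv("cat")  # word_to_guess == [3, 1, 20]
env(20)                  # guess 't': current_state becomes [-1, -1, 20]
env(2)                   # guess 'b': wrong, remaining_guesses drops from 6 to 5
reward(env)              # 0, since the game is not over yet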

When I run RLBase.test_runnable!(HangmanEnv()), it passes with no errors, and manual play (as above) works as expected. However, when I try to train the BasicDQN agent below, I get an error. (The code is essentially copied from the CartPole example in the documentation, with a few modifications.)

word_len = 5
rng = StableRNG(3435)
env = HangmanEnv(word_len)
ns, na = length(state(env)), length(legal_action_space(env))
agent = Agent(
	policy = QBasedPolicy(
		learner = BasicDQNLearner(
			approximator = NeuralNetworkApproximator(
				model = Chain(
					Dense(ns, 128, relu; init = glorot_uniform(rng)),
					Dense(128, 128, relu; init = glorot_uniform(rng)),
					Dense(128, na; init = glorot_uniform(rng)),
				) |> cpu,
				optimizer = ADAM(),
			),
			batch_size = 32,
			min_replay_history = 100,
			loss_func = huber_loss,
			rng = rng,
		),
		explorer = EpsilonGreedyExplorer(
			kind = :exp,
			ϵ_stable = 0.01,
			decay_steps = 500,
			rng = rng,
		),
	),
	trajectory = CircularArraySARTTrajectory(
		capacity = 1000,
		state = Array{Int64,1} => (ns,),
	),
)
stop_condition = StopAfterEpisode(10000; is_show_progress=false)
hook = TotalRewardPerEpisode()
run(agent, env, stop_condition, hook)

This is the error:

TypeError: non-boolean (Int64) used in boolean context
iterate@iterators.jl:451[inlined]
iterate@generator.jl:44[inlined]
grow_to!(::Vector{Int64}, ::Base.Generator{Base.Iterators.Filter{typeof(last), Base.Iterators.Pairs{Int64, Int64, LinearIndices{1, Tuple{Base.OneTo{Int64}}}, Vector{Int64}}}, typeof(first)})@array.jl:739
collect@array.jl:676[inlined]
findall@array.jl:2194[inlined]
(::ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG})(::Vector{Float32}, ::Vector{Int64})@epsilon_greedy_explorer.jl:132
(::ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}})(::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningBase.FullActionSet, ::Base.OneTo{Int64})@q_based_policy.jl:24
QBasedPolicy@q_based_policy.jl:21[inlined]
Agent@agent.jl:24[inlined]
_run(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, ::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningCore.StopAfterEpisode{Nothing}, ::ReinforcementLearningCore.TotalRewardPerEpisode)@run.jl:26
run(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.BasicDQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.ADAM}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, ::Main.workspace1213.HangmanEnv{5}, ::ReinforcementLearningCore.StopAfterEpisode{Nothing}, ::ReinforcementLearningCore.TotalRewardPerEpisode)@run.jl:10
top-level scope@Local: 37[inlined]

Based on the stack trace, I think the error occurs when the EpsilonGreedyExplorer chooses an action from the approximator's output, but I don't know how to fix it. Could anyone help me identify where the error is occurring and suggest possible solutions?


Hi @SatvikDuddukuru ,

This minor fix should be enough:

RLBase.legal_action_space_mask(env::HangmanEnv) = begin
    map(x -> x ∉ env.guessed_letters, 1:26)
end

legal_action_space_mask is expected to return a Bool vector, with one entry per action.
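
A quick sanity check with your env: the true entries of the mask should recover legal_action_space:

env = HangmanEnv(5)
env(1)                                    # guess 'a'
mask = legal_action_space_mask(env)       # 26-element Vector{Bool}, with mask[1] == false
findall(mask) == legal_action_space(env)  # true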

I’ll add a check for this in RLBase.test_runnable! soon. (Actually, this case seems to already be covered in RLBase.test_interfaces!.)

Also note that BasicDQN does not support FULL_ACTION_SET style games yet. You may want to switch to DQN instead.
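
Roughly, the swap looks like this (a sketch based on the JuliaRL_DQN_CartPole experiment; keyword names may vary a bit across versions). DQN maintains a separate target network, and since your game is FULL_ACTION_SET, the trajectory should also record the legal action mask:

build_model() = Chain(
	Dense(ns, 128, relu; init = glorot_uniform(rng)),
	Dense(128, 128, relu; init = glorot_uniform(rng)),
	Dense(128, na; init = glorot_uniform(rng)),
) |> cpu

agent = Agent(
	policy = QBasedPolicy(
		learner = DQNLearner(
			approximator = NeuralNetworkApproximator(
				model = build_model(),
				optimizer = ADAM(),
			),
			target_approximator = NeuralNetworkApproximator(model = build_model()),
			loss_func = huber_loss,
			batch_size = 32,
			min_replay_history = 100,
			update_freq = 4,            # gradient step every 4 env steps
			target_update_freq = 100,   # sync the target network every 100 updates
			rng = rng,
		),
		explorer = EpsilonGreedyExplorer(
			kind = :exp,
			ϵ_stable = 0.01,
			decay_steps = 500,
			rng = rng,
		),
	),
	trajectory = CircularArraySLARTTrajectory(  # SLART = SART plus the legal actions mask
		capacity = 1000,
		state = Vector{Int} => (ns,),
		legal_actions_mask = Vector{Bool} => (na,),
	),
)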

By the way, this seems like an interesting game. Are there any references discussing this kind of game?

Thank you for your help. The model now runs, and I am eager to see how well it can be optimized. As for references to this type of game, I have seen a few GitHub repositories implementing it in Python with OpenAI Gym, but nothing very extensive. If I get it working well, do you think it could become an example Experiment? It is straightforward to understand and different from the other mechanics-based experiments.

That would be great! :heart: