Error with CircularArrayBuffers in ReinforcementLearning.jl

I made a Connect 4 Environment with ReinforcementLearning.jl:

begin
	
	mutable struct Connect4Env <: AbstractEnv
		board::Array{Int,2}
		current_player::Int
		full_columns::Array{Int,1}
		is_terminated::Bool
		Connect4Env() = new(
			zeros(Int, 6, 7),
			1,
			Int[],
			false
		)
	end
	
	RLBase.action_space(env::Connect4Env, ::Int) = Base.OneTo(7)
	RLBase.legal_action_space(env::Connect4Env) = filter(x->x∉env.full_columns, Base.OneTo(7))
	RLBase.legal_action_space_mask(env::Connect4Env) = map(x->x∉env.full_columns, Base.OneTo(7))
	RLBase.state(env::Connect4Env) = vcat(env.board...)
	RLBase.state(env::Connect4Env, a...) = vcat(env.board...)
	RLBase.state_space(env::Connect4Env) = Space([0..2 for i in 1:42])
	RLBase.reward(env::Connect4Env, p) = is_terminated(env) ? 2*Int(getWinner(env.board)==p)-1 : 0
	RLBase.is_terminated(env::Connect4Env) = env.is_terminated
	RLBase.reset!(env::Connect4Env) = begin 
		env.board = zeros(Int, 6, 7)
		env.current_player = 1
		env.full_columns = Int[]
		env.is_terminated = false
	end
	(env::Connect4Env)(a::Int) = begin 
		if a ∉ env.full_columns && !is_terminated(env)
			try
				env.board[maximum(findall(x->x==0, env.board[:, a])), a] = env.current_player
				env.current_player = env.current_player==1 ? 2 : 1
				if count(x->x==0, env.board[:, a]) == 0
					push!(env.full_columns, a)
				end
				if length(env.full_columns)==7 || check4s(env.board)
					env.is_terminated = true
				end
			catch
				@warn "error"
			end
		end
	end
	RLBase.players(::Connect4Env) = (1, 2)
	RLBase.current_player(env::Connect4Env) = env.current_player
	
	RLBase.ActionStyle(::Connect4Env) = FULL_ACTION_SET
	RLBase.ChanceStyle(::Connect4Env) = DETERMINISTIC
	RLBase.DynamicStyle(::Connect4Env) = SEQUENTIAL
	RLBase.InformationStyle(::Connect4Env) = PERFECT_INFORMATION
	RLBase.NumAgentStyle(::Connect4Env) = MultiAgent(2)
	RLBase.RewardStyle(::Connect4Env) = TERMINAL_REWARD
	RLBase.StateStyle(::Connect4Env) = Observation{Array{Int,1}}()
	RLBase.UtilityStyle(::Connect4Env) = ZERO_SUM

end

I am trying to use a MultiAgent Deep Q Network to find the optimal strategy. My code for the algorithm is below:

begin
	env = Connect4Env()
	num_iterations = 1000
	ns, na = size(state(env), 1), length(legal_action_space(env))
	rng = StableRNG(3435)
	base_model = Chain(
        Dense(ns, 128, relu; init = glorot_uniform(rng)),
        Dense(128, 128, relu; init = glorot_uniform(rng)),
        Dense(128, na; init = glorot_uniform(rng))
	)   
	agents = MultiAgentManager(
		(
			Agent(
				policy = NamedPolicy(
					p => QBasedPolicy(;
						learner = DQNLearner(
							approximator = NeuralNetworkApproximator(
								model = build_dueling_network(base_model) |> cpu,
								optimizer = ADAM(),
							),
							target_approximator = NeuralNetworkApproximator(
								model = build_dueling_network(base_model) |> cpu,
							),
							loss_func = huber_loss,
							stack_size = nothing,
							batch_size = 32,
							update_horizon = 1,
							min_replay_history = 100,
							update_freq = 1,
							target_update_freq = 100,
							rng = rng,
							traces = SLARTSL
						),
						explorer = EpsilonGreedyExplorer(
							kind = :exp,
							ϵ_stable = 0.01,
							decay_steps = 500,
							rng = rng,
						),
					)
				),
				trajectory = CircularArraySARTTrajectory(
					capacity = 1000,
					state = Array{Int,1} => (ns,)
				)
			)
			for p in players(env)
		)...
	)
	multi_agent_hook = MultiAgentHook(
		(
			p => TotalRewardPerEpisode()
			for p in players(env)
		)...
	)
	run(agents, env, StopAfterEpisode(num_iterations), multi_agent_hook)
end

When I try to run this, I get an error that I think comes from CircularArrayBuffers, but I am not sure whether it is caused by my code or by a bug in the package itself. Here is the error:

MethodError: Cannot `convert` an object of type ReinforcementLearningCore.NoOp to an object of type Int64
Closest candidates are:
convert(::Type{T}, !Matched::LLVM.GenericValue) where T<:Signed at /Users/satvikd/.julia/packages/LLVM/XEOgl/src/execution.jl:27
convert(::Type{T}, !Matched::LLVM.ConstantInt) where T<:Signed at /Users/satvikd/.julia/packages/LLVM/XEOgl/src/core/value/constant.jl:76
convert(::Type{T}, !Matched::Intervals.AnchoredInterval{P, T, L, R} where {L<:Intervals.Bounded, R<:Intervals.Bounded}) where {P, T} at /Users/satvikd/.julia/packages/Intervals/ua9cq/src/anchoredinterval.jl:181
...
setindex!(::Vector{Int64}, ::ReinforcementLearningCore.NoOp, ::Int64)@array.jl:839
setindex!(::CircularArrayBuffers.CircularVectorBuffer{Int64}, ::ReinforcementLearningCore.NoOp, ::Int64)@CircularArrayBuffers.jl:36
push!(::CircularArrayBuffers.CircularVectorBuffer{Int64}, ::ReinforcementLearningCore.NoOp)@CircularArrayBuffers.jl:75
update!(::ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}, ::ReinforcementLearningCore.NamedPolicy{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, Int64}, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.PreActStage, ::ReinforcementLearningCore.NoOp)@agent.jl:119
(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.NamedPolicy{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, Int64}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}})(::ReinforcementLearningCore.PreActStage, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.NoOp)@agent.jl:73
(::ReinforcementLearningCore.MultiAgentManager)(::ReinforcementLearningCore.PreActStage, ::Main.workspace513.Connect4Env, ::ReinforcementLearningBase.Sequential, ::Int64)@multi_agent.jl:46
(::ReinforcementLearningCore.MultiAgentManager)(::ReinforcementLearningCore.PreActStage, ::Main.workspace513.Connect4Env, ::Int64)@multi_agent.jl:37
_run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:28
run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:10
top-level scope@Local: 56[inlined]

If anyone could provide any help, I would really appreciate it.

The multi-agent RL algorithms do not work very smoothly right now, so you need to do some customization here.

First, I assume you’ve read Chapter01_Tic_Tac_Toe.jl, especially the Training section, and understand how MultiAgentManager works.

Note that the trajectory in that notebook is defined like this:

trajectory = VectorSARTTrajectory(;
	state = Int,
	action = Union{Int, NoOp},
	reward = Int,
	terminal = Bool
)

The action is a Union{Int, NoOp}.

But when we want to apply a QBasedPolicy instead of a simple MonteCarloLearner-based policy, the actions stored in the trajectory are required to be positive integers. That’s why we get the convert error above.

There are several other ways to handle the NoOp here. One of them is to expand the action space by 1: in the Connect Four game, define the action space to be 1:8 instead of 1:7, where 8 means “do nothing” when it is not the current player’s turn. Then define a new method to convert NoOp to 8. (A better solution is to define your own customized MultiAgentManager.) You also need to define legal_action_space_mask correctly in this case; a rough sketch of this approach follows.
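For concreteness, here is a minimal, untested sketch of that suggestion (the exact method signatures and the mask layout are my assumptions about one way to wire it up, not something prescribed by the package):

# Widen the action space so that index 8 stands for "no operation".
RLBase.action_space(env::Connect4Env, ::Int) = Base.OneTo(8)

# The NoOp slot (8) is only legal for the player who is *not* to move;
# the real columns 1:7 are legal for the current player unless they are full.
RLBase.legal_action_space_mask(env::Connect4Env, p::Int) =
	p == env.current_player ?
		vcat([c ∉ env.full_columns for c in 1:7], false) :
		vcat(falses(7), true)

# Map NoOp onto the extra action index so it can be stored in an Int trace.
Base.convert(::Type{Int}, ::ReinforcementLearningCore.NoOp) = 8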

One student (@PeterChen) will focus on MARL this summer, so hopefully things will become easier soon.

Thank you for your response. I implemented the idea you suggested by increasing the action space by one and changing the legal_action_space_mask. My new code for the environment is below:

begin
	
	mutable struct Connect4Env <: AbstractEnv
		board::Array{Int,2}
		current_player::Int
		full_columns::Array{Int,1}
		is_terminated::Bool
		Connect4Env() = new(
			zeros(Int, 6, 7),
			1,
			Int[],
			false
		)
	end
	
	RLBase.action_space(env::Connect4Env, ::Int) = Base.OneTo(8)
	RLBase.legal_action_space(env::Connect4Env) = filter(x->x∉env.full_columns, Base.OneTo(8))
	RLBase.legal_action_space_mask(env::Connect4Env, ::Int) = map(x->x∉env.full_columns, Base.OneTo(8))
	RLBase.state(env::Connect4Env) = vcat(env.board...)
	RLBase.state(env::Connect4Env, a...) = vcat(env.board...)
	RLBase.state_space(env::Connect4Env) = Space([0..2 for i in 1:42])
	RLBase.reward(env::Connect4Env, p::Int) = is_terminated(env) ? 2*Int(getWinner(env.board)==p)-1 : 0
	RLBase.is_terminated(env::Connect4Env) = env.is_terminated
	RLBase.reset!(env::Connect4Env) = begin 
		env.board = zeros(Int, 6, 7)
		env.current_player = 1
		env.full_columns = Int[]
		env.is_terminated = false
	end
	(env::Connect4Env)(a) = begin 
		if a ∈ Base.OneTo(7)
			if a ∉ env.full_columns && !is_terminated(env)
				try
					env.board[maximum(findall(x->x==0, env.board[:, a])), a] = env.current_player
					env.current_player = env.current_player==1 ? 2 : 1
					if count(x->x==0, env.board[:, a]) == 0
						push!(env.full_columns, a)
					end
					if length(env.full_columns)==7 || check4s(env.board)
						env.is_terminated = true
					end
				catch
					@warn "error"
				end
			end
		end
	end
	RLBase.players(::Connect4Env) = (1, 2)
	RLBase.current_player(env::Connect4Env) = env.current_player
	
	RLBase.ActionStyle(::Connect4Env) = FULL_ACTION_SET
	RLBase.ChanceStyle(::Connect4Env) = DETERMINISTIC
	RLBase.DynamicStyle(::Connect4Env) = SEQUENTIAL
	RLBase.InformationStyle(::Connect4Env) = PERFECT_INFORMATION
	RLBase.NumAgentStyle(::Connect4Env) = MultiAgent(2)
	RLBase.RewardStyle(::Connect4Env) = TERMINAL_REWARD
	RLBase.StateStyle(::Connect4Env) = Observation{Array{Int,1}}()
	RLBase.UtilityStyle(::Connect4Env) = ZERO_SUM

    Base.convert(t::Type{Int}, a::ReinforcementLearningCore.NoOp) = 8

end

I now get a different error when I run the code:

begin
	env = Connect4Env()
	num_iterations = 100
	ns, na = size(state(env), 1), length(legal_action_space(env))
	rng = StableRNG(3435)
	base_model = Chain(
        Dense(ns, 128, relu; init = glorot_uniform(rng)),
        Dense(128, 128, relu; init = glorot_uniform(rng)),
        Dense(128, na; init = glorot_uniform(rng))
	)   
	agents = MultiAgentManager(
		(
			Agent(
				policy = NamedPolicy(
					p => QBasedPolicy(;
						learner = DQNLearner(
							approximator = NeuralNetworkApproximator(
								model = build_dueling_network(base_model) |> cpu,
								optimizer = ADAM(),
							),
							target_approximator = NeuralNetworkApproximator(
								model = build_dueling_network(base_model) |> cpu,
							),
							loss_func = huber_loss,
							stack_size = nothing,
							batch_size = 32,
							update_horizon = 1,
							min_replay_history = 100,
							update_freq = 1,
							target_update_freq = 100,
							rng = rng,
							traces = SLARTSL
						),
						explorer = EpsilonGreedyExplorer(
							kind = :exp,
							ϵ_stable = 0.01,
							decay_steps = 500,
							rng = rng,
						),
					)
				),
				trajectory = CircularArraySLARTTrajectory(
					capacity = 1000,
					state = Array{Int,1} => (ns,),
					legal_actions_mask = Array{Bool,1} => (8,),
					#next_legal_actions_mask = Array{Bool,1} => (8,),
				)
			)
			for p in players(env)
		)...
	)
	multi_agent_hook = MultiAgentHook(
		(
			p => TotalRewardPerEpisode()
			for p in players(env)
		)...
	)
	run(agents, env, StopAfterEpisode(num_iterations), multi_agent_hook)
end
type NamedTuple has no field next_legal_actions_mask
getindex(::NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularArrayBuffer{Bool, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}, ::Symbol)@namedtuple.jl:118
#getindex#50@forward.jl:18[inlined]
getindex@forward.jl:18[inlined]
fetch!(::ReinforcementLearningCore.NStepBatchSampler{(:state, :legal_actions_mask, :action, :reward, :terminal, :next_state, :next_legal_actions_mask)}, ::ReinforcementLearningCore.CircularArraySLARTTrajectory{NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularArrayBuffer{Bool, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}, ::Vector{Int64})@trajectory_extension.jl:175
sample(::StableRNGs.LehmerRNG, ::ReinforcementLearningCore.CircularArraySLARTTrajectory{NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularArrayBuffer{Bool, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}, ::ReinforcementLearningCore.NStepBatchSampler{(:state, :legal_actions_mask, :action, :reward, :terminal, :next_state, :next_legal_actions_mask)})@trajectory_extension.jl:122
update!(::ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ::ReinforcementLearningCore.CircularArraySLARTTrajectory{NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularArrayBuffer{Bool, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}})@common.jl:20
update!@abstract_learner.jl:35[inlined]
update!@q_based_policy.jl:63[inlined]
update!@named_policy.jl:41[inlined]
(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.NamedPolicy{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, Int64}, ReinforcementLearningCore.CircularArraySLARTTrajectory{NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularArrayBuffer{Bool, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}})(::ReinforcementLearningCore.PreActStage, ::Main.workspace2.Connect4Env, ::ReinforcementLearningCore.NoOp)@agent.jl:74
(::ReinforcementLearningCore.MultiAgentManager)(::ReinforcementLearningCore.PreActStage, ::Main.workspace2.Connect4Env, ::ReinforcementLearningBase.Sequential, ::Int64)@multi_agent.jl:46
(::ReinforcementLearningCore.MultiAgentManager)(::ReinforcementLearningCore.PreActStage, ::Main.workspace2.Connect4Env, ::Int64)@multi_agent.jl:37
_run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace2.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:28
run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace2.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:10
top-level scope@Local: 58[inlined]

The error makes it clear that next_legal_actions_mask has to be defined, but when I uncomment the line, I get this error:

MethodError: no method matching (ReinforcementLearningCore.CircularArraySLARTTrajectory{var"#s90"} where var"#s90"<:(NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), var"#s54"} where var"#s54"<:(Tuple{var"#s5", var"#s4", var"#s3", var"#s1", var"#s91"} where {var"#s5"<:CircularArrayBuffers.CircularArrayBuffer, var"#s4"<:CircularArrayBuffers.CircularArrayBuffer, var"#s3"<:CircularArrayBuffers.CircularArrayBuffer, var"#s1"<:CircularArrayBuffers.CircularArrayBuffer, var"#s91"<:CircularArrayBuffers.CircularArrayBuffer})))(; capacity=1000, state=Vector{Int64} => (42,), legal_actions_mask=Vector{Bool} => (8,), next_legal_actions_mask=Vector{Bool} => (8,))
Closest candidates are:
(ReinforcementLearningCore.CircularArraySLARTTrajectory{var"#s90"} where var"#s90"<:(NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), var"#s54"} where var"#s54"<:(Tuple{var"#s5", var"#s4", var"#s3", var"#s1", var"#s91"} where {var"#s5"<:CircularArrayBuffers.CircularArrayBuffer, var"#s4"<:CircularArrayBuffers.CircularArrayBuffer, var"#s3"<:CircularArrayBuffers.CircularArrayBuffer, var"#s1"<:CircularArrayBuffers.CircularArrayBuffer, var"#s91"<:CircularArrayBuffers.CircularArrayBuffer})))(; capacity, state, legal_actions_mask, action, reward, terminal) at /Users/satvikd/.julia/packages/ReinforcementLearningCore/FfTaa/src/policies/agents/trajectories/trajectory.jl:101 got unsupported keyword argument "next_legal_actions_mask"
kwerr(::NamedTuple{(:capacity, :state, :legal_actions_mask, :next_legal_actions_mask), Tuple{Int64, Pair{DataType, Tuple{Int64}}, Pair{DataType, Tuple{Int64}}, Pair{DataType, Tuple{Int64}}}}, ::Type)@error.jl:157
(::var"#1#3"{UnionAll, UnionAll, DataType, UnionAll, DataType, UnionAll, typeof(Main.workspace2.build_dueling_network), UnionAll, typeof(Flux.cpu), typeof(|>), typeof(Flux.Losses.huber_loss), NTuple{7, Symbol}, UnionAll, Nothing, DataType, UnionAll, UnionAll, UnionAll, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, StableRNGs.LehmerRNG, Int64})(::Int64)@none:0
iterate(::Base.Generator{Tuple{Int64, Int64}, var"#1#3"{UnionAll, UnionAll, DataType, UnionAll, DataType, UnionAll, typeof(Main.workspace2.build_dueling_network), UnionAll, typeof(Flux.cpu), typeof(|>), typeof(Flux.Losses.huber_loss), NTuple{7, Symbol}, UnionAll, Nothing, DataType, UnionAll, UnionAll, UnionAll, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, StableRNGs.LehmerRNG, Int64}})@generator.jl:47
top-level scope@Local: 11[inlined]

It seems like I am making a simple error, since these two errors seem to contradict each other, but I can’t find it. Do you have any ideas on how I can make this work?

This seems like an unexpected bug in RL.jl. I’ll fix it soon.

Should be fixed in Fix legal_actions_mask indexing error in CircularSLART by findmyway · Pull Request #350 · JuliaReinforcementLearning/ReinforcementLearning.jl · GitHub Let me know if you have any other problems.

A new release will be created soon. New version: ReinforcementLearningCore v0.8.1 by JuliaRegistrator · Pull Request #40440 · JuliaRegistries/General · GitHub

By the way, it’s always better to file an issue on GitHub so that you get credit for reporting such bugs. :slightly_smiling_face:

Thanks for fixing the error (and all of your help with this package). I think I am very close to finally making this work, but I still get an error when I use a Q-Based Policy. When I try running the code with a RandomPolicy, it works as expected, but when I switch to a Q-Based Policy, I get this error:

BoundsError: attempt to access 7-element LinearIndices{1, Tuple{Base.OneTo{Int64}}} at index [Bool[0, 1, 1, 1, 1, 1, 1, 0]]
throw_boundserror(::LinearIndices{1, Tuple{Base.OneTo{Int64}}}, ::Tuple{Base.LogicalIndex{Int64, Vector{Bool}}})@abstractarray.jl:651
checkbounds@abstractarray.jl:616[inlined]
view@subarray.jl:177[inlined]
findmax(::Vector{Float32}, ::Vector{Bool})@base.jl:152
(::ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG})(::Vector{Float32}, ::Vector{Bool})@epsilon_greedy_explorer.jl:132
(::ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}})(::Main.workspace184.Connect4Env, ::ReinforcementLearningBase.FullActionSet, ::Vector{Int64})@q_based_policy.jl:28
(::ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}})(::Main.workspace184.Connect4Env)@q_based_policy.jl:21
NamedPolicy@named_policy.jl:45[inlined]
(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.NamedPolicy{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, Int64}, ReinforcementLearningCore.CircularArraySLARTTrajectory{NamedTuple{(:state, :legal_actions_mask, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}})(::Main.workspace184.Connect4Env)@agent.jl:24
(::ReinforcementLearningCore.MultiAgentManager)(::Main.workspace184.Connect4Env, ::ReinforcementLearningBase.Sequential)@multi_agent.jl:26
MultiAgentManager@multi_agent.jl:25[inlined]
_run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace184.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:26
run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace184.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:10
top-level scope@Local: 58[inlined]

This error does not make sense to me, since when I try the following code: LinearIndices{1, Tuple{Base.OneTo{Int64}}}((Base.OneTo(8),))[Bool[0, 1, 1, 1, 1, 1, 1, 0]], it works perfectly.

Here is the code for the environment and the algorithm:

begin
	
	mutable struct Connect4Env <: AbstractEnv
		board::Array{Int,2}
		current_player::Int
		full_columns::Array{Int,1}
		is_terminated::Bool
		Connect4Env() = new(
			zeros(Int, 6, 7),
			1,
			Int[],
			false
		)
	end
	
	RLBase.action_space(env::Connect4Env, ::Int) = [1, 2, 3, 4, 5, 6, 7, 8]
	RLBase.legal_action_space(env::Connect4Env, p::Int) = findall(legal_action_space_mask(env))
	RLBase.legal_action_space_mask(env::Connect4Env, p::Int) = p==env.current_player ? vcat(map(x->x∉env.full_columns, [1, 2, 3, 4, 5, 6, 7]), false) : vcat(zeros(Bool, 7), true)
	RLBase.state(env::Connect4Env) = vcat(env.board...)
	RLBase.state(env::Connect4Env, a...) = vcat(env.board...)
	RLBase.state_space(env::Connect4Env) = Space([0..2 for i in 1:42])
	RLBase.reward(env::Connect4Env, p::Int) = is_terminated(env)&&check4s(env.board) ? 2*Int(getWinner(env.board)==p)-1 : 0
	RLBase.is_terminated(env::Connect4Env) = env.is_terminated
	RLBase.reset!(env::Connect4Env) = begin 
		env.board = zeros(Int, 6, 7)
		env.current_player = 1
		env.full_columns = Int[]
		env.is_terminated = false
	end
	(env::Connect4Env)(a) = begin 
		if a ∈ 1:7
			if a ∉ env.full_columns && !is_terminated(env)
				try
					env.board[maximum(findall(x->x==0, env.board[:, a])), a] = env.current_player
					env.current_player = env.current_player==1 ? 2 : 1
					if count(x->x==0, env.board[:, a]) == 0
						push!(env.full_columns, a)
					end
					if length(env.full_columns)==7 || check4s(env.board)==true
						env.is_terminated = true
					end
				catch
					@warn "error"
				end
			end
		end
	end
	RLBase.players(::Connect4Env) = (1, 2)
	RLBase.current_player(env::Connect4Env) = env.current_player
	
	RLBase.ActionStyle(::Connect4Env) = FULL_ACTION_SET
	RLBase.ChanceStyle(::Connect4Env) = DETERMINISTIC
	RLBase.DynamicStyle(::Connect4Env) = SEQUENTIAL
	RLBase.InformationStyle(::Connect4Env) = PERFECT_INFORMATION
	RLBase.NumAgentStyle(::Connect4Env) = MultiAgent(2)
	RLBase.RewardStyle(::Connect4Env) = TERMINAL_REWARD
	RLBase.StateStyle(::Connect4Env) = Observation{Array{Int,1}}()
	RLBase.UtilityStyle(::Connect4Env) = ZERO_SUM

end
begin
	env = Connect4Env()
	num_iterations = 10000
	ns, na = size(state(env), 1), length(legal_action_space(env))
	rng = StableRNG(3435)
	base_model = Chain(
        Dense(ns, 128, relu; init = glorot_uniform(rng)),
        Dense(128, 128, relu; init = glorot_uniform(rng)),
        Dense(128, na; init = glorot_uniform(rng))
	)   
	agents = MultiAgentManager(
		(
			Agent(
				policy = NamedPolicy(
					p => QBasedPolicy(;
						learner = DQNLearner(
							approximator = NeuralNetworkApproximator(
								model = build_dueling_network(base_model) |> cpu,
								optimizer = ADAM(),
							),
							target_approximator = NeuralNetworkApproximator(
								model = build_dueling_network(base_model) |> cpu,
							),
							loss_func = huber_loss,
							stack_size = nothing,
							batch_size = 32,
							update_horizon = 1,
							min_replay_history = 100,
							update_freq = 1,
							target_update_freq = 100,
							rng = rng,
							traces = SLARTSL
						),
						explorer = EpsilonGreedyExplorer(
							kind = :exp,
							ϵ_stable = 0.01,
							decay_steps = 500,
							rng = rng,
						),
					)
				),
				trajectory = CircularArraySLARTTrajectory(
					capacity = 1000,
					state = Array{Int,1} => (ns,),
					legal_actions_mask = Array{Int,1} => (8,),
				)
			)
			for p in players(env)
		)...
	)
	multi_agent_hook = MultiAgentHook(
		(
			p => TotalRewardPerEpisode()
			for p in players(env)
		)...
	)
	
	run(agents, env, StopAfterEpisode(num_iterations), multi_agent_hook)
end

I would log this issue on GitHub, but I am not exactly sure which package is causing the error. Do you have any ideas on how I can solve this?

It seems the lengths of the legal_action_space_mask and the action_space are inconsistent. (I didn’t test it on my machine.)
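Reading the stack trace (untested, so treat this as a guess rather than a verified fix): the BoundsError indexes a 7-element LinearIndices with an 8-element mask, i.e. the Q-network outputs 7 values while legal_action_space_mask has 8 entries. In the script above, na = length(legal_action_space(env)), which is presumably 7 for a fresh board, so the last Dense layer produces only 7 Q-values. Sizing the network by the full action space should make the two lengths consistent:

# The mismatch implied by the error: indexing 7 Q-values with an 8-element
# mask fails, while an 8-element vector (as tested manually above) works.
LinearIndices((Base.OneTo(7),))[Bool[0, 1, 1, 1, 1, 1, 1, 0]]  # BoundsError
LinearIndices((Base.OneTo(8),))[Bool[0, 1, 1, 1, 1, 1, 1, 0]]  # works

# Possible fix (an assumption, not tested): size the output layer by the full
# action space rather than the currently legal actions, so the Q-value vector
# always has the same length as the legal-action mask.
ns = size(state(env), 1)           # 42
na = length(action_space(env, 1))  # 8, same length as the (8,) legal_actions_mask trace
base_model = Chain(
	Dense(ns, 128, relu; init = glorot_uniform(rng)),
	Dense(128, 128, relu; init = glorot_uniform(rng)),
	Dense(128, na; init = glorot_uniform(rng))
)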