I made a Connect 4 environment with ReinforcementLearning.jl:
begin
    # ReinforcementLearning (which re-exports RLBase), Flux, and StableRNGs are
    # loaded earlier in the notebook; check4s and getWinner are helper functions
    # defined elsewhere.
    mutable struct Connect4Env <: AbstractEnv
        board::Array{Int,2}
        current_player::Int
        full_columns::Array{Int,1}
        is_terminated::Bool
        Connect4Env() = new(
            zeros(Int, 6, 7),
            1,
            Int[],
            false
        )
    end

    RLBase.action_space(env::Connect4Env, ::Int) = Base.OneTo(7)
    RLBase.legal_action_space(env::Connect4Env) = filter(x -> x ∉ env.full_columns, Base.OneTo(7))
    RLBase.legal_action_space_mask(env::Connect4Env) = map(x -> x ∉ env.full_columns, Base.OneTo(7))
    RLBase.state(env::Connect4Env) = vcat(env.board...)
    RLBase.state(env::Connect4Env, a...) = vcat(env.board...)
    RLBase.state_space(env::Connect4Env) = Space([0..2 for i in 1:42])   # each cell is 0 (empty), 1, or 2
    # Terminal reward: +1 if player p won, -1 otherwise; 0 while the game is running
    RLBase.reward(env::Connect4Env, p) = is_terminated(env) ? 2 * Int(getWinner(env.board) == p) - 1 : 0
    RLBase.is_terminated(env::Connect4Env) = env.is_terminated
    RLBase.reset!(env::Connect4Env) = begin
        env.board = zeros(Int, 6, 7)
        env.current_player = 1
        env.full_columns = Int[]
        env.is_terminated = false
    end

    # Acting on the environment: drop the current player's piece into column `a`
    (env::Connect4Env)(a::Int) = begin
        if a ∉ env.full_columns && !is_terminated(env)
            try
                env.board[maximum(findall(x -> x == 0, env.board[:, a])), a] = env.current_player
                env.current_player = env.current_player == 1 ? 2 : 1
                if count(x -> x == 0, env.board[:, a]) == 0
                    push!(env.full_columns, a)
                end
                if length(env.full_columns) == 7 || check4s(env.board)
                    env.is_terminated = true
                end
            catch
                @warn "error"
            end
        end
    end

    RLBase.players(::Connect4Env) = (1, 2)
    RLBase.current_player(env::Connect4Env) = env.current_player
    RLBase.ActionStyle(::Connect4Env) = FULL_ACTION_SET
    RLBase.ChanceStyle(::Connect4Env) = DETERMINISTIC
    RLBase.DynamicStyle(::Connect4Env) = SEQUENTIAL
    RLBase.InformationStyle(::Connect4Env) = PERFECT_INFORMATION
    RLBase.NumAgentStyle(::Connect4Env) = MultiAgent(2)
    RLBase.RewardStyle(::Connect4Env) = TERMINAL_REWARD
    RLBase.StateStyle(::Connect4Env) = Observation{Array{Int,1}}()
    RLBase.UtilityStyle(::Connect4Env) = ZERO_SUM
end
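For context, a quick interactive sanity check of the environment looks like this (a minimal sketch, separate from the training code below; demo_env is just a throwaway instance):

demo_env = Connect4Env()
legal_action_space(demo_env)   # [1, 2, 3, 4, 5, 6, 7] — every column is open on an empty board
demo_env(4)                    # player 1 drops a piece into column 4
current_player(demo_env)       # 2
is_terminated(demo_env)        # false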
I am trying to use a multi-agent deep Q-network (DQN) to find the optimal strategy. My code for the algorithm is below:
begin
    env = Connect4Env()
    num_iterations = 1000
    ns, na = size(state(env), 1), length(legal_action_space(env))
    rng = StableRNG(3435)

    base_model = Chain(
        Dense(ns, 128, relu; init = glorot_uniform(rng)),
        Dense(128, 128, relu; init = glorot_uniform(rng)),
        Dense(128, na; init = glorot_uniform(rng))
    )

    # One DQN agent per player; build_dueling_network (defined elsewhere) wraps
    # the base chain in a DuelingNetwork.
    agents = MultiAgentManager(
        (
            Agent(
                policy = NamedPolicy(
                    p => QBasedPolicy(;
                        learner = DQNLearner(
                            approximator = NeuralNetworkApproximator(
                                model = build_dueling_network(base_model) |> cpu,
                                optimizer = ADAM(),
                            ),
                            target_approximator = NeuralNetworkApproximator(
                                model = build_dueling_network(base_model) |> cpu,
                            ),
                            loss_func = huber_loss,
                            stack_size = nothing,
                            batch_size = 32,
                            update_horizon = 1,
                            min_replay_history = 100,
                            update_freq = 1,
                            target_update_freq = 100,
                            rng = rng,
                            traces = SLARTSL
                        ),
                        explorer = EpsilonGreedyExplorer(
                            kind = :exp,
                            ϵ_stable = 0.01,
                            decay_steps = 500,
                            rng = rng,
                        ),
                    )
                ),
                trajectory = CircularArraySARTTrajectory(
                    capacity = 1000,
                    state = Array{Int,1} => (ns,)
                )
            )
            for p in players(env)
        )...
    )

    # Track the total reward per episode for each player
    multi_agent_hook = MultiAgentHook(
        (
            p => TotalRewardPerEpisode()
            for p in players(env)
        )...
    )

    run(agents, env, StopAfterEpisode(num_iterations), multi_agent_hook)
end
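For reference (a quick check, not part of the training cell itself), the sizes the networks are built with come out as:

size(state(Connect4Env()), 1)              # 42 — the 6×7 board flattened
length(legal_action_space(Connect4Env()))  # 7 — one action per column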
When I try to run this, I get an error which I think is caused by CircularArrayBuffers, but I am not sure whether my code is at fault or it is a bug in the package itself. Here is the error:
MethodError: Cannot `convert` an object of type ReinforcementLearningCore.NoOp to an object of type Int64
Closest candidates are:
convert(::Type{T}, !Matched::LLVM.GenericValue) where T<:Signed at /Users/satvikd/.julia/packages/LLVM/XEOgl/src/execution.jl:27
convert(::Type{T}, !Matched::LLVM.ConstantInt) where T<:Signed at /Users/satvikd/.julia/packages/LLVM/XEOgl/src/core/value/constant.jl:76
convert(::Type{T}, !Matched::Intervals.AnchoredInterval{P, T, L, R} where {L<:Intervals.Bounded, R<:Intervals.Bounded}) where {P, T} at /Users/satvikd/.julia/packages/Intervals/ua9cq/src/anchoredinterval.jl:181
...
setindex!(::Vector{Int64}, ::ReinforcementLearningCore.NoOp, ::Int64)@array.jl:839
setindex!(::CircularArrayBuffers.CircularVectorBuffer{Int64}, ::ReinforcementLearningCore.NoOp, ::Int64)@CircularArrayBuffers.jl:36
push!(::CircularArrayBuffers.CircularVectorBuffer{Int64}, ::ReinforcementLearningCore.NoOp)@CircularArrayBuffers.jl:75
update!(::ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}, ::ReinforcementLearningCore.NamedPolicy{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, Int64}, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.PreActStage, ::ReinforcementLearningCore.NoOp)@agent.jl:119
(::ReinforcementLearningCore.Agent{ReinforcementLearningCore.NamedPolicy{ReinforcementLearningCore.QBasedPolicy{ReinforcementLearningZoo.DQNLearner{ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Flux.Optimise.ADAM}, ReinforcementLearningCore.NeuralNetworkApproximator{ReinforcementLearningZoo.DuelingNetwork{Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Chain{Tuple{Flux.Dense{typeof(NNlib.relu), Matrix{Float32}, Vector{Float32}}, Flux.Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}}, Nothing}, typeof(Flux.Losses.huber_loss), StableRNGs.LehmerRNG}, ReinforcementLearningCore.EpsilonGreedyExplorer{:exp, false, StableRNGs.LehmerRNG}}, Int64}, ReinforcementLearningCore.CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Int64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}})(::ReinforcementLearningCore.PreActStage, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.NoOp)@agent.jl:73
(::ReinforcementLearningCore.MultiAgentManager)(::ReinforcementLearningCore.PreActStage, ::Main.workspace513.Connect4Env, ::ReinforcementLearningBase.Sequential, ::Int64)@multi_agent.jl:46
(::ReinforcementLearningCore.MultiAgentManager)(::ReinforcementLearningCore.PreActStage, ::Main.workspace513.Connect4Env, ::Int64)@multi_agent.jl:37
_run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:28
run(::ReinforcementLearningCore.MultiAgentManager, ::Main.workspace513.Connect4Env, ::ReinforcementLearningCore.StopAfterEpisode{ProgressMeter.Progress}, ::ReinforcementLearningCore.MultiAgentHook)@run.jl:10
top-level scope@Local: 56[inlined]
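As far as I can read the trace, at the PreActStage the MultiAgentManager hands a ReinforcementLearningCore.NoOp to the agent whose turn it is not, and that agent's trajectory then tries to push it into its Int64 action buffer. A stripped-down illustration of the same conversion failure (NoOp here is just a local stand-in for the real singleton type):

struct NoOp end              # stand-in for ReinforcementLearningCore.NoOp
action_trace = Int[]         # the SART trajectory stores actions as Int64
push!(action_trace, NoOp())  # MethodError: Cannot `convert` an object of type NoOp to Int64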
If anyone could provide any help, I would really appreciate it.