Key error while using QLearningSolver

I’m a beginner with Julia and I’m trying to model an inventory management problem with the SARSA and Q-learning algorithms. The code gives me the following KeyError:

ERROR: LoadError: KeyError: key [0.01781510003808362, 0.10435077985604246, 0.6513233559305432, 0.026058479739375584, 0.5718719213047856, 0.4894292308985352, 0.8914244892776971, 0.6229389399919408, 0.9589445614036257, 0.1799414868226339] not found
Stacktrace:
 [1] getindex(h::Dict{Int64, Int64}, key::Vector{Float64})
   @ Base .\dict.jl:477
 [2] _call(v::Val{:stateindex}, d::Dict{Int64, Int64}, args::Tuple{Vector{Float64}}, kwargs::@Kwargs{})
   @ QuickPOMDPs C:\Users\loren\.julia\packages\QuickPOMDPs\WmJG9\src\quick.jl:206
 [3] _call(namev::Val{:stateindex}, m::QuickMDP{Base.UUID("b6c06324-007b-4e3b-bc11-80d9fdde00b6"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Int64, transition::var"#1#6", states::Vector{Int64}, actions::Vector{Int64}, discount::Float64, reward::var"#3#8"}}, args::Tuple{Vector{Float64}}, kwargs::@Kwargs{})
   @ QuickPOMDPs C:\Users\loren\.julia\packages\QuickPOMDPs\WmJG9\src\quick.jl:196
 [4] stateindex(m::QuickMDP{Base.UUID("b6c06324-007b-4e3b-bc11-80d9fdde00b6"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Int64, transition::var"#1#6", states::Vector{Int64}, actions::Vector{Int64}, discount::Float64, reward::var"#3#8"}}, args::Vector{Float64}; kwargs::@Kwargs{})
   @ QuickPOMDPs C:\Users\loren\.julia\packages\QuickPOMDPs\WmJG9\src\quick.jl:211
 [5] action(p::ValuePolicy{QuickMDP{Base.UUID("b6c06324-007b-4e3b-bc11-80d9fdde00b6"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Int64, transition::var"#1#6", states::Vector{Int64}, actions::Vector{Int64}, discount::Float64, reward::var"#3#8"}}, Matrix{Float64}, Int64}, s::Vector{Float64})
   @ POMDPTools.Policies C:\Users\loren\.julia\packages\POMDPTools\7Rekv\src\Policies\vector.jl:62
 [6] action(p::EpsGreedyPolicy{POMDPTools.Policies.var"#20#21"{Float64}, TaskLocalRNG, QuickMDP{Base.UUID("b6c06324-007b-4e3b-bc11-80d9fdde00b6"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Int64, transition::var"#1#6", states::Vector{Int64}, actions::Vector{Int64}, discount::Float64, reward::var"#3#8"}}}, on_policy::ValuePolicy{QuickMDP{Base.UUID("b6c06324-007b-4e3b-bc11-80d9fdde00b6"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Int64, transition::var"#1#6", states::Vector{Int64}, actions::Vector{Int64}, discount::Float64, reward::var"#3#8"}}, Matrix{Float64}, Int64}, k::Int64, s::Vector{Float64})
   @ POMDPTools.Policies C:\Users\loren\.julia\packages\POMDPTools\7Rekv\src\Policies\exploration_policies.jl:77
 [7] solve(solver::QLearningSolver{EpsGreedyPolicy{POMDPTools.Policies.var"#20#21"{Float64}, TaskLocalRNG, QuickMDP{Base.UUID("b6c06324-007b-4e3b-bc11-80d9fdde00b6"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Int64, transition::var"#1#6", states::Vector{Int64}, actions::Vector{Int64}, discount::Float64, reward::var"#3#8"}}}, TaskLocalRNG}, mdp::QuickMDP{Base.UUID("b6c06324-007b-4e3b-bc11-80d9fdde00b6"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Int64, transition::var"#1#6", states::Vector{Int64}, actions::Vector{Int64}, discount::Float64, reward::var"#3#8"}})
   @ TabularTDLearning C:\Users\loren\.julia\packages\TabularTDLearning\IPIqw\src\q_learn.jl:57
 [8] top-level scope
   @ d:\OneDrive\Desktop\Punto.jl:77
in expression starting at d:\OneDrive\Desktop\Punto.jl:77
This is the code:

### Pluto notebook for LINMA2470 - Punto 3

### 1) Package imports
using Random
using StatsPlots         # for optional plots
using POMDPs
using QuickPOMDPs
using TabularTDLearning
using POMDPTools
using POMDPModels
using POMDPTools.Policies # for EpsGreedyPolicy

### 2) Environment parameters
const INITIAL_STATE = 10
const STATES        = collect(0:20)
const ACTIONS       = collect([0, 1])   # 0 = no-order, 1 = order
const DEMAND_DIST   = collect(0:10)
const DISCOUNT      = 0.99
const ORDER_AMOUNT  = 5
const MAX_STORE     = 10
const MAX_CAPACITY  = 20
const TRANSPORT_COST = 20.0
const HOLD_STORE     = 2.0
const HOLD_PARKING   = 4.0
const STOCKOUT_COST  = 50.0

### 3) MDP definition with QuickPOMDPs
bike_inventory_mdp = QuickMDP(
    states      = STATES,
    actions     = ACTIONS,
    discount    = DISCOUNT,
    initialstate = INITIAL_STATE,
    transition  = (s, a, sp) -> begin
        # probability of moving from s to sp under action a
        order     = a==1 ? min(ORDER_AMOUNT, MAX_CAPACITY - s) : 0
        post_order= s + order
        p         = 1/length(DEMAND_DIST)
        sum(d-> (max(0, post_order-d)==sp ? p : 0), DEMAND_DIST)
    end,
    reward      = (s, a, sp) -> begin
        # expected reward given s, a
        order      = a==1 ? min(ORDER_AMOUNT, MAX_CAPACITY - s) : 0
        post_order = s + order
        transport  = a==1 ? TRANSPORT_COST : 0.0
        exp_penalty= sum(d-> max(0, d-post_order)*STOCKOUT_COST, DEMAND_DIST) / length(DEMAND_DIST)
        exp_hold   = sum(d-> begin
                          inv = max(0, post_order - d)
                          min(inv, MAX_STORE)*HOLD_STORE + max(0, inv-MAX_STORE)*HOLD_PARKING
                        end, DEMAND_DIST) / length(DEMAND_DIST)
        -(transport + exp_penalty + exp_hold)
    end
)

### 4) ε-greedy exploration policy

rng = Random.default_rng()
# epsilon as the second positional argument
exploration = EpsGreedyPolicy(bike_inventory_mdp, 0.1; rng=rng)


### 5) Q-learning and SARSA solvers from TabularTDLearning
q_solver_pkg = QLearningSolver(
    n_episodes         = 5000,
    max_episode_length = 50,
    learning_rate      = 0.1,
    exploration_policy = exploration
)

sarsa_solver_pkg = SARSASolver(
    n_episodes         = 5000,
    max_episode_length = 50,
    learning_rate      = 0.1,
    exploration_policy = exploration
)

### 6) Solving and comparing the policies
policy_q_pkg     = solve(q_solver_pkg, bike_inventory_mdp)
policy_sarsa_pkg = solve(sarsa_solver_pkg, bike_inventory_mdp)

println("Politica Q-Learning (pacchetto):")
for s in STATES
    println("s = ", s, " → a = ", action(policy_q_pkg, s))
end

println("\nPolitica SARSA (pacchetto):")
for s in STATES
    println("s = ", s, " → a = ", action(policy_sarsa_pkg, s))
end

I think it has to do with the definition of the MDP, but I don’t know how to fix it.

You could try to run the MDP manually to see if it works, i.e., initialize it and execute a few actions in the REPL, before using any solver to find optimized actions.
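For instance, a quick sanity check in the REPL could look like the following (a rough sketch using the names from your script; the comments describing what I expect you would see are my assumptions based on the stacktrace, not verified output):

using POMDPs, POMDPTools, Random

# 1) What will a solver sample as the initial state?
d0 = initialstate(bike_inventory_mdp)
@show d0                               # with the definition above this is the plain Int 10, not a distribution
@show rand(Random.default_rng(), d0)   # rand(rng, 10) draws a 10-element Vector{Float64},
                                       # which looks exactly like the key in the KeyError

# 2) Step the model by hand. QuickMDP expects `transition` to be a
#    two-argument (s, a) -> distribution function, so the three-argument
#    (s, a, sp) -> probability version above will probably fail on this call.
d  = transition(bike_inventory_mdp, 10, 1)
sp = rand(d)
@show reward(bike_inventory_mdp, 10, 1, sp)

If those checks confirm it, wrapping the initial state in a distribution (e.g. Deterministic(INITIAL_STATE) from POMDPTools) and having transition return a distribution over next states (e.g. a SparseCat) would probably be the direction to look in.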