Hi yolhan,
Yes, I'm using a slightly modified version of the PPO pendulum example from the ReinforcementLearning.jl examples page. The first execution in a fresh session takes around 5 minutes; running it again without restarting the terminal or VS Code takes around 30 minutes (see the timing note at the end of the script).
GPU: RTX 4070 Ti Super
Julia Version 1.9.4
Commit 8e5136fa29 (2023-11-14 08:46 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: 8 × Intel(R) Core(TM) i7-9700KF CPU @ 3.60GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.6 (ORCJIT, skylake)
Threads: 8 on 8 virtual cores
Environment:
JULIA_EDITOR = code
JULIA_NUM_THREADS = 8
Status `C:\Users\j.tucker\master\lib\GNCReinforcementLearning\Project.toml`
⌅ [052768ef] CUDA v3.13.1
⌃ [a93c6f00] DataFrames v1.3.6
[31c24e10] Distributions v0.25.107
⌅ [587475ba] Flux v0.12.10
[af0dad03] GNCModelling v0.9.0 `..\GNCModelling`
⌅ [8197267c] IntervalSets v0.5.4
[033835bb] JLD2 v0.4.46
[e4faabce] PProf v3.1.0
⌅ [91a5bcdd] Plots v1.39.0
⌃ [92933f4c] ProgressMeter v1.9.0
[158674fc] ReinforcementLearning v0.10.2
[860ef19b] StableRNGs v1.0.1
[90137ffa] StaticArrays v1.9.3
[9abbd945] Profile
[9a3f8284] Random
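# Slightly modified version of the PPO pendulum example from the
# ReinforcementLearning.jl examples page.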
using ReinforcementLearning
using StableRNGs
using Flux
using Flux.Losses
using Distributions
using Random
using IntervalSets
struct PendulumEnv3Params{T}
max_speed::T
max_torque::T
g::T
m::T
l::T
dt::T
max_steps::Int
end
mutable struct PendulumEnv3{A,T,R<:AbstractRNG} <: AbstractEnv
params::PendulumEnv3Params{T}
action_space::A
action::T
observation_space::Space{Vector{ClosedInterval{T}}}
state::Vector{T}
done::Bool
t::Int
rng::R
reward::T
n_actions::Int
end
"""
PendulumEnv3(;kwargs...)
# Keyword arguments
- `T = Float64`
- `max_speed = T(8)`
- `max_torque = T(2)`
- `g = T(10)`
- `m = T(1)`
- `l = T(1)`
- `dt = T(0.05)`
- `max_steps = 200`
- `continuous::Bool = true`
- `n_actions::Int = 3`
- `rng = Random.GLOBAL_RNG`
"""
function PendulumEnv3(;
T = Float64,
max_speed = T(8),
max_torque = T(2),
g = T(10),
m = T(1),
l = T(1),
dt = T(0.05),
max_steps = 200,
continuous::Bool = true,
n_actions::Int = 3,
rng = Random.GLOBAL_RNG,
)
high = T.([1, 1, max_speed])
action_space = continuous ? -2.0..2.0 : Base.OneTo(n_actions)
env = PendulumEnv3(
PendulumEnv3Params(max_speed, max_torque, g, m, l, dt, max_steps),
action_space,
zero(T),
Space(ClosedInterval{T}.(-high, high)),
zeros(T, 2),
false,
0,
rng,
zero(T),
n_actions,
)
reset!(env)
env
end
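# Example construction (illustrative only; the experiment below uses T = Float32
# with a StableRNG):
#   PendulumEnv3(T = Float32, max_steps = 200, rng = StableRNG(123))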
Random.seed!(env::PendulumEnv3, seed) = Random.seed!(env.rng, seed)
pendulum_observation(s) = [cos(s[1]), sin(s[1]), s[2]]
angle_normalize(x) = Base.mod((x + Base.π), (2 * Base.π)) - Base.π
RLBase.action_space(env::PendulumEnv3) = env.action_space
RLBase.state_space(env::PendulumEnv3) = env.observation_space
RLBase.reward(env::PendulumEnv3) = env.reward
RLBase.is_terminated(env::PendulumEnv3) = env.done
RLBase.state(env::PendulumEnv3) = pendulum_observation(env.state)
function RLBase.reset!(env::PendulumEnv3{A,T}) where {A,T}
env.state[1] = 2 * π * (rand(env.rng, T) .- 1)
env.state[2] = 2 * (rand(env.rng, T) .- 1)
env.action = zero(T)
env.t = 0
env.done = false
env.reward = zero(T)
nothing
end
function (env::PendulumEnv3)(a::Union{Int, AbstractFloat})
@assert a in env.action_space
env.action = torque(env, a)
_step!(env, env.action)
end
function _step!(env::PendulumEnv3, a)
env.t += 1
th, thdot = env.state
a = clamp(a, -env.params.max_torque, env.params.max_torque)
costs = angle_normalize(th)^2 + 0.1 * thdot^2 + 0.001 * a^2
newthdot =
thdot +
(
-3 * env.params.g / (2 * env.params.l) * sin(th + pi) +
3 * a / (env.params.m * env.params.l^2)
) * env.params.dt
th += newthdot * env.params.dt
newthdot = clamp(newthdot, -env.params.max_speed, env.params.max_speed)
env.state[1] = th
env.state[2] = newthdot
env.done = env.t >= env.params.max_steps
env.reward = -costs
nothing
end
function torque(env::PendulumEnv3{<:Base.OneTo}, a::Int)
return (4 / (env.n_actions - 1)) * (a - (env.n_actions - 1) / 2 - 1)
end
torque(env::PendulumEnv3{<:ClosedInterval}, a::AbstractFloat) = a
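# ----- Experiment setup: 8 parallel environments, PPO with a GaussianNetwork
# actor/critic moved to the GPU -----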
seed = 123
rng = StableRNG(seed)
inner_env = PendulumEnv3(T=Float32, rng=rng)
A = action_space(inner_env)
low = A.left
high = A.right
ns = length(state(inner_env))
neurons = 2048
N_ENV = 8
UPDATE_FREQ = 2048
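# 8 copies of the environment are stepped together in a MultiThreadEnv; the
# policy's tanh-bounded mean action in [-1, 1] is scaled by 2 and clamped to
# the [-2, 2] torque range of the pendulum.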
env = MultiThreadEnv([
PendulumEnv3(T=Float32, rng=StableRNG(hash(seed + i))) |>
env -> ActionTransformedEnv(env, action_mapping=x -> clamp(x * 2, low, high)) for i in 1:N_ENV
])
init = glorot_uniform(rng)
agent = Agent(
policy=PPOPolicy(
approximator=ActorCritic(
actor=GaussianNetwork(
pre=Chain(
Dense(ns, neurons, relu; init=glorot_uniform(rng)),
Dense(neurons, neurons, relu; init=glorot_uniform(rng)),
),
μ=Chain(Dense(neurons, 1, tanh; init=glorot_uniform(rng)), vec),
logσ=Chain(Dense(neurons, 1; init=glorot_uniform(rng)), vec),
normalizer=x -> x
),
critic=Chain(
Dense(ns, neurons, relu; init=glorot_uniform(rng)),
Dense(neurons, neurons, relu; init=glorot_uniform(rng)),
Dense(neurons, 1; init=glorot_uniform(rng)),
),
optimizer=ADAM(3e-4),
) |> gpu,
γ=0.99f0,
λ=1.00f0,
clip_range=0.2f0,
max_grad_norm=0.5f0,
n_epochs=10,
n_microbatches=32,
actor_loss_weight=1.0f0,
critic_loss_weight=0.5f0,
entropy_loss_weight=0.00f0,
dist=Normal,
rng=rng,
update_freq=UPDATE_FREQ,
),
trajectory=PPOTrajectory(;
capacity=UPDATE_FREQ,
state=Matrix{Float32} => (ns, N_ENV),
action=Vector{Float32} => (N_ENV,),
action_log_prob=Vector{Float32} => (N_ENV,),
reward=Vector{Float32} => (N_ENV,),
terminal=Vector{Bool} => (N_ENV,),
),
)
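# Stop after 100_000 steps and record the total reward per episode for each of
# the N_ENV environments.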
stop_condition = StopAfterStep(100_000, is_show_progress=!haskey(ENV, "CI"))
hook = TotalBatchRewardPerEpisode(N_ENV)
ex = Experiment(agent, env, stop_condition, hook, "# Play Pendulum with PPO")
run(ex)
NN = cpu(ex.policy.policy.approximator.actor)
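For reference, here is roughly how the two timings compare. This is an addition for illustration only: the file name ppo_pendulum.jl is just a placeholder for the script above, which is simply evaluated a second time in the same Julia session.

@time include("ppo_pendulum.jl")   # first run in a fresh session: ~5 minutes
@time include("ppo_pendulum.jl")   # second run, same session: ~30 minutes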