Deep learning in VS Code on Windows: faster first execution

Hello!

I have been running some deep reinforcement learning experiments with ReinforcementLearning.jl and Flux.jl in VS Code on Windows. I have noticed that the first experiment I run after opening VS Code or after restarting the REPL is significantly faster than running the exact same experiment a second time without restarting the REPL. Has anyone else experienced this?

For the exact same script (experiment), the runtime goes from 5 minutes on the first run to 25 minutes on the second.

Thanks, Jared

Do you have an MWE (minimal working example)? That would make this easier to diagnose.
Otherwise, this can happen when you fill up your RAM and Windows starts paging the job out to your hard disk (a behaviour most of us hate and disable).
If you're using the GPU, it could also be a memory leak, but without the code it's hard to say.
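A rough sketch of what you could run before each experiment to see whether host RAM or GPU memory keeps shrinking between runs (assuming CUDA.jl, since you mention a GPU):

# Rough sketch: report host RAM and GPU memory, e.g. before the first and second run
using CUDA

function report_memory()
    free_gb  = Sys.free_memory() / 2^30    # free host RAM in GiB
    total_gb = Sys.total_memory() / 2^30
    println("Host RAM free: $(round(free_gb, digits=2)) / $(round(total_gb, digits=2)) GiB")
    CUDA.memory_status()                   # prints how much GPU memory is currently used
end

report_memory()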

PS: it looks like you've made two threads. "New to Julia" is probably the better category, I think, but please don't post it twice; it will confuse people.


Hi yolhan,

Yes, I'm using a slightly modified version of the PPO pendulum example from the ReinforcementLearning.jl examples page. The first execution takes around 5 minutes, while the second execution, without restarting the terminal or VS Code, takes 30 minutes.

GPU: RTX 4070 Ti Super

Julia Version 1.9.4
Commit 8e5136fa29 (2023-11-14 08:46 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 8 × Intel(R) Core(TM) i7-9700KF CPU @ 3.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, skylake)
  Threads: 8 on 8 virtual cores
Environment:
  JULIA_EDITOR = code
  JULIA_NUM_THREADS = 8
Status `C:\Users\j.tucker\master\lib\GNCReinforcementLearning\Project.toml`
⌅ [052768ef] CUDA v3.13.1
⌃ [a93c6f00] DataFrames v1.3.6
  [31c24e10] Distributions v0.25.107
⌅ [587475ba] Flux v0.12.10
  [af0dad03] GNCModelling v0.9.0 `..\GNCModelling`
⌅ [8197267c] IntervalSets v0.5.4
  [033835bb] JLD2 v0.4.46
  [e4faabce] PProf v3.1.0
⌅ [91a5bcdd] Plots v1.39.0
⌃ [92933f4c] ProgressMeter v1.9.0
  [158674fc] ReinforcementLearning v0.10.2
  [860ef19b] StableRNGs v1.0.1
  [90137ffa] StaticArrays v1.9.3
  [9abbd945] Profile
  [9a3f8284] Random
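
The script (slightly modified from the ReinforcementLearning.jl PPO pendulum example) is below: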

using ReinforcementLearning
using StableRNGs
using Flux
using Flux.Losses
using Distributions
using Random
using IntervalSets

struct PendulumEnv3Params{T}
    max_speed::T
    max_torque::T
    g::T
    m::T
    l::T
    dt::T
    max_steps::Int
end

mutable struct PendulumEnv3{A,T,R<:AbstractRNG} <: AbstractEnv
    params::PendulumEnv3Params{T}
    action_space::A
    action::T
    observation_space::Space{Vector{ClosedInterval{T}}}
    state::Vector{T}
    done::Bool
    t::Int
    rng::R
    reward::T
    n_actions::Int
end

"""
    PendulumEnv3(;kwargs...)

# Keyword arguments

- `T = Float64`
- `max_speed = T(8)`
- `max_torque = T(2)`
- `g = T(10)`
- `m = T(1)`
- `l = T(1)`
- `dt = T(0.05)`
- `max_steps = 200`
- `continuous::Bool = true`
- `n_actions::Int = 3`
- `rng = Random.GLOBAL_RNG`
"""
function PendulumEnv3(;
    T = Float64,
    max_speed = T(8),
    max_torque = T(2),
    g = T(10),
    m = T(1),
    l = T(1),
    dt = T(0.05),
    max_steps = 200,
    continuous::Bool = true,
    n_actions::Int = 3,
    rng = Random.GLOBAL_RNG,
)
    high = T.([1, 1, max_speed])
    action_space = continuous ? -2.0..2.0 : Base.OneTo(n_actions)
    env = PendulumEnv3(
        PendulumEnv3Params(max_speed, max_torque, g, m, l, dt, max_steps),
        action_space,
        zero(T),
        Space(ClosedInterval{T}.(-high, high)),
        zeros(T, 2),
        false,
        0,
        rng,
        zero(T),
        n_actions,
    )
    reset!(env)
    env
end

Random.seed!(env::PendulumEnv3, seed) = Random.seed!(env.rng, seed)

pendulum_observation(s) = [cos(s[1]), sin(s[1]), s[2]]
angle_normalize(x) = Base.mod((x + Base.π), (2 * Base.π)) - Base.π

RLBase.action_space(env::PendulumEnv3) = env.action_space
RLBase.state_space(env::PendulumEnv3) = env.observation_space
RLBase.reward(env::PendulumEnv3) = env.reward
RLBase.is_terminated(env::PendulumEnv3) = env.done
RLBase.state(env::PendulumEnv3) = pendulum_observation(env.state)

function RLBase.reset!(env::PendulumEnv3{A,T}) where {A,T}
    env.state[1] = 2 * π * (rand(env.rng, T) .- 1)
    env.state[2] = 2 * (rand(env.rng, T) .- 1)
    env.action = zero(T)
    env.t = 0
    env.done = false
    env.reward = zero(T)
    nothing
end

function (env::PendulumEnv3)(a::Union{Int, AbstractFloat})
    @assert a in env.action_space
    env.action = torque(env, a)
    _step!(env, env.action)
end

# One Euler step of the pendulum dynamics; the reward is the negative quadratic cost
function _step!(env::PendulumEnv3, a)
    env.t += 1
    th, thdot = env.state
    a = clamp(a, -env.params.max_torque, env.params.max_torque)
    costs = angle_normalize(th)^2 + 0.1 * thdot^2 + 0.001 * a^2
    newthdot =
        thdot +
        (
            -3 * env.params.g / (2 * env.params.l) * sin(th + pi) +
            3 * a / (env.params.m * env.params.l^2)
        ) * env.params.dt
    th += newthdot * env.params.dt
    newthdot = clamp(newthdot, -env.params.max_speed, env.params.max_speed)
    env.state[1] = th
    env.state[2] = newthdot
    env.done = env.t >= env.params.max_steps
    env.reward = -costs
    nothing
end

function torque(env::PendulumEnv3{<:Base.OneTo}, a::Int)
    return (4 / (env.n_actions - 1)) * (a - (env.n_actions - 1) / 2 - 1)
end

torque(env::PendulumEnv3{<:ClosedInterval}, a::AbstractFloat) = a


# Experiment setup: seeded RNG and a reference env to read the action/state spaces
seed = 123
rng = StableRNG(seed)
inner_env = PendulumEnv3(T=Float32, rng=rng)
A = action_space(inner_env)
low = A.left
high = A.right
ns = length(state(inner_env))

neurons = 2048
N_ENV = 8
UPDATE_FREQ = 2048
# N_ENV pendulums stepped in parallel; actions are rescaled and clamped to the torque range
env = MultiThreadEnv([
    PendulumEnv3(T=Float32, rng=StableRNG(hash(seed + i))) |>
    env -> ActionTransformedEnv(env, action_mapping=x -> clamp(x * 2, low, high)) for i in 1:N_ENV
])

init = glorot_uniform(rng)  # note: unused below, the layers call glorot_uniform(rng) inline

# PPO agent: Gaussian-policy actor and critic MLPs (width `neurons`), moved to the GPU
agent = Agent(
    policy=PPOPolicy(
        approximator=ActorCritic(
            actor=GaussianNetwork(
                pre=Chain(
                    Dense(ns, neurons, relu; init=glorot_uniform(rng)),
                    Dense(neurons, neurons, relu; init=glorot_uniform(rng)),
                ),
                μ=Chain(Dense(neurons, 1, tanh; init=glorot_uniform(rng)), vec),
                logσ=Chain(Dense(neurons, 1; init=glorot_uniform(rng)), vec), normalizer=x -> x
            ),
            critic=Chain(
                Dense(ns, neurons, relu; init=glorot_uniform(rng)),
                Dense(neurons, neurons, relu; init=glorot_uniform(rng)),
                Dense(neurons, 1; init=glorot_uniform(rng)),
            ),
            optimizer=ADAM(3e-4),
        ) |> gpu,
        γ=0.99f0,
        λ=1.00f0,
        clip_range=0.2f0,
        max_grad_norm=0.5f0,
        n_epochs=10,
        n_microbatches=32,
        actor_loss_weight=1.0f0,
        critic_loss_weight=0.5f0,
        entropy_loss_weight=0.00f0,
        dist=Normal,
        rng=rng,
        update_freq=UPDATE_FREQ,
    ),
    trajectory=PPOTrajectory(;
        capacity=UPDATE_FREQ,
        state=Matrix{Float32} => (ns, N_ENV),
        action=Vector{Float32} => (N_ENV,),
        action_log_prob=Vector{Float32} => (N_ENV,),
        reward=Vector{Float32} => (N_ENV,),
        terminal=Vector{Bool} => (N_ENV,),
    ),
)

# Run for 100_000 steps, recording the per-episode reward of each env
stop_condition = StopAfterStep(100_000, is_show_progress=!haskey(ENV, "CI"))
hook = TotalBatchRewardPerEpisode(N_ENV)
ex = Experiment(agent, env, stop_condition, hook, "# Play Pendulum with PPO")
run(ex)

# Pull the trained actor back to the CPU
NN = cpu(ex.policy.policy.approximator.actor)
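
To test the GPU memory-leak idea from the reply above, here is a rough sketch of timing two runs in the same session with an explicit cleanup in between (the file name pendulum_ppo.jl is just a placeholder for the script above):

using CUDA

t1 = @elapsed include("pendulum_ppo.jl")   # first run in a fresh REPL
CUDA.memory_status()                       # how much GPU memory is still held after the run?
GC.gc(); CUDA.reclaim()                    # release unreachable GPU buffers back to the driver
t2 = @elapsed include("pendulum_ppo.jl")   # second run in the same session
@show t1 t2

If the second run is fast after GC.gc() and CUDA.reclaim(), that would point at GPU memory pressure rather than compilation or paging.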