I am pretty new to Flux and am trying to get the Pendulum environment (using a slightly modified Reinforce.jl environment: Pendulum update) running with DDPG on newer package versions (based on the Flux model-zoo).
It seems to work on my CPU, but on the GPU I get a weird error which seems to be related to CUDA.jl not liking the multiplication in the last layer of the actor network in line 84:
actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init),
              x -> x * ACTION_BOUND) |> gpu
Probably the stacktrace is easy to read once you know how CUDA and Zygote interact. Can anyone point me in the right direction? Maybe it is also just not necessary to have the multiplication there (but instead do the scaling when calling step!), but I would still be interested to know where the error comes from.
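For context, this is roughly what I mean by dropping the scaling layer and multiplying in my action() helper instead. It is only a sketch of the idea, reusing the names from the full code below; converting ACTION_BOUND to Float32 is my own guess that the Float64 bound is part of the problem:

# Sketch: actor without the final x -> x * ACTION_BOUND layer
actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init)) |> gpu

function action(env; train=true)
    s = Flux.unsqueeze(env.state, 2)
    # scale the tanh output to the action bound on the CPU side,
    # then add exploration noise only during training
    act_pred = Float32(ACTION_BOUND) .* cpu(actor(s |> gpu)) .+
               train * noise_scale * sample_noise(ou)
    return clamp.(act_pred[1], -ACTION_BOUND, ACTION_BOUND) # assumes ACTION_SIZE = 1
end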
Julia version 1.5.2
CUDA version 10.2
Package Versions:
[fbb218c0] BSON v0.2.6
[052768ef] CUDA v2.1.0
[864edb3b] DataStructures v0.18.8
[31c24e10] Distributions v0.23.8
[587475ba] Flux v0.11.2
[91a5bcdd] Plots v1.7.3
[0376cc21] Reinforce v0.3.0
[6a2ea274] Torch v0.1.2
[e88e6eb3] Zygote v0.5.9
Stacktrace:
┌ Warning: Performing scalar operations on GPU arrays: This is very slow, consider disallowing these operations with `allowscalar(false)`
└ @ GPUArrays ~/.julia/packages/GPUArrays/ZxsKE/src/host/indexing.jl:43
ERROR: LoadError: MethodError: no method matching Float32(::ForwardDiff.Dual{Nothing,Float64,1})
Closest candidates are:
Float32(::Real, !Matched::RoundingMode) where T<:AbstractFloat at rounding.jl:200
Float32(::T) where T<:Number at boot.jl:716
Float32(!Matched::Irrational{:inv4π}) at irrationals.jl:190
...
Stacktrace:
[1] (::CUDA.var"#895#896"{Float32})(::ForwardDiff.Dual{Nothing,Float64,1}) at /homes2/ipdm/llanger/.julia/packages/CUDA/0p5fn/src/broadcast.jl:21
[2] (::Zygote.var"#1091#1094"{CUDA.var"#895#896"{Float32}})(::Float64) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:182
[3] _broadcast_getindex_evalf at ./broadcast.jl:648 [inlined]
[4] _broadcast_getindex at ./broadcast.jl:621 [inlined]
[5] getindex at ./broadcast.jl:575 [inlined]
[6] copy at ./broadcast.jl:876 [inlined]
[7] materialize(::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2},Nothing,Zygote.var"#1091#1094"{CUDA.var"#895#896"{Float32}},Tuple{CuArray{Float64,2}}}) at ./broadcast.jl:837
[8] broadcast_forward(::Function, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:188
[9] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:200 [inlined]
[10] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
[11] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/lib.jl:188 [inlined]
[12] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
[13] broadcasted at ./broadcast.jl:1257 [inlined]
[14] Dense at /homes2/ipdm/llanger/.julia/packages/Flux/q3zeA/src/layers/basic.jl:137 [inlined]
[15] _pullback(::Zygote.Context, ::Dense{typeof(identity),CuArray{Float32,2},CuArray{Float32,1}}, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
[16] crit at /net/work/llanger/DDPG_reinforce_v6.jl:100 [inlined]
[17] _pullback(::Zygote.Context, ::crit, ::CuArray{Float32,2}, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
[18] loss_act at /net/work/llanger/DDPG_reinforce_v6.jl:135 [inlined]
[19] _pullback(::Zygote.Context, ::typeof(loss_act), ::CuArray{Float32,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
[20] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/lib.jl:188 [inlined]
[21] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
[22] #4 at /net/work/llanger/DDPG_reinforce_v6.jl:121 [inlined]
[23] _pullback(::Zygote.Context, ::var"#4#5"{typeof(loss_act),Tuple{CuArray{Float32,2}}}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
[24] pullback(::Function, ::Params) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface.jl:172
[25] gradient(::Function, ::Params) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface.jl:53
[26] update_model!(::Chain{Tuple{Dense{typeof(relu),CuArray{Float32,2},CuArray{Float32,1}},Dense{typeof(relu),CuArray{Float32,2},CuArray{Float32,1}},Dense{typeof(tanh),CuArray{Float32,2},CuArray{Float32,1}}}}, ::ADAM, ::Function, ::CuArray{Float32,2}) at /net/work/llanger/DDPG_reinforce_v6.jl:121
[27] replay() at /net/work/llanger/DDPG_reinforce_v6.jl:150
[28] episode!(::Pendulum; train::Bool, show::Bool) at /net/work/llanger/DDPG_reinforce_v6.jl:185
[29] top-level scope at /net/work/llanger/DDPG_reinforce_v6.jl:217
[30] include(::Function, ::Module, ::String) at ./Base.jl:380
[31] include(::Module, ::String) at ./Base.jl:368
[32] exec_options(::Base.JLOptions) at ./client.jl:296
[33] _start() at ./client.jl:506
in expression starting at /net/work/llanger/DDPG_reinforce_v6.jl:215
Code:
using Flux, Printf, Zygote, CUDA
using Flux.Optimise: update!
using BSON: @save, @load
using Statistics: mean
using DataStructures: CircularBuffer
using Distributions: sample, Uniform
using Random
using Reinforce
using Reinforce.PendulumEnv: Pendulum
using Plots
gr()
Random.seed!(123)
#Load game environment
env = Pendulum()
reset!(env)
# ----------------------------- Parameters -------------------------------------
const STATE_SIZE = length(env.state) # state is modeled as struct
const ACTION_SIZE = length(env.a)
const ACTION_BOUND = actions(env, env.state).hi[1]
const MAX_EP = 50_000
const MAX_EP_STEPS = 200
const UPDATE_EVERY = 1
const BATCH_SIZE = 64
const MEM_SIZE = 100_000
const MIN_EXP_SIZE = 50_000
const γ = 0.99f0 # discount rate
const τ = 1f-3 # for running average while updating target networks
const η_act = 1f-4 # Learning rate
const η_crit = 1f-3
const L2_DECAY = 0.01f0
# Ornstein-Uhlenbeck Noise params
const μ = 0f0
const θ = 0.15f0
const σ = 0.2f0
# --------------------------------- Memory ------------------------------------
memory = CircularBuffer{Any}(MEM_SIZE)
function getData(batch_size = BATCH_SIZE)
# Getting data in shape
minibatch = sample(memory, batch_size)
x = hcat(minibatch...)
s = hcat(x[1, :]...) |> gpu
a = hcat(x[2, :]...) |> gpu
r = hcat(x[3, :]...) |> gpu
s′ = hcat(x[4, :]...) |> gpu
s_mask = .!hcat(x[5, :]...) |> gpu
return s, a, r, s′, s_mask
end
# -------------------------------- Action Noise --------------------------------
struct OUNoise
μ
θ
σ
X
end
ou = OUNoise(μ, θ, σ, zeros(Float32, ACTION_SIZE))
function sample_noise(ou::OUNoise)
dx = ou.θ * (ou.μ .- ou.X)
dx .+= ou.σ * randn(Float32, length(ou.X))
ou.X .+= dx
end
# Noise scale
const τ_ = 25
const ϵ = exp(-1f0 / τ_)
noise_scale = 1f0 #/ ACTION_BOUND
# ----------------------------- Model Architecture -----------------------------
w_init(dims...) = rand(Uniform(-3f-3, 3f-3), dims...)
actor = Chain(
Dense(STATE_SIZE, 400, relu),
Dense(400, 300, relu),
Dense(300, ACTION_SIZE, tanh, initW=w_init),
x -> x * ACTION_BOUND # not working on gpu/added in action()
) |> gpu
actor_target = deepcopy(actor)
# Critic model
struct crit
state_crit
act_crit
sa_crit
end
Flux.@functor crit
function (c::crit)(state, action)
s = c.state_crit(state)
a = c.act_crit(action)
c.sa_crit(relu.(s .+ a))
end
Base.deepcopy(c::crit) = crit(deepcopy(c.state_crit),
deepcopy(c.act_crit),
deepcopy(c.sa_crit))
critic = crit(Chain(Dense(STATE_SIZE, 400, relu), Dense(400, 300)) |> gpu,
Dense(ACTION_SIZE, 300) |> gpu,
Dense(300, 1, initW=w_init) |> gpu)
critic_target = deepcopy(critic)
# ---------------------- Param Update Functions --------------------------------
function update_target!(target, model; τ = 1f0)
for (p_t, p_m) in zip(params(target), params(model))
p_t .= (1f0 - τ) * p_t .+ τ * p_m
end
end
function update_model!(model, opt, loss, inp...)
grads = gradient(()->loss(inp...), params(model))
update!(opt, params(model), grads)
end
# ---------------------------------- Training ----------------------------------
# Losses
function L2_loss(model)
l2_loss = sum(map(p->sum(p.^2), params(model)))
return L2_DECAY * l2_loss
end
loss_crit(y, s, a) = Flux.mse(critic(s, a), y) + L2_loss(critic)
function loss_act(s)
actions = actor(s |> gpu)
crit_out = critic(s, actions)
return -sum(crit_out) # sum
end
# Optimizers
#Optimiser(WeightDecay(lambda), opt)
opt_crit = ADAM(η_crit)
opt_act = ADAM(η_act)
function replay()
s, a, r, s′, s_mask = getData()
# update Critic
a′ = actor_target(s′ |> gpu)
v′ = critic_target(s′, a′)
y = r .+ γ * v′ .* s_mask # set v′ to 0 where s_ is terminal state
update_model!(critic, opt_crit, loss_crit, y, s, a)
update_model!(actor, opt_act, loss_act, s)
# Update Target models
update_target!(actor_target, actor; τ = τ)
update_target!(critic_target, critic; τ = τ)
return nothing
end
# ---------------------------- Helper Functions --------------------------------
# Stores tuple of state, action, reward, next_state, and done
remember(state, action, reward, next_state, done) =
push!(memory, [(state, action, reward, next_state)..., done])
# Choose action according to policy PendulumPolicy
function action(env; train=true)
s = Flux.unsqueeze(env.state, 2)
act_pred = cpu(actor(s |> gpu)) .+ # scale action to bound
train * noise_scale * sample_noise(ou) # add noise only in training
return clamp.(act_pred[1], -ACTION_BOUND, ACTION_BOUND) # returns action, assumes ACTION_SIZE=1
end
function episode!(env::Pendulum; train=true, show=false)
reset!(env)
total_reward=0f0
for ep=1:MAX_EP_STEPS
s = env.state
a = action(env, train=train)
r, s′ = step!(env, s, a)
total_reward += r
if show == true
sleep(0.001)
gui(plot(env))
end
if train == true
remember(s, a, r, s′, finished(env, s′))
finished(env, s′) && break
if ep % UPDATE_EVERY == 0
replay()
end
end
end
return total_reward
end
# -------------------------------- Testing -------------------------------------
# Returns average score over 100 episodes
function test(env::Pendulum; show=false)
score_mean = 0f0
for e=1:100
total_reward = episode!(env, train=false, show=show)
score_mean += total_reward / 100
end
return score_mean
end
# ------------------------------ Training --------------------------------------
# Populate memory with random actions
reset!(env)
Random.seed!(123)
for e=1:MIN_EXP_SIZE
s = env.state
a = (2rand(Float32) * ACTION_BOUND) - ACTION_BOUND
r, s′ = step!(env, s, a)
remember(s, a, r, s′, finished(env, s′))
end
println("Max steps= $(MAX_EP_STEPS), Max episodes= $(MAX_EP), Actor_target + Action_bounds")
total_reward=Float32[]
score_mean=Float32[]
for e=1:MAX_EP
global noise_scale
push!(total_reward, episode!(env, train=true))
tr = @sprintf "%9.3f" total_reward[e]
print("Episode: $e | Score: $tr | ")
push!(score_mean, test(env))
sm = @sprintf "%9.3f" score_mean[e]
println("Mean score over 100 test episodes: $sm")
noise_scale *= ϵ
end