Flux Pendulum DDPG example fails on GPU

I am pretty new to Flux and am trying to get the Pendulum environment (using a slightly modified Reinforce.jl environment: Pendulum update) running with DDPG on newer package versions (based on the Flux model-zoo).
It seems to work on my CPU, but on the GPU I get a weird error that seems to be related to CUDA.jl not liking the multiplication in the last layer of the actor network (line 84 of my script):

actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init),
              x -> x * ACTION_BOUND) |> gpu

Probably the stacktrace is easy to read once you know how CUDA and Zygote interact. Can anyone point me in the right direction? Maybe it is also not necessary to have the multiplication there at all (and to instead scale the action when calling step!, as sketched below), but I would still be interested to know where the error comes from.
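
For reference, the alternative I have in mind would drop the scaling layer from the Chain and apply the bound only when interacting with the environment; a rough, untested sketch (same names as in the full script below):

# actor without the final scaling layer
actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init)) |> gpu

# scale to the bound only when choosing an action for the environment
function action(env; train=true)
  s = Flux.unsqueeze(env.state, 2)
  act_pred = ACTION_BOUND .* cpu(actor(s |> gpu)) .+
             train * noise_scale * sample_noise(ou)        # add noise only in training
  return clamp.(act_pred[1], -ACTION_BOUND, ACTION_BOUND)  # assumes ACTION_SIZE = 1
end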

Julia version 1.5.2
CUDA version 10.2

Package Versions:

  [fbb218c0] BSON v0.2.6
  [052768ef] CUDA v2.1.0
  [864edb3b] DataStructures v0.18.8
  [31c24e10] Distributions v0.23.8
  [587475ba] Flux v0.11.2
  [91a5bcdd] Plots v1.7.3
  [0376cc21] Reinforce v0.3.0
  [6a2ea274] Torch v0.1.2
  [e88e6eb3] Zygote v0.5.9

Stacktrace:

┌ Warning: Performing scalar operations on GPU arrays: This is very slow, consider disallowing these operations with `allowscalar(false)`
└ @ GPUArrays ~/.julia/packages/GPUArrays/ZxsKE/src/host/indexing.jl:43
ERROR: LoadError: MethodError: no method matching Float32(::ForwardDiff.Dual{Nothing,Float64,1})
Closest candidates are:
  Float32(::Real, !Matched::RoundingMode) where T<:AbstractFloat at rounding.jl:200
  Float32(::T) where T<:Number at boot.jl:716
  Float32(!Matched::Irrational{:inv4π}) at irrationals.jl:190
  ...
Stacktrace:
 [1] (::CUDA.var"#895#896"{Float32})(::ForwardDiff.Dual{Nothing,Float64,1}) at /homes2/ipdm/llanger/.julia/packages/CUDA/0p5fn/src/broadcast.jl:21
 [2] (::Zygote.var"#1091#1094"{CUDA.var"#895#896"{Float32}})(::Float64) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:182
 [3] _broadcast_getindex_evalf at ./broadcast.jl:648 [inlined]
 [4] _broadcast_getindex at ./broadcast.jl:621 [inlined]
 [5] getindex at ./broadcast.jl:575 [inlined]
 [6] copy at ./broadcast.jl:876 [inlined]
 [7] materialize(::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2},Nothing,Zygote.var"#1091#1094"{CUDA.var"#895#896"{Float32}},Tuple{CuArray{Float64,2}}}) at ./broadcast.jl:837
 [8] broadcast_forward(::Function, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:188
 [9] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:200 [inlined]
 [10] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [11] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/lib.jl:188 [inlined]
 [12] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [13] broadcasted at ./broadcast.jl:1257 [inlined]
 [14] Dense at /homes2/ipdm/llanger/.julia/packages/Flux/q3zeA/src/layers/basic.jl:137 [inlined]
 [15] _pullback(::Zygote.Context, ::Dense{typeof(identity),CuArray{Float32,2},CuArray{Float32,1}}, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [16] crit at /net/work/llanger/DDPG_reinforce_v6.jl:100 [inlined]
 [17] _pullback(::Zygote.Context, ::crit, ::CuArray{Float32,2}, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [18] loss_act at /net/work/llanger/DDPG_reinforce_v6.jl:135 [inlined]
 [19] _pullback(::Zygote.Context, ::typeof(loss_act), ::CuArray{Float32,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [20] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/lib.jl:188 [inlined]
 [21] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [22] #4 at /net/work/llanger/DDPG_reinforce_v6.jl:121 [inlined]
 [23] _pullback(::Zygote.Context, ::var"#4#5"{typeof(loss_act),Tuple{CuArray{Float32,2}}}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [24] pullback(::Function, ::Params) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface.jl:172
 [25] gradient(::Function, ::Params) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface.jl:53
 [26] update_model!(::Chain{Tuple{Dense{typeof(relu),CuArray{Float32,2},CuArray{Float32,1}},Dense{typeof(relu),CuArray{Float32,2},CuArray{Float32,1}},Dense{typeof(tanh),CuArray{Float32,2},CuArray{Float32,1}}}}, ::ADAM, ::Function, ::CuArray{Float32,2}) at /net/work/llanger/DDPG_reinforce_v6.jl:121
 [27] replay() at /net/work/llanger/DDPG_reinforce_v6.jl:150
 [28] episode!(::Pendulum; train::Bool, show::Bool) at /net/work/llanger/DDPG_reinforce_v6.jl:185
 [29] top-level scope at /net/work/llanger/DDPG_reinforce_v6.jl:217
 [30] include(::Function, ::Module, ::String) at ./Base.jl:380
 [31] include(::Module, ::String) at ./Base.jl:368
 [32] exec_options(::Base.JLOptions) at ./client.jl:296
 [33] _start() at ./client.jl:506
in expression starting at /net/work/llanger/DDPG_reinforce_v6.jl:215
Max steps= 3000, Max episodes= 200, Actor_target + Action_bounds, loss act mean

Code:

using Flux, Printf, Zygote, CUDA
using Flux.Optimise: update!
using BSON: @save, @load
using Statistics: mean
using DataStructures: CircularBuffer
using Distributions: sample, Uniform
using Random
using Reinforce
using Reinforce.PendulumEnv: Pendulum
using Plots
gr()

Random.seed!(123)

#Load game environment
env = Pendulum()
reset!(env)
# ----------------------------- Parameters -------------------------------------
const STATE_SIZE = length(env.state) # state is modeled as struct
const ACTION_SIZE = length(env.a)
const ACTION_BOUND = actions(env, env.state).hi[1]
const MAX_EP = 50_000
const MAX_EP_STEPS = 200
const UPDATE_EVERY = 1

const BATCH_SIZE = 64
const MEM_SIZE = 100_000 
const MIN_EXP_SIZE = 50_000 

const γ = 0.99f0     # discount rate

const τ = 1f-3 # for running average while updating target networks
const η_act = 1f-4   # Learning rate
const η_crit = 1f-3
const L2_DECAY = 0.01f0

# Ornstein-Uhlenbeck Noise params
const μ = 0f0
const θ = 0.15f0
const σ = 0.2f0
# --------------------------------- Memory ------------------------------------
memory = CircularBuffer{Any}(MEM_SIZE)

function getData(batch_size = BATCH_SIZE)
  # Getting data in shape
  minibatch = sample(memory, batch_size)
  x = hcat(minibatch...)

  s      =   hcat(x[1, :]...) |> gpu
  a      =   hcat(x[2, :]...) |> gpu
  r      =   hcat(x[3, :]...) |> gpu
  s′     =   hcat(x[4, :]...) |> gpu
  s_mask = .!hcat(x[5, :]...) |> gpu

  return s, a, r, s′, s_mask
end
# -------------------------------- Action Noise --------------------------------
struct OUNoise
  μ
  θ
  σ
  X
end

ou = OUNoise(μ, θ, σ, zeros(Float32, ACTION_SIZE))

function sample_noise(ou::OUNoise)
  dx     = ou.θ * (ou.μ .- ou.X)
  dx   .+= ou.σ * randn(Float32, length(ou.X))
  ou.X .+= dx
end

# Noise scale
const τ_ = 25
const ϵ  = exp(-1f0 / τ_)
noise_scale = 1f0 #/ ACTION_BOUND
# ----------------------------- Model Architecture -----------------------------
w_init(dims...) = rand(Uniform(-3f-3, 3f-3), dims...)

actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init),
              x -> x * ACTION_BOUND        # not working on gpu / added in action()
             ) |> gpu

actor_target = deepcopy(actor)

# Critic model
struct crit
  state_crit
  act_crit
  sa_crit
end

Flux.@functor crit

function (c::crit)(state, action)
  s = c.state_crit(state)
  a = c.act_crit(action)
  c.sa_crit(relu.(s .+ a))
end

Base.deepcopy(c::crit) = crit(deepcopy(c.state_crit),
                              deepcopy(c.act_crit),
                              deepcopy(c.sa_crit))

critic = crit(Chain(Dense(STATE_SIZE, 400, relu), Dense(400, 300)) |> gpu,
              Dense(ACTION_SIZE, 300) |> gpu,
              Dense(300, 1, initW=w_init) |> gpu)

critic_target = deepcopy(critic)
# ---------------------- Param Update Functions --------------------------------
function update_target!(target, model; τ = 1f0)
  for (p_t, p_m) in zip(params(target), params(model))
    p_t .= (1f0 - τ) * p_t .+ τ * p_m
  end
end

function update_model!(model, opt, loss, inp...)
  grads = gradient(()->loss(inp...), params(model))
  update!(opt, params(model), grads)
end
# ---------------------------------- Training ----------------------------------
# Losses
function L2_loss(model)
  l2_loss = sum(map(p->sum(p.^2), params(model)))
  return L2_DECAY * l2_loss
end

loss_crit(y, s, a) = Flux.mse(critic(s, a), y) + L2_loss(critic)

function loss_act(s)
  actions = actor(s |> gpu)
  crit_out = critic(s, actions)
  return -sum(crit_out)  # sum
end
# Optimizers
#Optimiser(WeightDecay(lambda), opt)
opt_crit = ADAM(η_crit)
opt_act  = ADAM(η_act)

function replay()
  s, a, r, s′, s_mask = getData()
  # update Critic
  a′ = actor_target(s′ |> gpu)
  v′ = critic_target(s′, a′)
  y = r .+ γ * v′ .* s_mask	# set v′ to 0 where s_ is terminal state
  update_model!(critic, opt_crit, loss_crit, y, s, a)
  update_model!(actor, opt_act, loss_act, s)
  # Update Target models
  update_target!(actor_target, actor; τ = τ)
  update_target!(critic_target, critic; τ = τ)
  return nothing
end
# ---------------------------- Helper Functions --------------------------------
# Stores tuple of state, action, reward, next_state, and done
remember(state, action, reward, next_state, done) =
  push!(memory, [(state, action, reward, next_state)..., done])

# Choose action according to policy PendulumPolicy
function action(env; train=true)
  s = Flux.unsqueeze(env.state, 2)
  act_pred = cpu(actor(s |> gpu)) .+                       # scale action to bound
             train * noise_scale * sample_noise(ou)        # add noise only in training
  return clamp.(act_pred[1], -ACTION_BOUND, ACTION_BOUND)  # returns action, assumes ACTION_SIZE=1
end

function episode!(env::Pendulum; train=true, show=false)
  reset!(env)
  total_reward = 0f0
  for ep = 1:MAX_EP_STEPS
    s = env.state
    a = action(env, train=train)
    r, s′ = step!(env, s, a)
    total_reward += r
    if show
      sleep(0.001)
      gui(plot(env))
    end
    if train
      remember(s, a, r, s′, finished(env, s′))
      finished(env, s′) && break
      if ep % UPDATE_EVERY == 0
        replay()
      end
    end
  end
  return total_reward
end
# -------------------------------- Testing -------------------------------------
# Returns average score over 100 episodes
function test(env::Pendulum; show=false)
  score_mean = 0f0
  for e=1:100
    total_reward = episode!(env, train=false, show=show)
    score_mean += total_reward / 100
  end
  return score_mean
end
# ------------------------------ Training --------------------------------------
# Populate memory with random actions
reset!(env)
Random.seed!(123)
for e=1:MIN_EXP_SIZE
  s = env.state
  a = (2rand(Float32) * ACTION_BOUND) - ACTION_BOUND
  r, s′ = step!(env, s, a)
  remember(s, a, r, s′, finished(env, s′))
end

println("Max steps= $(MAX_EP_STEPS), Max episodes= $(MAX_EP), Actor_target + Action_bounds")
total_reward=Float32[]
score_mean=Float32[]
for e=1:MAX_EP
  global noise_scale
  push!(total_reward, episode!(env, train=true))
  tr = @sprintf "%9.3f" total_reward[e]
  print("Episode: $e | Score: $tr | ")
  push!(score_mean, test(env))
  sm = @sprintf "%9.3f" score_mean[e]
  println("Mean score over 100 test episodes: $sm")
  noise_scale *= ϵ
end

Is ACTION_BOUND a Float64? Maybe const ACTION_BOUND = Float32(actions(env, env.state).hi[1]) helps… (I didn't test it).
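
Something like this (untested); the guess is that a Float64 ACTION_BOUND promotes the actor output to a Float64 CuArray, and that type mismatch is what trips up Zygote's ForwardDiff-based broadcast adjoint inside the critic:

# force the bound to Float32 so the scaling layer keeps the CuArray in Float32
const ACTION_BOUND = Float32(actions(env, env.state).hi[1])

actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init),
              x -> x * ACTION_BOUND) |> gpu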

If the problem is with Reinforce.jl you may have more luck with the DDPG implementation in ReinforcementLearning.jl.


OMG, this works like a charm :see_no_evil:
I actually like the Flux model-zoo implementation because everything is in one spot; it is much easier to grasp (for me at least). I will check whether performance is better when adjusting for ACTION_BOUND in the actor network or when calling the environment, and then make a pull request so the model zoo gets an updated version.
Thanks a lot!