I am pretty new to Flux and am trying to get the Pendulum environment (using a slightly modified Reinforce.jl environment: Pendulum update) running with DDPG on newer package versions (based on the Flux model-zoo).
It seems to work on my CPU, but on the GPU I get a weird error that seems to be related to CUDA.jl not liking the multiplication in the last layer of the actor network (line 84 of my script):
actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init),
              x -> x * ACTION_BOUND) |> gpu
The stacktrace is probably easy to read once you know how CUDA.jl and Zygote interact. Can anyone point me in the right direction? Maybe the multiplication is not needed there at all (I could scale the action when calling step! instead), but I would still be interested to know where the error comes from.
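In case it is relevant: I think ACTION_BOUND ends up being a Float64 here (it comes from Reinforce's actions(env, env.state).hi[1]), so one variant I considered trying (just a sketch, not verified to fix anything) is forcing the scaling to stay in Float32 so the actor output is not promoted to Float64 on the GPU:

actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init),
              x -> x .* Float32(ACTION_BOUND)) |> gpu   # keep the output Float32 on the GPU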
Julia version 1.5.2
CUDA version 10.2
Package Versions:
  [fbb218c0] BSON v0.2.6
  [052768ef] CUDA v2.1.0
  [864edb3b] DataStructures v0.18.8
  [31c24e10] Distributions v0.23.8
  [587475ba] Flux v0.11.2
  [91a5bcdd] Plots v1.7.3
  [0376cc21] Reinforce v0.3.0
  [6a2ea274] Torch v0.1.2
  [e88e6eb3] Zygote v0.5.9
Stacktrace:
┌ Warning: Performing scalar operations on GPU arrays: This is very slow, consider disallowing these operations with `allowscalar(false)`
└ @ GPUArrays ~/.julia/packages/GPUArrays/ZxsKE/src/host/indexing.jl:43
ERROR: LoadError: MethodError: no method matching Float32(::ForwardDiff.Dual{Nothing,Float64,1})
Closest candidates are:
  Float32(::Real, !Matched::RoundingMode) where T<:AbstractFloat at rounding.jl:200
  Float32(::T) where T<:Number at boot.jl:716
  Float32(!Matched::Irrational{:inv4π}) at irrationals.jl:190
  ...
Stacktrace:
 [1] (::CUDA.var"#895#896"{Float32})(::ForwardDiff.Dual{Nothing,Float64,1}) at /homes2/ipdm/llanger/.julia/packages/CUDA/0p5fn/src/broadcast.jl:21
 [2] (::Zygote.var"#1091#1094"{CUDA.var"#895#896"{Float32}})(::Float64) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:182
 [3] _broadcast_getindex_evalf at ./broadcast.jl:648 [inlined]
 [4] _broadcast_getindex at ./broadcast.jl:621 [inlined]
 [5] getindex at ./broadcast.jl:575 [inlined]
 [6] copy at ./broadcast.jl:876 [inlined]
 [7] materialize(::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2},Nothing,Zygote.var"#1091#1094"{CUDA.var"#895#896"{Float32}},Tuple{CuArray{Float64,2}}}) at ./broadcast.jl:837
 [8] broadcast_forward(::Function, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:188
 [9] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/broadcast.jl:200 [inlined]
 [10] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [11] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/lib.jl:188 [inlined]
 [12] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [13] broadcasted at ./broadcast.jl:1257 [inlined]
 [14] Dense at /homes2/ipdm/llanger/.julia/packages/Flux/q3zeA/src/layers/basic.jl:137 [inlined]
 [15] _pullback(::Zygote.Context, ::Dense{typeof(identity),CuArray{Float32,2},CuArray{Float32,1}}, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [16] crit at /net/work/llanger/DDPG_reinforce_v6.jl:100 [inlined]
 [17] _pullback(::Zygote.Context, ::crit, ::CuArray{Float32,2}, ::CuArray{Float64,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [18] loss_act at /net/work/llanger/DDPG_reinforce_v6.jl:135 [inlined]
 [19] _pullback(::Zygote.Context, ::typeof(loss_act), ::CuArray{Float32,2}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [20] adjoint at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/lib/lib.jl:188 [inlined]
 [21] _pullback at /homes2/ipdm/llanger/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [22] #4 at /net/work/llanger/DDPG_reinforce_v6.jl:121 [inlined]
 [23] _pullback(::Zygote.Context, ::var"#4#5"{typeof(loss_act),Tuple{CuArray{Float32,2}}}) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface2.jl:0
 [24] pullback(::Function, ::Params) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface.jl:172
 [25] gradient(::Function, ::Params) at /homes2/ipdm/llanger/.julia/packages/Zygote/c0awc/src/compiler/interface.jl:53
 [26] update_model!(::Chain{Tuple{Dense{typeof(relu),CuArray{Float32,2},CuArray{Float32,1}},Dense{typeof(relu),CuArray{Float32,2},CuArray{Float32,1}},Dense{typeof(tanh),CuArray{Float32,2},CuArray{Float32,1}}}}, ::ADAM, ::Function, ::CuArray{Float32,2}) at /net/work/llanger/DDPG_reinforce_v6.jl:121
 [27] replay() at /net/work/llanger/DDPG_reinforce_v6.jl:150
 [28] episode!(::Pendulum; train::Bool, show::Bool) at /net/work/llanger/DDPG_reinforce_v6.jl:185
 [29] top-level scope at /net/work/llanger/DDPG_reinforce_v6.jl:217
 [30] include(::Function, ::Module, ::String) at ./Base.jl:380
 [31] include(::Module, ::String) at ./Base.jl:368
 [32] exec_options(::Base.JLOptions) at ./client.jl:296
 [33] _start() at ./client.jl:506
in expression starting at /net/work/llanger/DDPG_reinforce_v6.jl:215
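If it helps, here is a stripped-down sketch of what I think the failing pattern boils down to, based on the stacktrace (the critic is called with a CuArray{Float64,2} action, presumably because x * ACTION_BOUND promotes the Float32 actor output to Float64). The names here are made up for illustration and I have not verified this as a standalone reproducer:

using Flux, CUDA, Zygote
using Flux: params

bound = 2.0                                              # Float64 scalar, like ACTION_BOUND
net   = Chain(Dense(3, 4, tanh), x -> x * bound) |> gpu  # output becomes a Float64 CuArray
head  = Dense(4, 1) |> gpu                               # Float32 weights, like the critic's Dense

s = rand(Float32, 3, 8) |> gpu

# Differentiating the Float32 Dense applied to the Float64 CuArray is where I suspect
# the Float32(::ForwardDiff.Dual) conversion gets triggered.
gs = gradient(() -> sum(head(net(s))), params(net))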
Code:
using Flux, Printf, Zygote, CUDA
using Flux.Optimise: update!
using BSON: @save, @load
using Statistics: mean
using DataStructures: CircularBuffer
using Distributions: sample, Uniform
using Random
using Reinforce
using Reinforce.PendulumEnv: Pendulum
using Plots
gr()
Random.seed!(123)
#Load game environment
env = Pendulum()
reset!(env)
# ----------------------------- Parameters -------------------------------------
const STATE_SIZE = length(env.state) # state is modeled as struct
const ACTION_SIZE = length(env.a)
const ACTION_BOUND = actions(env, env.state).hi[1]
const MAX_EP = 50_000
const MAX_EP_STEPS = 200
const UPDATE_EVERY = 1
const BATCH_SIZE = 64
const MEM_SIZE = 100_000 
const MIN_EXP_SIZE = 50_000 
const γ = 0.99f0     # discount rate
const τ = 1f-3 # for running average while updating target networks
const η_act = 1f-4   # Learning rate
const η_crit = 1f-3
const L2_DECAY = 0.01f0
# Ornstein-Uhlenbeck Noise params
const μ = 0f0
const θ = 0.15f0
const σ = 0.2f0
# --------------------------------- Memory ------------------------------------
memory = CircularBuffer{Any}(MEM_SIZE)
function getData(batch_size = BATCH_SIZE)
  # Getting data in shape
  minibatch = sample(memory, batch_size)
  x = hcat(minibatch...)
  s      =   hcat(x[1, :]...) |> gpu
  a      =   hcat(x[2, :]...) |> gpu
  r      =   hcat(x[3, :]...) |> gpu
  s′     =   hcat(x[4, :]...) |> gpu
  s_mask = .!hcat(x[5, :]...) |> gpu
  return s, a, r, s′, s_mask
end
# -------------------------------- Action Noise --------------------------------
struct OUNoise
  μ
  θ
  σ
  X
end
ou = OUNoise(μ, θ, σ, zeros(Float32, ACTION_SIZE))
function sample_noise(ou::OUNoise)
  dx     = ou.θ * (ou.μ .- ou.X)
  dx   .+= ou.σ * randn(Float32, length(ou.X))
  ou.X .+= dx
end
# Noise scale
const τ_ = 25
const ϵ  = exp(-1f0 / τ_)
noise_scale = 1f0 #/ ACTION_BOUND
# ----------------------------- Model Architecture -----------------------------
w_init(dims...) = rand(Uniform(-3f-3, 3f-3), dims...)
actor = Chain(Dense(STATE_SIZE, 400, relu),
              Dense(400, 300, relu),
              Dense(300, ACTION_SIZE, tanh, initW=w_init),
              x -> x * ACTION_BOUND        # not working on gpu / added in action()
              ) |> gpu
actor_target = deepcopy(actor)
# Critic model
struct crit
  state_crit
  act_crit
  sa_crit
end
Flux.@functor crit
function (c::crit)(state, action)
  s = c.state_crit(state)
  a = c.act_crit(action)
  c.sa_crit(relu.(s .+ a))
end
Base.deepcopy(c::crit) = crit(deepcopy(c.state_crit),
                              deepcopy(c.act_crit),
                              deepcopy(c.sa_crit))
critic = crit(Chain(Dense(STATE_SIZE, 400, relu), Dense(400, 300)) |> gpu,
              Dense(ACTION_SIZE, 300) |> gpu,
              Dense(300, 1, initW=w_init) |> gpu)
critic_target = deepcopy(critic)
# ---------------------- Param Update Functions --------------------------------
function update_target!(target, model; τ = 1f0)
  for (p_t, p_m) in zip(params(target), params(model))
    p_t .= (1f0 - τ) * p_t .+ τ * p_m
  end
end
function update_model!(model, opt, loss, inp...)
  grads = gradient(()->loss(inp...), params(model))
  update!(opt, params(model), grads)
end
# ---------------------------------- Training ----------------------------------
# Losses
function L2_loss(model)
  l2_loss = sum(map(p->sum(p.^2), params(model)))
  return L2_DECAY * l2_loss
end
loss_crit(y, s, a) = Flux.mse(critic(s, a), y) + L2_loss(critic)
function loss_act(s)
  actions = actor(s |> gpu)
  crit_out = critic(s, actions)
  return -sum(crit_out)  # sum
end
# Optimizers
#Optimiser(WeightDecay(lambda), opt)
opt_crit = ADAM(η_crit)
opt_act  = ADAM(η_act)
function replay()
  s, a, r, s′, s_mask = getData()
  # update Critic
  a′ = actor_target(s′ |> gpu)
  v′ = critic_target(s′, a′)
  y = r .+ γ * v′ .* s_mask   # set v′ to 0 where s′ is a terminal state
  update_model!(critic, opt_crit, loss_crit, y, s, a)
  update_model!(actor, opt_act, loss_act, s)
  # Update Target models
  update_target!(actor_target, actor; τ = τ)
  update_target!(critic_target, critic; τ = τ)
  return nothing
end
# ---------------------------- Helper Functions --------------------------------
# Stores tuple of state, action, reward, next_state, and done
remember(state, action, reward, next_state, done) =
  push!(memory, [(state, action, reward, next_state)..., done])
# Choose action according to policy PendulumPolicy
function action(env; train=true)
  s = Flux.unsqueeze(env.state, 2)
  act_pred = cpu(actor(s |> gpu)) .+                # scale action to bound
             train * noise_scale * sample_noise(ou) # add noise only in training
  return clamp.(act_pred[1], -ACTION_BOUND, ACTION_BOUND) # returns action, assumes ACTION_SIZE=1
end
function episode!(env::Pendulum; train=true, show=false)
  reset!(env)
  total_reward = 0f0
  for ep = 1:MAX_EP_STEPS
    s = env.state
    a = action(env, train=train)
    r, s′ = step!(env, s, a)
    total_reward += r
    if show == true
      sleep(0.001)
      gui(plot(env))
    end
    if train == true
      remember(s, a, r, s′, finished(env, s′))
      finished(env, s′) && break
      if ep % UPDATE_EVERY == 0
        replay()
      end
    end
  end
  return total_reward
end
# -------------------------------- Testing -------------------------------------
# Returns average score over 100 episodes
function test(env::Pendulum; show=false)
  score_mean = 0f0
  for e=1:100
    total_reward = episode!(env, train=false, show=show)
    score_mean += total_reward / 100
  end
  return score_mean
end
# ------------------------------ Training --------------------------------------
# Populate memory with random actions
reset!(env)
Random.seed!(123)
for e=1:MIN_EXP_SIZE
  s = env.state
  a = (2rand(Float32) * ACTION_BOUND) - ACTION_BOUND
  r, s′ = step!(env, s, a)
  remember(s, a, r, s′, finished(env, s′))
end
println("Max steps= $(MAX_EP_STEPS), Max episodes= $(MAX_EP), Actor_target + Action_bounds")
total_reward=Float32[]
score_mean=Float32[]
for e=1:MAX_EP
  global noise_scale
  push!(total_reward, episode!(env, train=true))
  tr = @sprintf "%9.3f" total_reward[e]
  print("Episode: $e | Score: $tr | ")
  push!(score_mean, test(env))
  sm = @sprintf "%9.3f" score_mean[e]
  println("Mean score over 100 test episodes: $sm")
  noise_scale *= ϵ
end