Flux GPU Error with Zygote

Hello,

I want to build an encoder for a text classification model, but somehow I get errors when I call the Flux.train! method. Here is the MWE:

using Flux
using Flux: @functor

# Since this is a text-based model, an embedding matrix is needed - it can be pretrained or initialized randomly.
struct Embedding{T}
    WE::T
end

@functor Embedding
(e::Embedding)(x::AbstractVector) = e.WE[:, x]   # look up one embedding column per word index

# The model uses RNNs, so wrap the embedding and the recurrent chain together.
struct UniDirectional{E, C}
    embedding::E # the embedding layer also needs to be updated
    chain::C
end

@functor UniDirectional
function (model::UniDirectional)(idx::AbstractVector)
    embeds = model.embedding(idx)
    return model.chain(embeds)
end
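
To make the intended shapes concrete, here is a quick sanity check of the embedding lookup and one forward step; the sizes below are made up just for this illustration and are not the ones used further down:

# toy sizes, only for this shape check
e = Embedding(rand(Float32, 8, 100))                             # 8-dimensional vectors, 100-word vocabulary
m = UniDirectional(e, Chain(GRUv3(8, 5), Dense(5, 2), softmax))

idx = rand(1:100, 16)     # word indices for one time step, batch of 16
size(e(idx))              # (8, 16): one embedding column per batch element
size(m(idx))              # (2, 16): class scores for each batch element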

Now we need to define the loss function:

function apply(model, x)
    Flux.reset!(model)    # reset the GRU hidden state before each sequence
    time = size(x, 1)     # number of time steps (rows of x)
    last(map(model, [view(x, t, :) for t in 1:time]))   # feed one time step at a time, keep the last output
end

using Flux.Losses: crossentropy

function loss(x, y)
    yhat = apply(model, x)   # `model` is captured from the enclosing (global) scope
    return crossentropy(yhat, y)
end
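
`apply` feeds the model one row of x at a time, i.e. one time step across the whole batch, and keeps only the last output for the loss. A small illustration of the slicing, again with made-up sizes:

x = rand(1:100, 3, 4)                           # 3 time steps, batch of 4
steps = [view(x, t, :) for t in 1:size(x, 1)]
length(steps)                                   # 3: one entry per time step
size(steps[1])                                  # (4,): word indices of time step 1 for the whole batch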

Dummy inputs and outputs:

voc_size = Int(10e3)
dims = 300
batch_size = 64
seq_length = 45 # each sentence has the same length
embedding = rand(Float32, dims, voc_size) # assuming 10k words in the vocabulary, one column per word
X_train = [rand(1:voc_size, 1, batch_size) for i in 1:seq_length]
X_train = reduce(vcat, X_train) # seq_length × batch_size matrix holding only the word indices;
# the actual lookup happens inside the `apply` method
Y_train = Flux.onehotbatch(rand(0:1, batch_size), 0:1)

So our inputs and outputs (X_train and Y_train) have the same batch size.
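
As a quick check that the dummy data has the layout the model expects (just a sanity check, not part of the training loop):

size(X_train)   # (45, 64): seq_length × batch_size word indices
size(Y_train)   # (2, 64): 2 classes × batch_size one-hot targets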

Finally, we build the model:

dims = 300
embedding  = Embedding(embedding)
modelRNN   = Chain(GRUv3(dims, 50), Dense(50, 2), Dropout(0.25), softmax)
model = UniDirectional(embedding, modelRNN) 
opt = ADAM()
data = [(X_train, Y_train)]
Flux.train!(loss, params(model), data, opt)

Everything works fine until I move the model to the GPU.

model = model |> gpu
data = [(X_train, Y_train |> gpu)]
Flux.train!(loss, params(model), data, opt)

Then I get an error saying:

ERROR: InvalidIRError: compiling kernel rand!(CuDeviceMatrix{Float32, 1}, UInt32, UInt32) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to CUDA.Philox2x32{R}() where R in CUDA at 

    @ Zygote ~/.julia/packages/Zygote/xGkZ5/src/lib/array.jl:193
 [40] _pullback
    @ ~/.julia/packages/Zygote/xGkZ5/src/lib/array.jl:235 [inlined]
 [41] _pullback
    @ ./REPL[96]:5 [inlined]
 [42] _pullback(::Zygote.Context{true}, ::typeof(apply), ::UniDirectional{WordEmbedding{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Chain{Tuple{Flux.Recur{Flux.GRUv3Cell{CuArray{Float32, 2,
CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CuArray{F
loat32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}, typeof(softmax)}}}, ::Matrix{Int64})
    @ Zygote ~/.julia/packages/Zygote/xGkZ5/src/compiler/interface2.jl:0
 [43] _pullback
    @ ./REPL[98]:2 [inlined]
 [44] _pullback(::Zygote.Context{true}, ::typeof(loss), ::Matrix{Int64}, ::Flux.OneHotArray{UInt32, 2, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}})
    @ Zygote ~/.julia/packages/Zygote/xGkZ5/src/compiler/interface2.jl:0
 [45] _apply
    @ ./boot.jl:814 [inlined]
 [46] adjoint
    @ ~/.julia/packages/Zygote/xGkZ5/src/lib/lib.jl:203 [inlined]
 [47] _pullback
    @ ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:65 [inlined]
 [48] _pullback
    @ ~/.julia/packages/Flux/KkC79/src/optimise/train.jl:120 [inlined]
 [49] _pullback(::Zygote.Context{true}, ::Flux.Optimise.var"#37#40"{typeof(loss), Tuple{Matrix{Int64}, Flux.OneHotArray{UInt32, 2, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}}}})
    @ Zygote ~/.julia/packages/Zygote/xGkZ5/src/compiler/interface2.jl:0
 [50] pullback(f::Function, ps::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
    @ Zygote ~/.julia/packages/Zygote/xGkZ5/src/compiler/interface.jl:373
 [51] gradient(f::Function, args::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
    @ Zygote ~/.julia/packages/Zygote/xGkZ5/src/compiler/interface.jl:96
 [52] macro expansion
    @ ~/.julia/packages/Flux/KkC79/src/optimise/train.jl:119 [inlined]
 [53] macro expansion
    @ ~/.julia/packages/ProgressLogging/6KXlp/src/ProgressLogging.jl:328 [inlined]
 [54] train!(loss::Function, ps::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}}, data::Vector{Tuple{Matrix{Int64}, Flux.OneHotArray{UInt32, 2, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}}
}}, opt::Adam; cb::Flux.Optimise.var"#38#41")
    @ Flux.Optimise ~/.julia/packages/Flux/KkC79/src/optimise/train.jl:117
 [55] train!(loss::Function, ps::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}}, data::Vector{Tuple{Matrix{Int64}, Flux.OneHotArray{UInt32, 2, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}}
}}, opt::Adam)
    @ Flux.Optimise ~/.julia/packages/Flux/KkC79/src/optimise/train.jl:114
 [56] top-level scope
    @ REPL[159]:1

Could anyone shed some light on this?

B.R.

Looks like https://github.com/JuliaGPU/CUDA.jl/issues/1508

After following that link, it now works, strangely enough. Thank you.
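
For anyone hitting the same trace: the kernel that fails to compile is CUDA's native rand! (the Philox2x32 generator), and it is reached through the Dropout layer - note the Dropout{Float64, Colon, CUDA.RNG} in the model type printed in the stack trace. Here is a check I would run to confirm that Dropout is the trigger; this is only a sketch (the *_check names are made up here), not the actual fix from the linked issue:

# Option 1: rebuild the chain without Dropout and see whether the GPU run then compiles
modelRNN_check = Chain(GRUv3(dims, 50), Dense(50, 2), softmax)
model_check    = UniDirectional(embedding, modelRNN_check) |> gpu
check_loss(x, y) = Flux.Losses.crossentropy(apply(model_check, x), y)
Flux.train!(check_loss, Flux.params(model_check), data, opt)

# Option 2: keep the original model but put it in test mode, so Dropout (and its GPU RNG) is skipped
Flux.testmode!(model)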

By the way, I use the exact same model for regression, i.e. the output data are just random vectors, and with that setup everything works as expected; nothing breaks. But when Y_train is a one-hot encoding, this error appears. Maybe this will help to fix the bug.
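
For comparison, here is roughly what the regression variant looks like; the mse loss and the 2×batch target shape are only illustrative of that setup:

using Flux.Losses: mse

Y_reg = rand(Float32, 2, batch_size) |> gpu          # random target vectors instead of one-hot labels
loss_reg(x, y) = mse(apply(model, x), y)
Flux.train!(loss_reg, Flux.params(model), [(X_train, Y_reg)], opt)   # per the description above, this variant trains without the error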

B.R.