Zygote gradient error with `reduce` on GPU

Hello,

I’m trying to use `reduce` (and `mapreduce` and its variants) in the loss function of a Flux neural network, but Zygote throws an error when the code runs on the GPU (it works fine on the CPU, though). Are these functions unsupported on GPUs for some particular reason?

Here’s a MWE:

julia> gradient(x -> reduce(+, x), rand(10))
([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],)

julia> gradient(x -> reduce(+, x), CUDA.rand(10))
ERROR: `llvmcall` must be compiled to be called
Stacktrace:
  [1] macro expansion
    @ ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0 [inlined]
  [2] _pullback(::Zygote.Context{false}, ::Core.IntrinsicFunction, ::String, ::Type{Int64}, ::Type{Tuple{Ptr{Int64}}}, ::Ptr{Int64})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:9
  [3] _pullback
    @ ./atomics.jl:358 [inlined]
  [4] _pullback(ctx::Zygote.Context{false}, f::typeof(getindex), args::Base.Threads.Atomic{Int64})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
  [5] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/lib/utils/threading.jl:25 [inlined]
  [6] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/lib/utils/threading.jl:24 [inlined]
  [7] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/gpucompiler.jl:5 [inlined]
  [8] _pullback(ctx::Zygote.Context{false}, f::typeof(CUDA.device_properties), args::CuDevice)
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
  [9] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/gpucompiler.jl:49 [inlined]
 [10] _pullback(::Zygote.Context{false}, ::CUDA.var"##CUDACompilerTarget#203", ::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, ::typeof(CUDA.CUDACompilerTarget), ::CuDevice)
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [11] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/gpucompiler.jl:48 [inlined]
 [12] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:303 [inlined]
 [13] _pullback(::Zygote.Context{false}, ::CUDA.var"##cufunction#218", ::Nothing, ::Bool, ::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, ::typeof(cufunction), ::typeof(CUDA.partial_mapreduce_grid), ::Type{Tuple{typeof(identity), typeof(+), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{Float32, 1}}})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [14] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:299 [inlined]
 [15] _pullback(::Zygote.Context{false}, ::typeof(cufunction), ::typeof(CUDA.partial_mapreduce_grid), ::Type{Tuple{typeof(identity), typeof(+), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{Float32, 1}}})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [16] macro expansion
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:102 [inlined]
 [17] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/src/mapreduce.jl:234 [inlined]
 [18] _pullback(::Zygote.Context{false}, ::CUDA.var"##mapreducedim!#282", ::Float32, ::typeof(GPUArrays.mapreducedim!), ::typeof(identity), ::typeof(+), ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [19] _pullback
    @ ~/.julia/packages/CUDA/ZdCxS/src/mapreduce.jl:169 [inlined]
 [20] _pullback(::Zygote.Context{false}, ::GPUArrays.var"#mapreducedim!##kw", ::NamedTuple{(:init,), Tuple{Float32}}, ::typeof(GPUArrays.mapreducedim!), ::typeof(identity), ::typeof(+), ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [21] _pullback
    @ ~/.julia/packages/GPUArrays/g2pOV/src/host/mapreduce.jl:69 [inlined]
 [22] _pullback(::Zygote.Context{false}, ::GPUArrays.var"##_mapreduce#33", ::Colon, ::Nothing, ::typeof(GPUArrays._mapreduce), ::typeof(identity), ::typeof(+), ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [23] _apply(::Function, ::Vararg{Any})
    @ Core ./boot.jl:816
 [24] adjoint
    @ ~/.julia/packages/Zygote/g2w9o/src/lib/lib.jl:203 [inlined]
 [25] _pullback
    @ ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:65 [inlined]
 [26] _pullback
    @ ~/.julia/packages/GPUArrays/g2pOV/src/host/mapreduce.jl:35 [inlined]
 [27] _pullback(::Zygote.Context{false}, ::GPUArrays.var"#_mapreduce##kw", ::NamedTuple{(:dims, :init), Tuple{Colon, Nothing}}, ::typeof(GPUArrays._mapreduce), ::typeof(identity), ::typeof(+), ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [28] _apply(::Function, ::Vararg{Any})
    @ Core ./boot.jl:816
 [29] adjoint
    @ ~/.julia/packages/Zygote/g2w9o/src/lib/lib.jl:203 [inlined]
 [30] _pullback
    @ ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:65 [inlined]
 [31] _pullback
    @ ~/.julia/packages/GPUArrays/g2pOV/src/host/mapreduce.jl:31 [inlined]
 [32] _pullback(::Zygote.Context{false}, ::GPUArrays.var"##mapreduce#31", ::Colon, ::Nothing, ::typeof(mapreduce), ::typeof(identity), ::typeof(+), ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [33] _apply(::Function, ::Vararg{Any})
    @ Core ./boot.jl:816
 [34] adjoint
    @ ~/.julia/packages/Zygote/g2w9o/src/lib/lib.jl:203 [inlined]
 [35] _pullback
    @ ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:65 [inlined]
 [36] _pullback
    @ ~/.julia/packages/GPUArrays/g2pOV/src/host/mapreduce.jl:31 [inlined]
 [37] _pullback(::Zygote.Context{false}, ::typeof(mapreduce), ::typeof(identity), ::typeof(+), ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [38] _pullback (repeats 2 times)
    @ ./reducedim.jl:406 [inlined]
 [39] _pullback
    @ ./REPL[6]:1 [inlined]
 [40] _pullback(ctx::Zygote.Context{false}, f::var"#11#12", args::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
 [41] pullback(f::Function, cx::Zygote.Context{false}, args::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface.jl:44
 [42] pullback
    @ ~/.julia/packages/Zygote/g2w9o/src/compiler/interface.jl:42 [inlined]
 [43] gradient(f::Function, args::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface.jl:96
 [44] top-level scope
    @ REPL[6]:1
 [45] top-level scope
    @ ~/.julia/packages/CUDA/ZdCxS/src/initialization.jl:155

My versions:

julia> VERSION
v"1.8.5"

(@v1.8) pkg> status
Status `~/.julia/environments/v1.8/Project.toml`
  [052768ef] CUDA v4.0.1
  [e88e6eb3] Zygote v0.6.55

There is no AD rule defined for `reduce`, so Zygote falls back to differentiating through the CUDA.jl code itself, which fails. For more on this, see “Chainrule for CUDA reduction” (Issue #666 in JuliaDiff/ChainRules.jl on GitHub).

1 Like

Oh, thanks.

Is there any particular reason why `reduce` and its variants are not supported on the GPU? I would have thought these functions are pretty basic and widely used, no?

And how hard is it to write a rule for it? I have no idea how to write one, but could give it a shot.

Basic? Not really. You can have a look through some of the existing `mapreduce` functionality in the GPU libraries to see what subtleties are involved. Also, just because `reduce` is achievable on the GPU (for a select list of functions) does not mean the AD rule is. Have a look at the reduction rules in ChainRules.jl to see how much work is required for those.

2 Likes