ERROR: this intrinsic must be compiled to be called

Hello,

For the past two weeks I have been struggling with a Flux model that I wrote. Since I am new to Flux, Zygote, and CuArrays, I cannot tell what the actual problem is. Below is an MWE that reproduces the issue I am facing.

Could someone please help?

using Flux
using CUDA
using Flux: glorot_uniform
using Statistics: mean


CUDA.allowscalar(false); # disallow scalar (element-by-element) indexing of GPU arrays


"""
    Enc(filter, stride, in, out, pad)
    Enc(rConv::Chain, iConv::Chain)

Encoder layer holding two convolutional branches, `rConv` and `iConv`.

The five-argument constructor builds both branches with the same layout: a `Conv`
layer (`leakyrelu` activation, `glorot_uniform` initialisation, the given `stride`
and `pad`) followed by `BatchNorm(out, relu)`. The two-argument constructor wraps
chains that have already been built.
"""
mutable struct Enc
    rConv::Chain
    iConv::Chain

    function Enc(filter, stride, in, out, pad)
        # Every call creates a fresh chain, so the two branches never share weights.
        branch() = Chain(
            Conv(filter, in=>out, leakyrelu; init=glorot_uniform, stride=stride, pad=pad),
            BatchNorm(out, relu),
        )
        real_branch = branch()
        imag_branch = branch()
        return new(real_branch, imag_branch)
    end

    Enc(rConv::Chain, iConv::Chain) = new(rConv, iConv)
end
Flux.@functor Enc

# Apply the encoder to a complex-valued array `x`.
#
# `enc.rConv` is run on `real(x)` and `enc.iConv` on `imag(x)`. The real part of the
# result is the difference of the two branch outputs, the imaginary part is their
# sum, and both are recombined element-wise with `complex`.
function (enc::Enc)(x)
    rC = enc.rConv(real(x))
    iC = enc.iConv(imag(x))
    # Both combinations must be formed from the untouched branch outputs. Overwriting
    # `rC` with the difference first would turn the "sum" into (rC - iC) + iC == rC.
    # NOTE(review): a full complex convolution would also use the cross terms
    # rConv(imag(x)) and iConv(real(x)) — confirm the intended formula.
    realpart = rC - iC
    imagpart = rC + iC
    return complex.(realpart, imagpart)
end

"""
    multistft(spectrogram, framelen=1024, hopsize=div(framelen, 2))

Turn a complex spectrogram back into time-domain signals by windowed overlap-add.

# Arguments
- `spectrogram::CuArray{T,4}`: complex GPU array whose dimensions are read as
  `(freqbins, numframes, channels, samples)`.
- `framelen::Int`: output length requested from `irfft` for every frame.
- `hopsize::Int`: number of zero rows by which the even-indexed frames are shifted
  against the odd-indexed ones.

# Returns
A 2-D array with one column per sample: the overlap-added windowed inverse
transforms divided by the overlap-added squared window.
"""
function multistft(spectrogram::CuArray{T, 4},
                    framelen::Int=1024,
                    hopsize::Int=div(framelen, 2)) where T <: Complex

    freqbins, numframes, channels, samples = size(spectrogram)

    # Append one all-zero frame when the frame count is odd, so that the odd- and
    # even-indexed frame sets below have the same size. `framelen` is unaffected.
    if isodd(numframes)
        padframe = CUDA.zeros(eltype(spectrogram), freqbins, 1, channels, samples)
        spectrogram = hcat(spectrogram, padframe)
        numframes += 1
    end

    # Hann window of length `framelen`, replicated over frames, channels and samples.
    hann   = Float32.(.5 .* (1 .- cos.(2 .* pi .* collect(0:framelen - 1)/(framelen - 1))))
    window = CUDA.ones(Float32, (framelen, numframes, channels, samples)) .* CUDA.CuArray(hann)
    # Squared window plus a small constant that keeps the final division finite; a
    # scalar broadcast avoids allocating a constant array just to add it.
    windows = (window.^2) .+ Float32(1.0e-8)

    # Zero block used to shift the even-indexed frames by `hopsize` rows (and to pad
    # the odd-indexed frames to the same total length).
    gap = CUDA.zeros(Float32, hopsize, samples)

    odds   = Flux.flatten(windows[:, 1:2:end, :, :])
    evens  = Flux.flatten(windows[:, 2:2:end, :, :])
    winsum = vcat(odds, gap) .+ vcat(gap, evens)

    wr_odd  = window[:, 1:2:end, :, :] .* CUDA.CUFFT.irfft(spectrogram[:, 1:2:end, :, :], framelen, 1)
    wr_even = window[:, 2:2:end, :, :] .* CUDA.CUFFT.irfft(spectrogram[:, 2:2:end, :, :], framelen, 1)

    reconstructed = vcat(Flux.flatten(wr_odd), gap) .+ vcat(gap, Flux.flatten(wr_even))

    return reconstructed ./ winsum
end


# this loss is user-defined
"""
    wsdrLoss(x, ŷ, y; ϵ=1e-8)

Weighted SDR loss. `x` (input), `ŷ` (prediction) and `y` (ground truth) are complex
spectrograms; each one is first converted with `multistft`.

With `z = x - y` and `ẑ = x - ŷ`, the loss for every sample is
`α * sdr(ŷ, y) + (1 - α) * sdr(ẑ, z)` where `α = Σy² / (Σy² + Σz² + ϵ)` and the sums
run over the first dimension. The returned value is the mean over all samples.
"""
function wsdrLoss(x, ŷ, y; ϵ=1e-8)

    x = multistft(x)
    ŷ = multistft(ŷ)
    y = multistft(y)

    z = x .- y
    ẑ = x .- ŷ

    # Keep the per-sample energies as 1×samples rows so they line up with the
    # 1×samples rows returned by `sdr`. Flattening them to vectors would broadcast
    # into a samples×samples matrix as soon as there is more than one sample.
    nd  = sum(y.^2; dims=1)
    dom = sum(z.^2; dims=1)

    # A scalar broadcasts fine; no constant GPU array is needed for ϵ.
    aux = nd ./ (nd .+ dom .+ Float32(ϵ))
    wSDR = aux .* sdr(ŷ, y) .+ (1 .- aux) .* sdr(ẑ, z)
    return mean(wSDR)
end

"""
    multiNorm(A; dims)

Euclidean norm of `A` along `dims`: the square root of the sum of `A .* conj(A)`
(squared magnitudes) over those dimensions. The reduced dimensions are kept with
length 1, as with `sum(...; dims)`.
"""
multiNorm(A; dims) = sqrt.(sum(real(A .* conj(A)), dims=dims))

"""
    sdr(ypred, ygold; ϵ=1e-8)

Negative normalised correlation between `ypred` and `ygold` along the first
dimension: `-(Σ ygold .* ypred) / (‖ygold‖ * ‖ypred‖ + ϵ)`. The first dimension of
the result has length 1.
"""
function sdr(ypred, ygold; ϵ=1e-8)
    num = sum(ygold .* ypred, dims=1)
    den = multiNorm(ygold, dims=1) .* multiNorm(ypred, dims=1)
    # ϵ is added as a scalar; broadcasting makes a constant array unnecessary.
    return -(num ./ (den .+ Float32(ϵ)))
end

After defining the model and the loss function, I create the model with proper inputs:

# Random complex test data on the GPU; multistft reads the four dimensions as
# (freqbins, numframes, channels, samples).
x = CUDA.rand(ComplexF32, 513, 321, 1, 1); # input
y = CUDA.rand(ComplexF32, 513, 321, 1, 1); # ground truth

# Dummy model: a single Enc layer (1×1 filter, stride 1, one input and one output
# channel, no padding) moved to the GPU.
encoder = Chain(Enc((1, 1), (1, 1), 1, 1, (0, 0))) |> gpu

# The loss function takes three arguments: the input, the prediction
# ŷ = encoder(x), and the ground truth.

wsdrLoss(x, encoder(x), y) ; # forward pass only: works fine

# Differentiates the loss with respect to its three array arguments.
gradient(wsdrLoss, x, encoder(x), y) # fails with: ERROR: this intrinsic must be compiled to be called

When taking the gradients I get:

ERROR: this intrinsic must be compiled to be called
Stacktrace:
 [1] macro expansion at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0 [inlined]
 [2] _pullback(::Zygote.Context, ::Core.IntrinsicFunction, ::String, ::Type{UInt64}, ::Type{Tuple{Ptr{UInt64}}}, ::Ptr{UInt64}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:12
 [3] getindex at ./atomics.jl:358 [inlined]
 [4] _pullback(::Zygote.Context, ::typeof(getindex), ::Base.Threads.Atomic{UInt64}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [5] macro expansion at /opt/.julia/packages/CUDA/YeS8q/lib/utils/call.jl:37 [inlined]
 [6] macro expansion at /opt/.julia/packages/CUDA/YeS8q/lib/cudadrv/libcuda.jl:1641 [inlined]
 [7] macro expansion at /opt/.julia/packages/CUDA/YeS8q/lib/cudadrv/error.jl:102 [inlined]
 [8] cuOccupancyMaxPotentialBlockSize at /opt/.julia/packages/CUDA/YeS8q/lib/utils/call.jl:93 [inlined]
 [9] _pullback(::Zygote.Context, ::typeof(CUDA.cuOccupancyMaxPotentialBlockSize), ::Base.RefValue{Int32}, ::Base.RefValue{Int32}, ::CuFunction, ::Ptr{Nothing}, ::Int64, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [10] #launch_configuration#606 at /opt/.julia/packages/CUDA/YeS8q/lib/cudadrv/occupancy.jl:58 [inlined]
 [11] _pullback(::Zygote.Context, ::CUDA.var"#launch_configuration##kw", ::NamedTuple{(:max_threads,),Tuple{Int64}}, ::typeof(launch_configuration), ::CuFunction) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [12] #launch_heuristic#853 at /opt/.julia/packages/CUDA/YeS8q/src/gpuarrays.jl:26 [inlined]
 [13] adjoint at /opt/.julia/packages/Zygote/xBjHw/src/lib/lib.jl:188 [inlined]
 [14] _pullback at /opt/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [15] launch_heuristic at /opt/.julia/packages/CUDA/YeS8q/src/gpuarrays.jl:17 [inlined]
 [16] adjoint at /opt/.julia/packages/Zygote/xBjHw/src/lib/lib.jl:188 [inlined]
 [17] _pullback at /opt/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [18] #gpu_call#1 at /opt/.julia/packages/GPUArrays/jhRU7/src/device/execution.jl:61 [inlined]
 [19] _pullback(::Zygote.Context, ::GPUArrays.var"##gpu_call#1", ::CuArray{Complex{Float32},4}, ::Nothing, ::Nothing, ::Nothing, ::Nothing, ::typeof(GPUArrays.gpu_call), ::GPUArrays.var"#4#5", ::CuArray{Complex{Float32},4}, ::Complex{Float32}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [20] adjoint at /opt/.julia/packages/Zygote/xBjHw/src/lib/lib.jl:188 [inlined]
 [21] _pullback at /opt/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:47 [inlined]
 [22] gpu_call at /opt/.julia/packages/GPUArrays/jhRU7/src/device/execution.jl:46 [inlined]
 [23] fill! at /opt/.julia/packages/GPUArrays/jhRU7/src/host/construction.jl:5 [inlined]
 [24] _pullback(::Zygote.Context, ::typeof(fill!), ::CuArray{Complex{Float32},4}, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [25] zeros at /opt/.julia/packages/CUDA/YeS8q/src/array.jl:348 [inlined]
 [26] _pullback(::Zygote.Context, ::typeof(CUDA.zeros), ::Type{Complex{Float32}}, ::Int64, ::Int64, ::Int64, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [27] multistft at ./REPL[29]:6 [inlined]
 [28] _pullback(::Zygote.Context, ::typeof(multistft), ::CuArray{Complex{Float32},4}, ::Int64, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [29] multistft at ./REPL[29]:4 [inlined] (repeats 2 times)
 [30] |> at ./operators.jl:834 [inlined]
 [31] #wsdrLoss#7 at ./REPL[17]:2 [inlined]
 [32] _pullback(::Zygote.Context, ::var"##wsdrLoss#7", ::Float64, ::typeof(wsdrLoss), ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [33] wsdrLoss at ./REPL[17]:2 [inlined]
 [34] _pullback(::Zygote.Context, ::typeof(wsdrLoss), ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [35] _pullback(::Function, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface.jl:38
 [36] pullback(::Function, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::Vararg{CuArray{Complex{Float32},4},N} where N) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface.jl:44
 [37] gradient(::Function, ::CuArray{Complex{Float32},4}, ::Vararg{CuArray{Complex{Float32},4},N} where N) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface.jl:53
 [38] top-level scope at REPL[31]:1

Looks like Zygote is missing an adjoint for CUDA.zeros. You could try defining

# Tell Zygote not to trace into CUDA.zeros: call it directly and return `nothing`
# (no gradient) for each of its arguments in the pullback.
Zygote.@adjoint CUDA.zeros(x...) = CUDA.zeros(x...), _ -> map(_ -> nothing, x)

and then try again. If that works for you, would you mind opening an issue for CUDA.jl?

2 Likes

OK, I got it working. In addition to the Zygote.@adjoint for CUDA.zeros(x...), I also had to define Zygote.@adjoint methods for CUDA.ones and CUDA.fill.

If that works for you, would you mind opening an issue for CUDA.jl?

Sure.

thank you

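That’s good to hear! You want to be a bit careful with CUDA.fill, since the derivative with respect to the first argument is non-zero. You could just copy the definition for Base.fill in Zygote, though:

@adjoint fill(x::Real, dims...) = fill(x, dims...), Δ->(sum(Δ), map(_->nothing, dims)...)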

(In your case this doesn’t matter, since in your example the first argument is a constant, but this might cause wrong derivatives later on, which is often difficult to track down)

@simeonschaub the problem persists when using @adjoint fill(x::Real, dims...) = fill(x, dims...), Δ->(sum(Δ), map(_->nothing, dims)...).

ERROR: this intrinsic must be compiled to be called
Stacktrace:
 [1] macro expansion at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0 [inlined]
 [2] _pullback(::Zygote.Context, ::Core.IntrinsicFunction, ::String, ::Type{UInt64}, ::Type{Tuple{Ptr{UInt64}}}, ::Ptr{UInt64}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:12
 [3] getindex at ./atomics.jl:358 [inlined]
 [4] _pullback(::Zygote.Context, ::typeof(getindex), ::Base.Threads.Atomic{UInt64}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [5] macro expansion at /opt/.julia/packages/CUDA/YeS8q/lib/utils/call.jl:37 [inlined]
 [6] macro expansion at /opt/.julia/packages/CUDA/YeS8q/lib/cudadrv/libcuda.jl:668 [inlined]
 [7] macro expansion at /opt/.julia/packages/CUDA/YeS8q/lib/cudadrv/error.jl:102 [inlined]
 [8] cuMemsetD32_v2 at /opt/.julia/packages/CUDA/YeS8q/lib/utils/call.jl:93 [inlined]
 [9] _pullback(::Zygote.Context, ::typeof(CUDA.cuMemsetD32_v2), ::CuPtr{UInt32}, ::UInt32, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [10] #set!#5 at /opt/.julia/packages/CUDA/YeS8q/lib/cudadrv/memory.jl:373 [inlined]
 [11] set! at /opt/.julia/packages/CUDA/YeS8q/lib/cudadrv/memory.jl:366 [inlined]
 [12] _pullback(::Zygote.Context, ::typeof(CUDA.Mem.set!), ::CuPtr{UInt32}, ::UInt32, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [13] fill! at /opt/.julia/packages/CUDA/YeS8q/src/array.jl:365 [inlined]
 [14] _pullback(::Zygote.Context, ::typeof(fill!), ::CuArray{Float32,4}, ::Float32) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [15] fill at /opt/.julia/packages/CUDA/YeS8q/src/array.jl:352 [inlined]
 [16] _pullback(::Zygote.Context, ::typeof(CUDA.fill), ::Float32, ::Int64, ::Int64, ::Int64, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [17] multistft at ./REPL[12]:13 [inlined]
 [18] _pullback(::Zygote.Context, ::typeof(multistft), ::CuArray{Complex{Float32},4}, ::Int64, ::Int64) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [19] multistft at ./REPL[12]:5 [inlined] (repeats 2 times)
 [20] |> at ./operators.jl:834 [inlined]
 [21] #wsdrLoss#9 at ./REPL[26]:3 [inlined]
 [22] _pullback(::Zygote.Context, ::var"##wsdrLoss#9", ::Float64, ::typeof(wsdrLoss), ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [23] wsdrLoss at ./REPL[26]:3 [inlined]
 [24] _pullback(::Zygote.Context, ::typeof(wsdrLoss), ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface2.jl:0
 [25] _pullback(::Function, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface.jl:38
 [26] pullback(::Function, ::CuArray{Complex{Float32},4}, ::CuArray{Complex{Float32},4}, ::Vararg{CuArray{Complex{Float32},4},N} where N) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface.jl:44
 [27] gradient(::Function, ::CuArray{Complex{Float32},4}, ::Vararg{CuArray{Complex{Float32},4},N} where N) at /opt/.julia/packages/Zygote/xBjHw/src/compiler/interface.jl:53
 [28] top-level scope at REPL[40]:1

You still need to replace fill with CUDA.fill in that definition.

My bad, I forgot to add CUDA. in front of the fill function.

B.R.

@simeonschaub I still have a gradient problem when using the my_custom_train!() example from the Flux website. Could you help me resolve it? If that is fine with you, I will create another issue and post the link here.

Is this a different error? If so, it’s probably better to open another thread, so it is more discoverable.

1 Like

I cannot train a model. For instance, trying to train the dummy model above:

θ = params(encoder)
opt = ADAM(0.01)
# Differentiate with respect to the model parameters: the forward pass has to run
# inside the closure so that it is traced, and `Flux.update!(opt, θ, ∇)` looks up
# `∇[p]` for every parameter `p` in `θ`, so `∇` must be the `Grads` object returned
# for `θ`. `gradient(wsdrLoss, x, encoder(x), y)[1]` is instead the gradient with
# respect to the input `x`, a plain array that cannot be indexed by a parameter.
∇ = gradient(() -> wsdrLoss(x, encoder(x), y), θ)
Flux.update!(opt, θ, ∇)

gives error(s) :

ERROR: InvalidIRError: compiling kernel getindex_kernel(CUDA.CuKernelContext, CuDeviceArray{Complex{Float32},4,1}, CuDeviceArray{Complex{Float32},4,1}, Tuple{Int64}, CuDeviceArray{Float32,4,1}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to #sprint#355(context, sizehint::Integer, ::typeof(sprint), f::Function, args...) in Base at strings/io.jl:100)