LoadError: `llvmcall` must be compiled to be called when calling `Zygote.jacobian` inside a gradient

Dear all,

I am facing a problem when trying to use `Zygote.jacobian` to train a network.
Here is the function that I am using to get the Jacobian of a network:

"""
    dSCustomNonLinear(model, px, py, pz, t, fdp, Ip)

Return the sum of squared magnitudes of a complex derivative built from the
Jacobian of `model` at `t`.

The Jacobian is obtained with `Zygote.jacobian(model, t)`. Its entries are read
in consecutive index pairs `(i, i + 1)` with `i = 1, 3, 5, …`, and each pair is
folded into one `ComplexF32`:

- real part: `0.5 * (J[i, i] + J[i+1, i+1])`
- imaginary part: `0.5 * (J[i+1, i] - J[i, i+1])`

# Arguments
- `model`: callable handed to `Zygote.jacobian`.
- `t`: point at which the Jacobian is evaluated. The inner method only accepts
  a `CuArray{Float32, 2}`.
- `px`, `py`, `pz`, `fdp`, `Ip`: part of the signature but not read by this
  function.

# Returns
The sum of `abs2` over the complex derivative vector.

NOTE(review): this function launches a hand-written `@cuda` kernel and calls
`Zygote.jacobian` internally. Calling it inside `Flux.withgradient` is reported
to fail; confirm differentiability before using it as a training loss.
"""
function dSCustomNonLinear(model, px::CuArray{T}, py::CuArray{T}, pz::CuArray{T}, t::CuArray{T}, fdp::FieldParams{T}, Ip::T) where {T}
    # `->` creates the anonymous function. The `→` glyph is an ordinary infix
    # operator in Julia and does not build a closure.
    jac = x -> Zygote.jacobian(model, x)

    # GPU kernel: each thread handles one (i, i + 1) index pair of `Jacob` and
    # writes one complex entry of `deriv`.
    function fill_deriv!(deriv, Jacob, NN::Int)
        idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
        i = 2 * (idx - 1) + 1
        # Index `i + 1` is read below, so the whole pair must lie within `NN`.
        # This also keeps `div(i, 2) + 1` within the `div(NN, 2)` entries of
        # `deriv` when `NN` is odd.
        if i + 1 <= NN
            # Float32 literals avoid promoting Float32 entries to Float64.
            real_part = 0.5f0 * (Jacob[i, i] + Jacob[i + 1, i + 1])
            imag_part = 0.5f0 * (Jacob[i + 1, i] - Jacob[i, i + 1])
            deriv[div(i, 2) + 1] = ComplexF32(real_part, imag_part)
        end
        return
    end

    # Build the complex derivative vector from the Jacobian of `model` at `z`.
    # NOTE(review): rows are indexed up to `size(Jacob, 2)`; this presumes the
    # Jacobian has at least as many rows as columns. Verify against the model.
    function complex_derivative(z::CuArray{Float32, 2})
        Jacob = jac(z)[1] |> gpu
        NN = size(Jacob, 2)
        npairs = div(NN, 2)
        deriv = CUDA.fill(ComplexF32(0.0, 0.0), npairs)

        # With no complete pair there is nothing to launch, and a thread count
        # of zero would make `cld` below divide by zero.
        npairs == 0 && return deriv

        threads = min(npairs, 1024)
        blocks = cld(npairs, threads)
        CUDA.@sync @cuda threads=threads blocks=blocks fill_deriv!(deriv, Jacob, NN)

        return deriv
    end

    ∂X = complex_derivative(t)
    return CUDA.sum(abs2.(∂X))
end

When I call the function on its own it works, but when I use it inside the following training function:
"""
    TrainFullSingleNonLinear(modelX, modelSFA, loader, fdp, TP, Iₚ)

Train `modelX` with the `AdaMax` optimiser for `TP.Epochs` epochs, using
`dSCustomNonLinear` as the loss, and keep a copy of the model with the lowest
loss seen on any batch.

# Arguments
- `modelX`: model being optimised; updated in place by `Flux.update!`.
- `modelSFA`: model evaluated on each batch to produce the input of the loss.
  It is not updated here.
- `loader`: iterable yielding tuples whose first element is the batch `p`.
  Rows 1 to 3 of `p` are passed on to the loss.
- `fdp`, `Iₚ`: forwarded unchanged to `dSCustomNonLinear`.
- `TP`: training parameters; only `TP.Epochs` is read.

# Returns
`(BestModelX, losses, MinLoss)`: the best snapshot of `modelX`, the per-batch
losses as a `Vector{T}`, and the smallest loss seen.
"""
function TrainFullSingleNonLinear(modelX, modelSFA, loader, fdp :: FieldParams{T}, TP :: TrainingParams ,Iₚ::T) where {T}
    BestModelX = deepcopy(modelX)
    opt = Flux.AdaMax()
    optimX = Flux.setup(opt, modelX)
    losses = T[]  # explicitly typed container for the logged losses
    MinLoss = Inf32

    @showprogress color=:blue for epoch in 1:TP.Epochs
        for (p, _) in loader
            # None of these depend on `mX`, so they are computed outside the
            # gradient closure and are not traced by the AD.
            tᵢ = T.(modelSFA(p))
            px, py, pz = p[1, :], p[2, :], p[3, :]
            tin = tᵢ[1:2, :]

            loss, grads = Flux.withgradient(modelX) do mX
                dSCustomNonLinear(mX, px, py, pz, tin, fdp, Iₚ)
            end
            Flux.update!(optimX, modelX, grads[1])
            push!(losses, loss)  # logging, outside gradient context

            # Snapshot the model whenever a batch reaches a new minimum loss.
            if loss < MinLoss
                MinLoss = loss
                BestModelX = deepcopy(modelX)
            end
        end
        println("")
        println("Epoch: $epoch, Min Loss Full: $MinLoss")
    end
    return BestModelX, losses, MinLoss
end

I got the following error:
ERROR: LoadError: llvmcall must be compiled to be called
Stacktrace:
[1] macro expansion
@ ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0 [inlined]
[2] _pullback(::Zygote.Context{false}, ::Core.IntrinsicFunction, ::String, ::Type{Int64}, ::Type{Tuple{…}}, ::Ptr{Int64})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:87
[3] getindex
@ ./atomics.jl:358 [inlined]
[4] getindex
@ ~/.julia/packages/GPUArrays/8Y80U/src/host/abstractarray.jl:48 [inlined]
[5] _pullback(ctx::Zygote.Context{false}, f::typeof(getindex), args::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[6] getindex
@ ~/.julia/packages/GPUArrays/8Y80U/src/host/abstractarray.jl:72 [inlined]
[7] context
@ ~/.julia/packages/CUDA/Tl08O/src/array.jl:345 [inlined]
[8] fill!
@ ~/.julia/packages/CUDA/Tl08O/src/array.jl:788 [inlined]
[9] _pullback(::Zygote.Context{false}, ::typeof(fill!), ::CuArray{Float32, 2, CUDA.DeviceMemory}, ::Int64)
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[10] _eyelike
@ ~/.julia/packages/Zygote/nsBv0/src/lib/grad.jl:166 [inlined]
[11] _pullback(ctx::Zygote.Context{…}, f::typeof(Zygote._eyelike), args::Base.ReshapedArray{…})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[12] withjacobian
@ ~/.julia/packages/Zygote/nsBv0/src/lib/grad.jl:148 [inlined]
[13] _pullback(::Zygote.Context{…}, ::typeof(Zygote.withjacobian), ::Chain{…}, ::CuArray{…})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[14] _apply(::Function, ::Vararg{Any})
@ Core ./boot.jl:838
[15] adjoint
@ ~/.julia/packages/Zygote/nsBv0/src/lib/lib.jl:203 [inlined]
[16] _pullback
@ ~/.julia/packages/ZygoteRules/M4xmc/src/adjoint.jl:67 [inlined]
[17] jacobian
@ ~/.julia/packages/Zygote/nsBv0/src/lib/grad.jl:128 [inlined]
[18] _pullback(::Zygote.Context{…}, ::typeof(Zygote.jacobian), ::Chain{…}, ::CuArray{…})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[19] #102
@ /mnt/CAPTAIN_HARLOCK/RECHERCHE/JULIA/MY_COUNTER_ROTATING/MyCounterRotating/src/action.jl:251 [inlined]
[20] _pullback(ctx::Zygote.Context{…}, f::MyCounterRotating.var"#102#103"{…}, args::CuArray{…})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[21] complex_derivative
@ /mnt/CAPTAIN_HARLOCK/RECHERCHE/JULIA/MY_COUNTER_ROTATING/MyCounterRotating/src/action.jl:265 [inlined]
[22] _pullback(ctx::Zygote.Context{…}, f::MyCounterRotating.var"#complex_derivative#105"{…}, args::CuArray{…})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[23] dSCustomNonLinear
@ /mnt/CAPTAIN_HARLOCK/RECHERCHE/JULIA/MY_COUNTER_ROTATING/MyCounterRotating/src/action.jl:277 [inlined]
[24] #100
@ /mnt/CAPTAIN_HARLOCK/RECHERCHE/JULIA/MY_COUNTER_ROTATING/MyCounterRotating/src/neural_nets.jl:605 [inlined]
[25] _pullback(ctx::Zygote.Context{…}, f::MyCounterRotating.var"#100#101"{…}, args::Chain{…})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface2.jl:0
[26] pullback(f::Function, cx::Zygote.Context{false}, args::Chain{Tuple{…}})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface.jl:90
[27] pullback
@ ~/.julia/packages/Zygote/nsBv0/src/compiler/interface.jl:88 [inlined]
[28] withgradient(f::Function, args::Chain{Tuple{…}})
@ Zygote ~/.julia/packages/Zygote/nsBv0/src/compiler/interface.jl:205

Thanks a lot for your help, because I am really stuck…

From a quick look you are using a CUDA.jl kernel, i.e. using @cuda.

Zygote.jl won’t work with that. Enzyme.jl can differentiate through kernels but integrating it with other code is a little tricky, for example requiring you to call Enzyme from within a chain rule.

Thanks I will try Enzyme.