Understanding and optimizing Enzyme.jl Reverse AD on CUDA

yolhan_mannes · October 23, 2025, 11:01am

A weird one

ERROR: type Const has no field dval
Stacktrace:
  [1] getproperty
    @ .\Base.jl:49 [inlined]
  [2] augmented_primal
    @ C:\Users\yolha\.julia\packages\CUDA\G7Cnt\ext\EnzymeCoreExt.jl:99 [inlined]
  [3] map
    @ .\tuple.jl:357 [inlined]
  [4] map (repeats 2 times)
    @ .\tuple.jl:358 [inlined]
  [5] macro expansion
    @ C:\Users\yolha\.julia\packages\CUDA\G7Cnt\src\compiler\execution.jl:110 [inlined]
  [6] compute_logLs!
    @ c:\Users\yolha\Desktop\juju_tests\Nouveau dossier\main2.jl:46 [inlined]
  [7] diffejulia_compute_logLs__28677wrap
    @ c:\Users\yolha\Desktop\juju_tests\Nouveau dossier\main2.jl:0
  [8] macro expansion
    @ C:\Users\yolha\.julia\packages\Enzyme\LJjsP\src\compiler.jl:5873 [inlined]
  [9] enzyme_call
    @ C:\Users\yolha\.julia\packages\Enzyme\LJjsP\src\compiler.jl:5407 [inlined]
 [10] CombinedAdjointThunk
    @ C:\Users\yolha\.julia\packages\Enzyme\LJjsP\src\compiler.jl:5293 [inlined]
 [11] autodiff
    @ C:\Users\yolha\.julia\packages\Enzyme\LJjsP\src\Enzyme.jl:521 [inlined]
 [12] autodiff
    @ C:\Users\yolha\.julia\packages\Enzyme\LJjsP\src\Enzyme.jl:562 [inlined]
 [13] autodiff
    @ C:\Users\yolha\.julia\packages\Enzyme\LJjsP\src\Enzyme.jl:534 [inlined]
 [14] per_sample_gradients(thetas::CuArray{Float32, 2, CUDA.DeviceMemory}, x::CuArray{Float32, 2, CUDA.DeviceMemory})
    @ Main c:\Users\yolha\Desktop\juju_tests\Nouveau dossier\main2.jl:59
 [15] top-level scope
    @ c:\Users\yolha\Desktop\juju_tests\Nouveau dossier\main2.jl:83

mwe :

using CUDA
using Enzyme

function log_likelihood_kernel!(logLs::CuDeviceVector{Float32},
                                thetas::CuDeviceMatrix{Float32},
                                x::CuDeviceMatrix{Float32},
                                N::Int,
                                batch_size::Int)
    # compute the global thread index (1D grid)
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # only compute if thread corresponds to a sample in the batch
    if idx <= batch_size
        # extract parameters for this thread/sample
        mu1     = thetas[1, idx]
        mu2     = thetas[2, idx]
        sigma_1 = thetas[3, idx]
        sigma_2 = thetas[4, idx]

        # compute inverses of standard deviations once to avoid repeated division
        inv_sigma1 = 1f0 / (sigma_1)
        inv_sigma2 = 1f0 / (sigma_2)

        # accumulate quadratic term of Gaussian log-likelihood
        quad = 0.0f0
        @inbounds for i in 1:N
            # compute standardized residuals for both dimensions
            d1 = (x[1, i] - mu1) * inv_sigma1
            d2 = (x[2, i] - mu2) * inv_sigma2
            quad += d1*d1 + d2*d2
        end

        # write the log-likelihood for this thread/sample
        # formula: -0.5 * quad - N*log(sigma1*sigma2) - N*0.5*log(2π)
        logLs[idx] = -0.5f0 * quad - N * (log(sigma_1) + log(sigma_2)) - N * log(2f0 * Float32(pi))
    end
    return
end

function compute_logLs!(logLs::CuArray{Float32,1},
                        thetas::CuArray{Float32,2},
                        x::CuArray{Float32,2})
    N = size(x, 2)
    batch_size = size(thetas, 2)
    threads = 256
    blocks = cld(batch_size, threads)
    @cuda threads=threads blocks=blocks log_likelihood_kernel!(logLs, thetas, x, N, batch_size)
    return nothing  # mutating style: result is in `logLs`
end

function per_sample_gradients(thetas::CuArray{Float32,2}, x::CuArray{Float32,2})
    batch_size = size(thetas, 2)
    logLs = CUDA.zeros(Float32, batch_size)

    # allocate gradients
    dlogLs = CUDA.ones(Float32, batch_size)
    dthetas = CUDA.zeros(Float32, size(thetas))

    # differentiate compute_logLs! w.r.t. thetas
    Enzyme.autodiff(
        Enzyme.Reverse,
        compute_logLs!,                                   # function to differentiate
        Enzyme.DuplicatedNoNeed(logLs, dlogLs),                 # we need to differentiate only wrt thetas
        Enzyme.Duplicated(thetas, dthetas),               # input and gradient buffer
        Enzyme.Const(x) # x is constant
    )

    return dthetas  # this is 4×batch_size, same shape as thetas
end


### Example usage on GPU
# 3 samples, 4 parameters each
thetas = cu([0.5f0 0.6f0 0.7f0;
             0.5f0 0.4f0 0.3f0;
             0.1f0 0.2f0 0.15f0;
             0.1f0 0.2f0 0.25f0])

# mock data
x = cu([0.0f0 0.5f0 0.3f0 0.4f0 0.5f0;
        0.6f0 0.7f0 0.8f0 0.9f0 1.0f0])

# compute gradients
dthetas = per_sample_gradients(thetas, x)

Topic		Replies	Views
Speeding up gradient of logpdf Machine Learning question , performance , autodiff	19	813	February 12, 2024
Any faster way of computing small gradients? Performance zygote , forwarddiff , symbolics , autodiff	21	2076	August 11, 2022
What's the state of Automatic Differentiation in Julia January 2023? General Usage autodiff	41	12467	June 21, 2024
CUDA gradient of gamma PDF GPU	4	1078	April 15, 2020
Automatic Differentiation Slow (Slower than Finite-Differences) Optimization (Mathematical)	18	3560	May 31, 2018

Understanding and optimizing Enzyme.jl Reverse AD on CUDA

Related topics