thanks @maxfreu I tried
using CUDA, Enzyme, Test
function mul_kernel(A)
shared=CuDynamicSharedArray(Float32, length(A))
i = threadIdx().x
if i <= length(A)
shared[i] = A[i] * A[i]
A[i] = shared[i]
end
return nothing
end
function grad_mul_kernel(A, dA)
Enzyme.autodiff_deferred(mul_kernel, Const, Duplicated(A, dA))
return nothing
end
A = CUDA.ones(64,)
@cuda threads=length(A) shmem=64*4 mul_kernel(A )
A = CUDA.ones(64,)
dA = similar(A)
dA .= 1
@cuda threads=length(A) shmem=64*4 grad_mul_kernel(A, dA)
@test all(dA .== 2)
and got
ERROR: GPUCompiler.InvalidIRError(GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}(MethodInstance for grad_mul_kernel(::CuDeviceVector{Float32, 1}, ::CuDeviceVector{Float32, 1}), GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}(GPUCompiler.PTXCompilerTarget(v"8.6.0", v"7.5.0", true, nothing, nothing, nothing, nothing, false, nothing, nothing), CUDA.CUDACompilerParams(v"8.6.0", v"8.2.0"), true, nothing, :specfunc, false, 2), 0x0000000000007b37), Tuple{String, Vector{Base.StackTraces.StackFrame}, Any}[("dynamic function invocation", [grad_mul_kernel at get_lin_synth_dat.jl:143], EnzymeCore.autodiff_deferred)])
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/validation.jl:147
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:460 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:459 [inlined]
[5]
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:103
[6] emit_llvm
@ ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:97 [inlined]
[7]
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:136
[8]
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:111
[9] compile
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:103 [inlined]
[10] #1145
@ ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:254 [inlined]
[11] JuliaContext(f::CUDA.var"#1145#1148"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:52
[12] JuliaContext(f::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:42
[13] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:253
[14] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:128
[15] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:103
[16] macro expansion
@ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:369 [inlined]
[17] macro expansion
@ ./lock.jl:267 [inlined]
[18] cufunction(f::typeof(grad_mul_kernel), tt::Type{Tuple{CuDeviceVector{…}, CuDeviceVector{…}}}; kwargs::@Kwargs{})
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:364
[19] cufunction(f::typeof(grad_mul_kernel), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}})
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:361
[20] top-level scope
@ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:112
Some type information was truncated. Use `show(err)` to see complete types.