I have a cosine function call inside a CUDA kernel, and compiling the kernel throws an LLVM error.
Here is a minimal working example (MWE) that reproduces the issue.
File: cos_julia.jl
using CUDA
# GPU kernel: store the cosine of each element of `x` in the matching slot of `y`.
# Each thread handles at most one element and the kernel returns nothing.
function cos_kernel!(y, x)
    # Global thread index across all blocks of the launch
    tid = threadIdx().x + blockDim().x * (blockIdx().x - 1)
    # Threads whose index falls past the end of `x` have no element to process
    if tid > length(x)
        return
    end
    y[tid] = cos(x[tid])
    return
end
# Problem size and device buffers: random Float32 input plus an output of the same shape
N = 1024
x_gpu = CUDA.randn(Float32, N)
y_gpu = similar(x_gpu)

# Launch configuration: `cld` rounds the division up so the blocks cover all N elements
threads = 256
blocks = cld(N, threads)

# Launch the kernel and wait for it to finish
@cuda threads=threads blocks=blocks cos_kernel!(y_gpu, x_gpu)
synchronize()

# Copy the result into a host Array and print the first ten values
y = Array(y_gpu)
println(y[1:10])
Error:
ERROR: LoadError: LLVM error: Instruction Combining did not reach a fixpoint after 1 iterations
Stacktrace:
[1] handle_error(reason::Cstring) @ LLVM ~/.julia-1.12.0/packages/LLVM/b3kFs/src/core/context.jl:194
[2] LLVMRunJuliaPasses
@ ~/.julia-1.12.0/packages/LLVM/b3kFs/lib/18/libLLVM_extra.jl:291 [inlined]
[3] macro expansion
@ ~/.julia-1.12.0/packages/LLVM/b3kFs/src/executionengine/utils.jl:25 [inlined]
[4] run!(pb::LLVM.NewPMPassBuilder, target::LLVM.Module, tm::LLVM.TargetMachine)
@ LLVM ~/.julia-1.12.0/packages/LLVM/b3kFs/src/newpm.jl:291
[5] macro expansion
@ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/optim.jl:20 [inlined]
[6] macro expansion
@ ~/.julia-1.12.0/packages/LLVM/b3kFs/src/base.jl:97 [inlined]
[7] optimize!(job::GPUCompiler.CompilerJob, mod::LLVM.Module; opt_level::Int64)
@ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/optim.jl:9
[8] optimize!
@ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/optim.jl:3 [inlined]
[9] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:284
[10] emit_llvm
@ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:154 [inlined]
[11] #compile_unhooked#112
@ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:95 [inlined]
[12] compile_unhooked
@ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:80 [inlined]
[13] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:67
[14] compile
@ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:55 [inlined]
[15] #compile##0
@ ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/compilation.jl:255 [inlined]
[16] JuliaContext(f::CUDA.var"#compile##0#compile##1"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:34
[17] JuliaContext(f::Function)
@ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:25
[18] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/compilation.jl:254
[19] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/execution.jl:245
[20] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/execution.jl:159
[21] macro expansion
@ ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/execution.jl:373 [inlined]
[22] macro expansion
@ ./lock.jl:376 [inlined]
[23] cufunction(f::typeof(cos_kernel!), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}; kwargs::@Kwargs{})
@ CUDA ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/execution.jl:368
[24] cufunction(f::typeof(cos_kernel!), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}})
@ CUDA ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/execution.jl:365
Here is the Julia version info:
julia> versioninfo()
Julia Version 1.12.0-beta1
Commit c175ace780d (2025-04-02 11:19 UTC)
Build Info:
Official https://julialang.org release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 255 × AMD EPYC 7763 64-Core Processor
WORD_SIZE: 64
LLVM: libLLVM-18.1.7 (ORCJIT, znver3)
GC: Built with stock GC
Threads: 1 default, 1 interactive, 1 GC (on 255 virtual cores)
Environment:
LD_LIBRARY_PATH = .:/home/nqx/packages/julia-1.12.0-beta1/lib:
JULIA_CACHE_PATH = /home/nqx/.julia-1.12.0
JULIA_INSTALL_DIR = /home/nqx/packages/julia-1.12.0-beta1
JULIA_DEPOT_PATH = /home/nqx/.julia-1.12.0:/home/nqx/.julia-1.12.0/
JULIA_CACHE_NAME = .julia-1.12.0