Calling cos in a CUDA kernel gives an LLVM "Instruction Combining" error

I am calling the cosine function inside a CUDA kernel, and compiling the kernel throws an LLVM error.

Here is the MWE to reproduce this issue.
File: cos_julia.jl

using CUDA

# GPU kernel: each thread handles one element, storing cos(x[i]) into y[i].
function cos_kernel!(y, x)
    # 1-based global index of this thread across all blocks
    tid = threadIdx().x + blockDim().x * (blockIdx().x - 1)
    # Threads whose index falls past the end of `x` have no work to do
    tid > length(x) && return
    y[tid] = cos(x[tid])
    return
end

# Problem size and launch configuration
N = 1024
threads = 256
blocks = cld(N, threads)  # ceiling division, so every element gets a thread

# Random input on the device, plus an output buffer of matching type and size
x_gpu = CUDA.randn(Float32, N)
y_gpu = similar(x_gpu)

# Launch the kernel and wait for it to finish
@cuda threads=threads blocks=blocks cos_kernel!(y_gpu, x_gpu)
synchronize()

# Copy the result back to the host and show the first ten values
y = Array(y_gpu)
println(first(y, 10))

Error:

ERROR: LoadError: LLVM error: Instruction Combining did not reach a fixpoint after 1 iterations
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia-1.12.0/packages/LLVM/b3kFs/src/core/context.jl:194
  [2] LLVMRunJuliaPasses
    @ ~/.julia-1.12.0/packages/LLVM/b3kFs/lib/18/libLLVM_extra.jl:291 [inlined]
  [3] macro expansion
    @ ~/.julia-1.12.0/packages/LLVM/b3kFs/src/executionengine/utils.jl:25 [inlined]
  [4] run!(pb::LLVM.NewPMPassBuilder, target::LLVM.Module, tm::LLVM.TargetMachine)
    @ LLVM ~/.julia-1.12.0/packages/LLVM/b3kFs/src/newpm.jl:291
  [5] macro expansion
    @ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/optim.jl:20 [inlined]
  [6] macro expansion
    @ ~/.julia-1.12.0/packages/LLVM/b3kFs/src/base.jl:97 [inlined]
  [7] optimize!(job::GPUCompiler.CompilerJob, mod::LLVM.Module; opt_level::Int64)
    @ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/optim.jl:9
  [8] optimize!
    @ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/optim.jl:3 [inlined]
  [9] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:284
 [10] emit_llvm
    @ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:154 [inlined]
 [11] #compile_unhooked#112
    @ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:95 [inlined]
 [12] compile_unhooked
    @ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:80 [inlined]
 [13] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:67
 [14] compile
    @ ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:55 [inlined]
 [15] #compile##0
    @ ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/compilation.jl:255 [inlined]
 [16] JuliaContext(f::CUDA.var"#compile##0#compile##1"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:34
 [17] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/driver.jl:25
 [18] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/compilation.jl:254
 [19] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/execution.jl:245
 [20] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia-1.12.0/packages/GPUCompiler/2MI6e/src/execution.jl:159
 [21] macro expansion
    @ ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/execution.jl:373 [inlined]
 [22] macro expansion
    @ ./lock.jl:376 [inlined]
 [23] cufunction(f::typeof(cos_kernel!), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}; kwargs::@Kwargs{})
    @ CUDA ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/execution.jl:368
 [24] cufunction(f::typeof(cos_kernel!), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}})
    @ CUDA ~/.julia-1.12.0/packages/CUDA/RQqFT/src/compiler/execution.jl:365

Here is the julia version info

julia> versioninfo()
Julia Version 1.12.0-beta1
Commit c175ace780d (2025-04-02 11:19 UTC)
Build Info:
  Official https://julialang.org release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 255 × AMD EPYC 7763 64-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-18.1.7 (ORCJIT, znver3)
  GC: Built with stock GC
Threads: 1 default, 1 interactive, 1 GC (on 255 virtual cores)
Environment:
  LD_LIBRARY_PATH = .:/home/nqx/packages/julia-1.12.0-beta1/lib:
  JULIA_CACHE_PATH = /home/nqx/.julia-1.12.0
  JULIA_INSTALL_DIR = /home/nqx/packages/julia-1.12.0-beta1
  JULIA_DEPOT_PATH = /home/nqx/.julia-1.12.0:/home/nqx/.julia-1.12.0/
  JULIA_CACHE_NAME = .julia-1.12.0

Hi, you should try this on the release channel and report whether it works there, since packages are not expected to work (and don't have to) on an alpha release.
This is especially true here, since I think GPUCompiler.jl is still in the process of being adapted to Julia 1.12.