Problem with CUDAv3

I have been using CUDAv2.4 and everything has worked perfectly. But after upgrading, I have found that many kernels fail to compile. Even a simple kernel:

function krnl_foo!(ac, bc)
    b = threadIdx().x
    r = blockIdx().x
    @cuprintln(b, " ", r)
    ac[b, r] = 0.0
    return nothing
end

when inspected with @device_code_warntype results in:

CUDA.@sync begin
    @device_code_warntype CUDA.@cuda threads=10 blocks=10 krnl_foo!(ac, bc)
end
PTX CompilerJob of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, CuDeviceMatrix{Float64, 1}) for sm_60

Variables
  #self#::Core.Const(krnl_foo!)
  ac::CuDeviceMatrix{Float64, 1}
  bc::CuDeviceMatrix{Float64, 1}
  r::Union{}
  b::Union{}

Body::Union{}
1 ─     Main.threadIdx()
│       Core.Const(:(b = Base.getproperty(%1, :x)))
│       Core.Const(:(Main.blockIdx()))
│       Core.Const(:(r = Base.getproperty(%3, :x)))
│       Core.Const(:(CUDA._cuprint(b, Val{Symbol(" ")}(), r, Val{Symbol("\n")}())))
│       Core.Const(:(Base.setindex!(ac, 0.0, b, r)))
└──     Core.Const(:(return Main.nothing))

Why are b and r inferred as Union{}? In previous CUDA versions I was getting Int64. Is this normal in CUDAv3, or is my installation faulty?

Some details:

CUDA.versioninfo()
CUDA toolkit 11.4, artifact installation
NVIDIA driver 440.33.1, for CUDA 10.2
CUDA driver 11.4

Libraries: 
- CUBLAS: 11.5.4
- CURAND: 10.2.5
- CUFFT: 10.5.1
- CUSOLVER: 11.2.0
- CUSPARSE: 11.6.0
- CUPTI: 14.0.0
- NVML: 10.0.0+440.33.1
- CUDNN: 8.20.2 (for CUDA 11.4.0)
- CUTENSOR: 1.3.0 (for CUDA 11.2.0)

Toolchain:
- Julia: 1.6.3
- LLVM: 11.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80

1 device:
  0: Tesla P100-PCIE-12GB (sm_60, 11.902 GiB / 11.912 GiB available)

Many thanks!

This kernel works here, both on Julia 1.6 and 1.7. Please provide an actual MWE; the inputs ac and bc are undefined here. Also, ensure you’re using the latest version of CUDA.jl.

Hi,

My actual code is fairly complex, but the following reproduces the error on the two clusters that I have access to:

import Pkg
Pkg.activate(".")
using CUDA

Pkg.status()
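
# SpaceParm describes a blocked N-dimensional lattice: the global size iL is
# split into blocks of size blk; blkS and rbkS hold the linear strides used to
# decode per-dimension coordinates from a block-local index and a block index.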

struct SpaceParm{N,M,B,D}
    ndim::Int64
    iL::NTuple{N,Int64}
    npls::Int64
    plidx::NTuple{M,Tuple{Int64, Int64}}

    blk::NTuple{N,Int64}
    blkS::NTuple{N,Int64}
    rbk::NTuple{N,Int64}
    rbkS::NTuple{N,Int64}

    bsz::Int64
    rsz::Int64

    ntw::NTuple{M,Int64}
    
    function SpaceParm{N}(x, y, nt::Union{Nothing,NTuple{I,Int64}}=nothing) where {N,I}
        M = convert(Int64, round(N*(N-1)/2))
        N == length(x) || throw(ArgumentError("Lattice size incorrect length for dimension $N"))
        N == length(y) || throw(ArgumentError("Block   size incorrect length for dimension $N"))

        if any(i->i!=0, x.%y)
            error("Lattice size not divisible by block size.")
        end
        
        pls = Vector{Tuple{Int64, Int64}}()
        for i in N:-1:1
            for j in 1:i-1
                push!(pls, (i,j))
            end
        end

        r  = div.(x, y)
        rS = ones(Int64, N)
        yS = ones(Int64, N)
        for i in 2:N
            for j in 1:i-1
                rS[i] = rS[i]*r[j]
                yS[i] = yS[i]*y[j]
            end
        end

        D = prod(y)
        if nt === nothing
            ntw = ntuple(i->0, M)
        else
            ntw = nt
        end
        return new{N,M,0,D}(N, x, M, tuple(pls...), y,
                            tuple(yS...), tuple(r...), tuple(rS...), prod(y), prod(r), ntw)
    end

end
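
# Index decoding helpers: cntb/cntr recover the coordinate along dimension `id`
# from the linear within-block index `nb` and the block index `nr`; cnt
# combines them into the 1-based global coordinate along that dimension.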

@inline cntb(nb, id::Int64, lp::SpaceParm) = mod(div(nb-1,lp.blkS[id]),lp.blk[id])
@inline cntr(nr, id::Int64, lp::SpaceParm) = mod(div(nr-1,lp.rbkS[id]),lp.rbk[id])
@inline cnt(nb, nr, id::Int64, lp::SpaceParm)  = 1 + cntb(nb,id,lp) + cntr(nr,id,lp)*lp.blk[id]

@inline function point_time(p::NTuple{2,Int64}, lp::SpaceParm{N,M,B,D}) where {N,M,B,D}
    return cnt(p[1], p[2], N, lp)
end

function krnl_foo!(ac, lp)
    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
    it = point_time((b, r), lp)

    ac[b, r] = zero(eltype(ac))
    return nothing
end

lp = SpaceParm{4}((12,12,12,12), (4,4,4,4))

A = randn(10,10)
ac = CuArray(A)

CUDA.@sync begin
    @device_code_warntype CUDA.@cuda threads=10 blocks=10 krnl_foo!(ac, lp)
end

When this is run with CUDAv3.3.3 I get:

  Activating environment at `~/code/CUDAv3.3.3/Project.toml`
      Status `~/code/CUDAv3.3.3/Project.toml`
  [052768ef] CUDA v3.3.3
PTX CompilerJob of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}) for sm_70

Variables
  #self#::Core.Const(krnl_foo!)
  ac::CuDeviceMatrix{Float64, 1}
  lp::SpaceParm{4, 6, 0, 256}
  it::Int64
  r::Int64
  b::Int64

Body::Nothing
1 ─ %1  = CUDA.threadIdx::Core.Const(CUDA.threadIdx)
│   %2  = (%1)()::NamedTuple{(:x, :y, :z), Tuple{Int64, Int64, Int64}}
│   %3  = Base.getproperty(%2, :x)::Int64
│   %4  = CUDA.blockIdx::Core.Const(CUDA.blockIdx)
│   %5  = (%4)()::NamedTuple{(:x, :y, :z), Tuple{Int64, Int64, Int64}}
│   %6  = Base.getproperty(%5, :x)::Int64
│         (b = %3)
│         (r = %6)
│   %9  = Core.tuple(b, r)::Tuple{Int64, Int64}
│         (it = Main.point_time(%9, lp))
│   %11 = Main.eltype(ac)::Core.Const(Float64)
│   %12 = Main.zero(%11)::Core.Const(0.0)
│         Base.setindex!(ac, %12, b, r)
└──       return Main.nothing

(i.e., it infers the correct data types, and the kernel works)

But with the latest version (CUDAv3.5.0), I get:

 Activating environment at `~/code/CUDAv3.5.0/Project.toml`
      Status `~/code/CUDAv3.5.0/Project.toml`
  [052768ef] CUDA v3.5.0
PTX CompilerJob of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}) for sm_70

Variables
  #self#::Core.Const(krnl_foo!)
  ac::CuDeviceMatrix{Float64, 1}
  lp::SpaceParm{4, 6, 0, 256}
  it::Union{}
  r::Union{}
  b::Union{}

Body::Union{}
1 ─ %1 = CUDA.threadIdx::Core.Const(CUDA.threadIdx)
│        (%1)()
│        Core.Const(:(Base.getproperty(%2, :x)))
│        Core.Const(:(CUDA.blockIdx))
│        Core.Const(:((%4)()))
│        Core.Const(:(Base.getproperty(%5, :x)))
│        Core.Const(:(b = %3))
│        Core.Const(:(r = %6))
│        Core.Const(:(Core.tuple(b, r)))
│        Core.Const(:(it = Main.point_time(%9, lp)))
│        Core.Const(:(Main.eltype(ac)))
│        Core.Const(:(Main.zero(%11)))
│        Core.Const(:(Base.setindex!(ac, %12, b, r)))
└──      Core.Const(:(return Main.nothing))
ERROR: LoadError: GPU compilation of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}) failed
KernelError: kernel returns a value of type `Union{}`

Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.

Stacktrace:
  [1] check_method(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/AJD5L/src/validation.jl:21
  [2] macro expansion
    @ ~/.julia/packages/TimerOutputs/SSeq1/src/TimerOutput.jl:252 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/GPUCompiler/AJD5L/src/driver.jl:89 [inlined]
  [4] emit_julia(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/AJD5L/src/utils.jl:62
  [5] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:324
  [6] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/AJD5L/src/cache.jl:89
  [7] cufunction(f::typeof(krnl_foo!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:297
  [8] cufunction(f::typeof(krnl_foo!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}}})
    @ CUDA ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:291
  [9] macro expansion
    @ ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:102 [inlined]
 [10] macro expansion
    @ ~/.julia/packages/GPUCompiler/AJD5L/src/reflection.jl:147 [inlined]
 [11] macro expansion
    @ ~/code/CUDAv3.5.0/foo.jl:84 [inlined]
 [12] top-level scope
    @ ~/.julia/packages/CUDA/YpW0k/src/utilities.jl:28
 [13] include(fname::String)
    @ Base.MainInclude ./client.jl:444
 [14] top-level scope
    @ REPL[1]:1
in expression starting at /home/aramos/code/CUDAv3.5.0/foo.jl:83

I think the problem originates in the fact that, for some reason, CUDAv3.5.0 no longer infers the types of CUDA.threadIdx().x and CUDA.blockIdx().x as Int64, but I am not sure.

For a more complicated example, you can try to execute src/main/times.jl of the project https://igit.ific.uv.es/alramos/latticegpu.jl, which shows a similar issue. Everything works perfectly with CUDA up to v3.3.3, but with later versions basically all kernels fail to compile.

Many thanks!

Those intrinsics now return an Int32.
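
If downstream code assumes Int64 indices, one possible workaround (a sketch based on the kernel and point_time definitions you posted above) is to widen the indices once at the top of the kernel:

function krnl_foo!(ac, lp)
    # threadIdx()/blockIdx() return Int32 on CUDA.jl >= 3.5; widening once
    # keeps downstream code written against Int64 dispatching correctly
    b = Int(threadIdx().x)
    r = Int(blockIdx().x)
    it = point_time((b, r), lp)

    ac[b, r] = zero(eltype(ac))
    return nothing
end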

Ufff… This potentially breaks all operations with data structures inside kernels…

In any case, this is not what the code inspection claims:

Variables
  #self#::Core.Const(krnl_foo!)
  ac::CuDeviceMatrix{Float64, 1}
  lp::SpaceParm{4, 6, 0, 256}
  it::Union{}
  r::Union{}
  b::Union{}

(i.e., @device_code_warntype does not report them as Int32). The problem is that CUDAv3.5.0 claims that b and r are of type Union{}, and that breaks basically every operation these indices are used in inside the kernels.

PS: When did this change happen? This will break many kernels and data structures… Is this reversible in any way?

Why?

That’s not what that output implies. Union{} is the bottom type, representing an error; it doesn’t have anything to do with the Int64/Int32 change I mentioned. The fact that calls to CUDA.threadIdx return Union{} means something’s seriously wrong, as those intrinsics obviously shouldn’t ever error.
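
To illustrate what Union{} means (a minimal host-side sketch, not CUDA-specific): when inference proves that a call can only throw, it assigns the bottom type as its return type:

f() = error("boom")
Base.return_types(f, Tuple{})   # returns [Union{}]: f never returns normally

A Body::Union{} in the @device_code_warntype output means the same thing: the compiler has concluded that the kernel always throws.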

Well, see the example above. If one’s code expects these intrinsics to return Int64, the code will not work anymore.

If I explicitly convert these values to Int64 inside the kernel:

function krnl_foo!(ac, lp)
    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
    it = point_time((convert(Int64, b), convert(Int64, r)), lp)

    ac[b, r] = zero(eltype(ac))
    return nothing
end

The code works, but @device_code_warntype still claims that it, b, and r are of type Union{}:

      Status `~/code/CUDAv3.5.0/Project.toml`
  [052768ef] CUDA v3.5.0
PTX CompilerJob of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}) for sm_70

Variables
  #self#::Core.Const(krnl_foo!)
  ac::CuDeviceMatrix{Float64, 1}
  lp::SpaceParm{4, 6, 0, 256}
  it::Union{}
  r::Union{}
  b::Union{}

Body::Union{}
1 ─ %1 = CUDA.threadIdx::Core.Const(CUDA.threadIdx)
│        (%1)()
│        Core.Const(:(Base.getproperty(%2, :x)))
│        Core.Const(:(CUDA.blockIdx))
│        Core.Const(:((%4)()))
│        Core.Const(:(Base.getproperty(%5, :x)))
│        Core.Const(:(b = %3))
│        Core.Const(:(r = %6))
│        Core.Const(:(Main.convert(Main.Int64, b)))
│        Core.Const(:(Main.convert(Main.Int64, r)))
│        Core.Const(:(Core.tuple(%9, %10)))
│        Core.Const(:(it = Main.point_time(%11, lp)))
│        Core.Const(:(Main.eltype(ac)))
│        Core.Const(:(Main.zero(%13)))
│        Core.Const(:(Base.setindex!(ac, %14, b, r)))
└──      Core.Const(:(return Main.nothing))

Even the version without any call to point_time claims that b and r are Union{}. What am I doing wrong?

Thanks!

Please create an MWE, or try to reduce the problem. The kernel you posted at the top of this thread works fine for me. The threadIdx() functions obviously still work on CUDA.jl 3.x, so something else is likely going wrong in your application.

Ok, sorry. I will try to be more concise.

I have two problems:

  1. Routines that dispatch on the type returned by threadIdx() are now broken. MWE:
import Pkg
Pkg.activate(".")
using CUDA

Pkg.status()

idx(i::Int64) = zero(i)*1.0

function krnl_foo!(ac)
    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x

    ac[b, r] = idx(b)
    return nothing
end

A = randn(10,10)
ac = CuArray(A)

CUDA.@sync begin
  @device_code_warntype    CUDA.@cuda threads=10 blocks=10 krnl_foo!(ac)
end

A = Array(ac)
A .== zero(A)

works in CUDAv3.3.3, but fails in CUDAv3.5.0.

  2. Even when this is corrected by changing the argument type of idx(), the code works, but @device_code_warntype does not report the correct types. MWE:

import Pkg
Pkg.activate(".")
using CUDA

Pkg.status()

idx(i::Int32) = zero(i)*1.0 # Change type for CUDAv3.5.0

function krnl_foo!(ac)
    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x

    ac[b, r] = idx(b)
    return nothing
end

A = randn(10,10)
ac = CuArray(A)

CUDA.@sync begin
  @device_code_warntype    CUDA.@cuda threads=10 blocks=10 krnl_foo!(ac)
end

A = Array(ac)
A .== zero(A)

works in CUDAv3.5.0, but claims that b and r are of type Union{}:

 Activating environment at `~/code/CUDAv3.5.0/Project.toml`
      Status `~/code/CUDAv3.5.0/Project.toml`
  [052768ef] CUDA v3.5.0
PTX CompilerJob of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}) for sm_70

Variables
  #self#::Core.Const(krnl_foo!)
  ac::CuDeviceMatrix{Float64, 1}
  r::Union{}
  b::Union{}

Body::Union{}
1 ─ %1 = CUDA.threadIdx::Core.Const(CUDA.threadIdx)
│        (%1)()
│        Core.Const(:(Base.getproperty(%2, :x)))
│        Core.Const(:(CUDA.blockIdx))
│        Core.Const(:((%4)()))
│        Core.Const(:(Base.getproperty(%5, :x)))
│        Core.Const(:(b = %3))
│        Core.Const(:(r = %6))
│        Core.Const(:(Main.idx(b)))
│        Core.Const(:(Base.setindex!(ac, %9, b, r)))
└──      Core.Const(:(return Main.nothing))
10×10 BitMatrix:
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1

In my particular case, because of 1), a substantial part of my code is broken in CUDAv3.5.0.

Many thanks!

  1. Yes, that’s unfortunate, but using Int32 indices is probably worth it, since it can result in significant reductions in register usage. Besides, for code to work on 32-bit platforms you’d better make those methods generically typed anyway, so can’t you update them to accept an ::Integer? For structures, you can just provide a generically-typed constructor (again ::Integer, or <:Integer) that converts wherever appropriate; there’s no need to make the structures parametric. See the sketch after this list.

  2. Ah, now I understand: it was just a rendering issue. This is a known issue on Julia 1.6, due to how code reflection methods interact with method redefinitions. It has been fixed in Julia 1.7, so you can try upgrading (or at least test whether it works as expected).
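
A sketch of what item 1 suggests, reusing the idx method from your MWE; the Point struct and its field names are hypothetical, just for illustration:

# Accept any Integer, so the method dispatches for Int64 (CUDA.jl <= 3.3.3)
# as well as Int32 (CUDA.jl >= 3.5) indices:
idx(i::Integer) = zero(i)*1.0

# Keep concrete Int64 fields, but convert in a generically-typed constructor:
struct Point
    b::Int64
    r::Int64
    Point(b::Integer, r::Integer) = new(b, r)   # new() converts to Int64
end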