Hi,
This is a fairly minimal reproducer. It triggers the error on both clusters that I have access to:
import Pkg
Pkg.activate(".")
using CUDA
Pkg.status()
struct SpaceParm{N,M,B,D}
    ndim::Int64
    iL::NTuple{N,Int64}
    npls::Int64
    plidx::NTuple{M,Tuple{Int64, Int64}}
    blk::NTuple{N,Int64}
    blkS::NTuple{N,Int64}
    rbk::NTuple{N,Int64}
    rbkS::NTuple{N,Int64}
    bsz::Int64
    rsz::Int64
    ntw::NTuple{M,Int64}

    function SpaceParm{N}(x, y, nt::Union{Nothing,NTuple{I,Int64}}=nothing) where {N,I}
        M = convert(Int64, round(N*(N-1)/2))
        N == length(x) || throw(ArgumentError("Lattice size incorrect length for dimension $N"))
        N == length(y) || throw(ArgumentError("Block size incorrect length for dimension $N"))
        if any(i->i!=0, x.%y)
            error("Lattice size not divisible by block size.")
        end

        pls = Vector{Tuple{Int64, Int64}}()
        for i in N:-1:1
            for j in 1:i-1
                push!(pls, (i,j))
            end
        end

        r  = div.(x, y)
        rS = ones(N)
        yS = ones(N)
        for i in 2:N
            for j in 1:i-1
                rS[i] = rS[i]*r[j]
                yS[i] = yS[i]*y[j]
            end
        end
        D = prod(y)

        if nt == nothing
            ntw = ntuple(i->0, M)
        else
            ntw = nt
        end

        return new{N,M,0,D}(N, x, M, tuple(pls...), y,
                            tuple(yS...), tuple(r...), tuple(rS...), prod(y), prod(r), ntw)
    end
end
@inline cntb(nb, id::Int64, lp::SpaceParm) = mod(div(nb-1,lp.blkS[id]),lp.blk[id])
@inline cntr(nr, id::Int64, lp::SpaceParm) = mod(div(nr-1,lp.rbkS[id]),lp.rbk[id])
@inline cnt(nb, nr, id::Int64, lp::SpaceParm) = 1 + cntb(nb,id,lp) + cntr(nr,id,lp)*lp.blk[id]
@inline function point_time(p::NTuple{2,Int64}, lp::SpaceParm{N,M,B,D}) where {N,M,B,D}
    return cnt(p[1], p[2], N, lp)
end
function krnl_foo!(ac, lp)
    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
    it = point_time((b,r), lp)
    ac[b,r] = zero(eltype(ac))
    return nothing
end
lp = SpaceParm{4}((12,12,12,12), (4,4,4,4))
A = randn(10,10)
ac = CuArray(A)
CUDA.@sync begin
    @device_code_warntype CUDA.@cuda threads=10 blocks=10 krnl_foo!(ac,lp)
end
When this is run with CUDA v3.3.3 I get:
Activating environment at `~/code/CUDAv3.3.3/Project.toml`
Status `~/code/CUDAv3.3.3/Project.toml`
[052768ef] CUDA v3.3.3
PTX CompilerJob of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}) for sm_70
Variables
#self#::Core.Const(krnl_foo!)
ac::CuDeviceMatrix{Float64, 1}
lp::SpaceParm{4, 6, 0, 256}
it::Int64
r::Int64
b::Int64
Body::Nothing
1 ─ %1 = CUDA.threadIdx::Core.Const(CUDA.threadIdx)
│ %2 = (%1)()::NamedTuple{(:x, :y, :z), Tuple{Int64, Int64, Int64}}
│ %3 = Base.getproperty(%2, :x)::Int64
│ %4 = CUDA.blockIdx::Core.Const(CUDA.blockIdx)
│ %5 = (%4)()::NamedTuple{(:x, :y, :z), Tuple{Int64, Int64, Int64}}
│ %6 = Base.getproperty(%5, :x)::Int64
│ (b = %3)
│ (r = %6)
│ %9 = Core.tuple(b, r)::Tuple{Int64, Int64}
│ (it = Main.point_time(%9, lp))
│ %11 = Main.eltype(ac)::Core.Const(Float64)
│ %12 = Main.zero(%11)::Core.Const(0.0)
│ Base.setindex!(ac, %12, b, r)
└── return Main.nothing
(i.e. it infers the correct data types, and the kernel works)
But with the latest version (CUDA v3.5.0), I get:
Activating environment at `~/code/CUDAv3.5.0/Project.toml`
Status `~/code/CUDAv3.5.0/Project.toml`
[052768ef] CUDA v3.5.0
PTX CompilerJob of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}) for sm_70
Variables
#self#::Core.Const(krnl_foo!)
ac::CuDeviceMatrix{Float64, 1}
lp::SpaceParm{4, 6, 0, 256}
it::Union{}
r::Union{}
b::Union{}
Body::Union{}
1 ─ %1 = CUDA.threadIdx::Core.Const(CUDA.threadIdx)
│ (%1)()
│ Core.Const(:(Base.getproperty(%2, :x)))
│ Core.Const(:(CUDA.blockIdx))
│ Core.Const(:((%4)()))
│ Core.Const(:(Base.getproperty(%5, :x)))
│ Core.Const(:(b = %3))
│ Core.Const(:(r = %6))
│ Core.Const(:(Core.tuple(b, r)))
│ Core.Const(:(it = Main.point_time(%9, lp)))
│ Core.Const(:(Main.eltype(ac)))
│ Core.Const(:(Main.zero(%11)))
│ Core.Const(:(Base.setindex!(ac, %12, b, r)))
└── Core.Const(:(return Main.nothing))
ERROR: LoadError: GPU compilation of kernel krnl_foo!(CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}) failed
KernelError: kernel returns a value of type `Union{}`
Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.
Stacktrace:
[1] check_method(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/AJD5L/src/validation.jl:21
[2] macro expansion
@ ~/.julia/packages/TimerOutputs/SSeq1/src/TimerOutput.jl:252 [inlined]
[3] macro expansion
@ ~/.julia/packages/GPUCompiler/AJD5L/src/driver.jl:89 [inlined]
[4] emit_julia(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/AJD5L/src/utils.jl:62
[5] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:324
[6] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/AJD5L/src/cache.jl:89
[7] cufunction(f::typeof(krnl_foo!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:297
[8] cufunction(f::typeof(krnl_foo!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, SpaceParm{4, 6, 0, 256}}})
@ CUDA ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:291
[9] macro expansion
@ ~/.julia/packages/CUDA/YpW0k/src/compiler/execution.jl:102 [inlined]
[10] macro expansion
@ ~/.julia/packages/GPUCompiler/AJD5L/src/reflection.jl:147 [inlined]
[11] macro expansion
@ ~/code/CUDAv3.5.0/foo.jl:84 [inlined]
[12] top-level scope
@ ~/.julia/packages/CUDA/YpW0k/src/utilities.jl:28
[13] include(fname::String)
@ Base.MainInclude ./client.jl:444
[14] top-level scope
@ REPL[1]:1
in expression starting at /home/aramos/code/CUDAv3.5.0/foo.jl:83
I think the problem originates in the fact that, for some reason, CUDA v3.5.0 does not infer the types of CUDA.threadIdx().x and CUDA.blockIdx().x as Int64, but I am not sure.
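If that is indeed the cause, a stripped-down kernel that only touches the index intrinsics should already show the Union{} body under @device_code_warntype. This is just an untested sketch to help isolate it (krnl_idx! is a name I made up for this test):

# Minimal check: only the index intrinsics, no SpaceParm involved.
function krnl_idx!(ac)
    b, r = CUDA.threadIdx().x, CUDA.blockIdx().x
    ac[b, r] = zero(eltype(ac))
    return nothing
end

CUDA.@sync begin
    @device_code_warntype CUDA.@cuda threads=10 blocks=10 krnl_idx!(ac)
end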
For a more complicated example, you can try to run src/main/times.jl of this project: https://igit.ific.uv.es/alramos/latticegpu.jl
It shows a similar issue. Everything works perfectly with CUDA up to v3.3.3, but with later versions basically all kernels fail to compile.
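A possible workaround I have in mind (just a sketch, I have not checked whether it actually avoids the failure) would be to widen the indices explicitly before doing anything else with them:

function krnl_foo2!(ac, lp)
    # Explicitly convert the hardware indices to Int64 before further use,
    # in case the inference failure is related to their element type.
    b = Int64(CUDA.threadIdx().x)
    r = Int64(CUDA.blockIdx().x)
    it = point_time((b, r), lp)
    ac[b, r] = zero(eltype(ac))
    return nothing
end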
Many thanks!