@Const(x::T), where T <: AbstractArray, throwing error in KernelAbstractions kernel call

Hi all,

I am currently working on a softmax kernel while learning Julia in parallel. After going through some examples, I tried using the @Const macro together with an abstract type declaration in the kernel signature. The minimal kernel code is given below.

@kernel function _softmax!(y::T, @Const(x::T), @Const(dims::Int)) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end

Host Caller:

function softmax(x::T; dims::Int=1) where {T<:AbstractArray}
    
    # Check that the requested softmax dimension is valid
    input_dim_count = ndims(x)
    dims < input_dim_count || throw(AssertionError("dims=$dims not allowed. dims must be < $input_dim_count"))

    # Setting up output softmax array
    backend = get_backend(x)
    y = KernelAbstractions.zeros(backend, eltype(x), size(x))

    # calling softmax kernel
    groupsize = KernelAbstractions.isgpu(backend) ? 32 : 1024
    @device_code_warntype _softmax!(backend, groupsize)(y, x, dims, ndrange=size(x))

    # synchronize if backend=GPU else nothing
    KernelAbstractions.isgpu(backend) ? KernelAbstractions.synchronize(backend) : nothing
        
    return y
end
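
For context, a call matching the (1, 128, 128) shape in the output below might look like this (purely illustrative, not my actual benchmark script; CUDA.rand is just a convenient way to build a device array of that shape):

using CUDA
using KernelAbstractions

x = CUDA.rand(Float32, 1, 128, 128)   # input with the same shape as in the output below
y = softmax(x; dims=1)                # host caller above; dims=1 satisfies dims < ndims(x)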

The error shown below doesn't occur when I use @Const(x) or x::T instead.
Is this because, for the @Const macro to work, it needs to know the memory size at compile time, which is not possible with abstract types?
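
If it helps to see exactly what I mean, these are the variants that compile for me, sketched with the rest of the kernel unchanged:

# works: @Const on x without the type annotation
@kernel function _softmax!(y::T, @Const(x), @Const(dims::Int)) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end

# also works: the type annotation on x without @Const
@kernel function _softmax!(y::T, x::T, @Const(dims::Int)) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end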

When using CUDA, I get the following error (I have no experience interpreting IR, so I was not able to make much sense of it):

Added necessary packages - CUDA, CUDAKernels, Test, softmax.jl
Fetched the backend - CUDABackend

Benchmarking for - (1, 128, 128)
PTX CompilerJob of gpu__softmax!(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64) in world 32513 for sm_86, maxthreads=32

MethodInstance for gpu__softmax!(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}, ::Int64)
  from gpu__softmax!(__ctx__, y::T, x::T, dims::Int64) where T<:AbstractArray in Main
Static Parameters
  T = CuDeviceArray{Float32, 3, 1}
Arguments
  #self#::Core.Const(gpu__softmax!)
  __ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}
  y::CuDeviceArray{Float32, 3, 1}
  x@_4::CuDeviceArray{Float32, 3, 1}
  dims@_5::Int64
Locals
  x@_6::Union{}
  @_7::Union{}
  G_j::Union{}
  G_i::Union{}
  dims@_10::Union{}
Body::Union{}
1 ─ %1 = Core.typeassert(x@_4, $(Expr(:static_parameter, 1)))::CuDeviceArray{Float32, 3, 1}
│   %2 = (KernelAbstractions.constify)(%1)::CUDA.Const{Float32, 3, 1}
│        Base.convert($(Expr(:static_parameter, 1)), %2)
│        Core.Const(:(x@_6 = Core.typeassert(%3, $(Expr(:static_parameter, 1)))))
│        Core.Const(:(Core.typeassert(dims@_5, Main.Int)))
│        Core.Const(:((KernelAbstractions.constify)(%5)))
│        Core.Const(Core.NewvarNode(:(@_7)))
│        Core.Const(Core.NewvarNode(:(G_j)))
│        Core.Const(Core.NewvarNode(:(G_i)))
│        Core.Const(:(Base.convert(Main.Int, %6)))
│        Core.Const(:(dims@_10 = Core.typeassert(%10, Main.Int)))
│        Core.Const(:((KernelAbstractions.__validindex)(__ctx__)))
│        Core.Const(:(Core.typeassert(%12, Core.Bool)))
│        Core.Const(:(KernelAbstractions.__index_Global_NTuple(__ctx__)))
│        Core.Const(:(Base.indexed_iterate(%14, 1)))
│        Core.Const(:(G_i = Core.getfield(%15, 1)))
│        Core.Const(:(@_7 = Core.getfield(%15, 2)))
│        Core.Const(:(Base.indexed_iterate(%14, 2, @_7)))
│        Core.Const(:(G_j = Core.getfield(%18, 1)))
└──      Core.Const(:(return Main.nothing))

ERROR: LoadError: GPU compilation of gpu__softmax!(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64) in world 32513 failed
KernelError: kernel returns a value of type `Union{}`

Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.

Stacktrace:
  [1] check_method(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/validation.jl:54
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:153 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:152 [inlined]
  [5] emit_julia(job::GPUCompiler.CompilerJob; validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/utils.jl:83
  [6] emit_julia
    @ ~/.julia/packages/GPUCompiler/anMCs/src/utils.jl:77 [inlined]
  [7] compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:105
  [8] #203
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:100 [inlined]
  [9] JuliaContext(f::CUDA.var"#203#204"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:76
 [10] compile
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:99 [inlined]
 [11] actual_compilation(cache::Dict{UInt64, Any}, key::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, world::UInt64, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/cache.jl:184
 [12] cached_compilation(cache::Dict{UInt64, Any}, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/cache.jl:163
 [13] macro expansion
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:310 [inlined]
 [14] macro expansion
    @ ./lock.jl:223 [inlined]
 [15] cufunction(f::typeof(gpu__softmax!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64}}; kwargs::Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol}, NamedTuple{(:always_inline, :maxthreads), Tuple{Bool, Int64}}})
    @ CUDA ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:306
 [16] macro expansion
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:104 [inlined]
 [17] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.StaticSize{(32,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu__softmax!)})(::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, ::Vararg{Any}; ndrange::Tuple{Int64, Int64, Int64}, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/LjBYB/src/CUDAKernels.jl:116
 [18] macro expansion
    @ ~/.julia/packages/GPUCompiler/anMCs/src/reflection.jl:205 [inlined]
 [19] softmax(x::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}; dims::Int64)
    @ Main ~/julia-work/nnlib/src/native/softmax.jl:24
 [20] macro expansion
    @ ~/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:32 [inlined]
 [21] macro expansion
    @ ~/.julia/packages/NVTX/d9htq/src/macro.jl:119 [inlined]
 [22] top-level scope
    @ ~/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:29
in expression starting at /home/priyammehta/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:18

@Const is only meaningful for arrays, so the @Const on dims is unnecessary and is likely what is causing the issue.
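
A minimal sketch of that change, combined with the observation above that @Const(x) without the ::T annotation compiles (kernel body unchanged, purely illustrative):

@kernel function _softmax!(y::T, @Const(x), dims::Int) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end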