@Const(x::T), where T <: AbstractArray, throwing error in KernelAbstractions kernel call

Hi all,

I am currently working on a softmax kernel while learning Julia in parallel. After going through some examples, I tried using the @Const macro together with an abstract type declaration in the kernel signature. The minimal kernel code is given below.

@kernel function _softmax!(y::T, @Const(x::T), @Const(dims::Int)) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end

Host Caller:

function softmax(x::T; dims::Int=1) where {T<:AbstractArray}
    
    # Check that the requested softmax dimension is valid
    input_dim_count = ndims(x)
    dims < input_dim_count || throw(AssertionError("dims=$dims not allowed. dims must be < $input_dim_count"))

    # Setting up output softmax array
    backend = get_backend(x)
    y = KernelAbstractions.zeros(backend, eltype(x), size(x))

    # calling softmax kernel
    groupsize = KernelAbstractions.isgpu(backend) ? 32 : 1024
    @device_code_warntype _softmax!(backend, groupsize)(y, x, dims, ndrange=size(x))

    # synchronize if backend=GPU else nothing
    KernelAbstractions.isgpu(backend) ? KernelAbstractions.synchronize(backend) : nothing
        
    return y
end
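
For context, a call matching the (1, 128, 128) shape in the output below might look like this (purely illustrative, not my actual benchmark script; CUDA.rand is just a convenient way to build a device array of that shape):

using CUDA
using KernelAbstractions

x = CUDA.rand(Float32, 1, 128, 128)   # input with the same shape as in the output below
y = softmax(x; dims=1)                # host caller above; dims=1 satisfies dims < ndims(x)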

The error shown below doesn't occur when I use @Const(x) or x::T instead.
Is this because, for the @Const macro to work, it needs to know the memory size at compile time, which is not possible with abstract types?
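
If it helps to see exactly what I mean, these are the variants that compile for me, sketched with the rest of the kernel unchanged:

# works: @Const on x without the type annotation
@kernel function _softmax!(y::T, @Const(x), @Const(dims::Int)) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end

# also works: the type annotation on x without @Const
@kernel function _softmax!(y::T, x::T, @Const(dims::Int)) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end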

When using CUDA, I get the following error (I have no experience interpreting IR, so I was not able to make much sense of it):

Added necessary packages - CUDA, CUDAKernels, Test, softmax.jl
Fetched the backend - CUDABackend

Benchmarking for - (1, 128, 128)
PTX CompilerJob of gpu__softmax!(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64) in world 32513 for sm_86, maxthreads=32

MethodInstance for gpu__softmax!(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}, ::Int64)
  from gpu__softmax!(__ctx__, y::T, x::T, dims::Int64) where T<:AbstractArray in Main
Static Parameters
  T = CuDeviceArray{Float32, 3, 1}
Arguments
  #self#::Core.Const(gpu__softmax!)
  __ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}
  y::CuDeviceArray{Float32, 3, 1}
  x@_4::CuDeviceArray{Float32, 3, 1}
  dims@_5::Int64
Locals
  x@_6::Union{}
  @_7::Union{}
  G_j::Union{}
  G_i::Union{}
  dims@_10::Union{}
Body::Union{}
1 ─ %1 = Core.typeassert(x@_4, $(Expr(:static_parameter, 1)))::CuDeviceArray{Float32, 3, 1}
│   %2 = (KernelAbstractions.constify)(%1)::CUDA.Const{Float32, 3, 1}
│        Base.convert($(Expr(:static_parameter, 1)), %2)
│        Core.Const(:(x@_6 = Core.typeassert(%3, $(Expr(:static_parameter, 1)))))
│        Core.Const(:(Core.typeassert(dims@_5, Main.Int)))
│        Core.Const(:((KernelAbstractions.constify)(%5)))
│        Core.Const(Core.NewvarNode(:(@_7)))
│        Core.Const(Core.NewvarNode(:(G_j)))
│        Core.Const(Core.NewvarNode(:(G_i)))
│        Core.Const(:(Base.convert(Main.Int, %6)))
│        Core.Const(:(dims@_10 = Core.typeassert(%10, Main.Int)))
│        Core.Const(:((KernelAbstractions.__validindex)(__ctx__)))
│        Core.Const(:(Core.typeassert(%12, Core.Bool)))
│        Core.Const(:(KernelAbstractions.__index_Global_NTuple(__ctx__)))
│        Core.Const(:(Base.indexed_iterate(%14, 1)))
│        Core.Const(:(G_i = Core.getfield(%15, 1)))
│        Core.Const(:(@_7 = Core.getfield(%15, 2)))
│        Core.Const(:(Base.indexed_iterate(%14, 2, @_7)))
│        Core.Const(:(G_j = Core.getfield(%18, 1)))
└──      Core.Const(:(return Main.nothing))

ERROR: LoadError: GPU compilation of gpu__softmax!(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64) in world 32513 failed
KernelError: kernel returns a value of type `Union{}`

Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.

Stacktrace:
  [1] check_method(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/validation.jl:54
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:153 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:152 [inlined]
  [5] emit_julia(job::GPUCompiler.CompilerJob; validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/utils.jl:83
  [6] emit_julia
    @ ~/.julia/packages/GPUCompiler/anMCs/src/utils.jl:77 [inlined]
  [7] compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:105
  [8] #203
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:100 [inlined]
  [9] JuliaContext(f::CUDA.var"#203#204"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:76
 [10] compile
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:99 [inlined]
 [11] actual_compilation(cache::Dict{UInt64, Any}, key::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, world::UInt64, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/cache.jl:184
 [12] cached_compilation(cache::Dict{UInt64, Any}, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/cache.jl:163
 [13] macro expansion
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:310 [inlined]
 [14] macro expansion
    @ ./lock.jl:223 [inlined]
 [15] cufunction(f::typeof(gpu__softmax!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64}}; kwargs::Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol}, NamedTuple{(:always_inline, :maxthreads), Tuple{Bool, Int64}}})
    @ CUDA ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:306
 [16] macro expansion
    @ ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:104 [inlined]
 [17] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.StaticSize{(32,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu__softmax!)})(::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, ::Vararg{Any}; ndrange::Tuple{Int64, Int64, Int64}, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/LjBYB/src/CUDAKernels.jl:116
 [18] macro expansion
    @ ~/.julia/packages/GPUCompiler/anMCs/src/reflection.jl:205 [inlined]
 [19] softmax(x::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}; dims::Int64)
    @ Main ~/julia-work/nnlib/src/native/softmax.jl:24
 [20] macro expansion
    @ ~/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:32 [inlined]
 [21] macro expansion
    @ ~/.julia/packages/NVTX/d9htq/src/macro.jl:119 [inlined]
 [22] top-level scope
    @ ~/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:29
in expression starting at /home/priyammehta/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:18

@Const is only meaningful for arrays, so the @Const on dims is unnecessary and is likely what is causing the issue.
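
A minimal sketch of that change, combined with the observation above that @Const(x) without the ::T annotation compiles (kernel body unchanged, purely illustrative):

@kernel function _softmax!(y::T, @Const(x), dims::Int) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end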