Hi all,
I am currently working on a softmax kernel while learning Julia in parallel. After going through some examples, I tried using the @Const macro together with an abstract type declaration in a kernel. A minimal kernel is given below.
@kernel function _softmax!(y::T, @Const(x::T), @Const(dims::Int)) where {T<:AbstractArray}
    G_i, G_j = @index(Global, NTuple)
end
Host Caller:
function softmax(x::T; dims::Int=1) where {T<:AbstractArray}
    # Check that the requested softmax dimension is valid
    input_dim_count = ndims(x)
    dims < input_dim_count || throw(AssertionError("dims=$dims not allowed; need dims < $input_dim_count"))
    # Set up the output array on the same backend as the input
    backend = get_backend(x)
    y = KernelAbstractions.zeros(backend, eltype(x), size(x))
    # Launch the softmax kernel (the warntype output below comes from this macro)
    groupsize = KernelAbstractions.isgpu(backend) ? 32 : 1024
    @device_code_warntype _softmax!(backend, groupsize)(y, x, dims, ndrange=size(x))
    # Synchronize only on GPU backends
    KernelAbstractions.isgpu(backend) && KernelAbstractions.synchronize(backend)
    return y
end
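For completeness, the call that triggers the compilation looks roughly like this (a simplified sketch, not the actual test script softmax_performant_cuda.jl; the array size matches the benchmark line in the output below, and I use the default dims here):

using CUDA, KernelAbstractions

x = CUDA.rand(Float32, 1, 128, 128)  # matches "Benchmarking for - (1, 128, 128)"
y = softmax(x; dims=1)               # default dims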
The error shown below doesn't occur when I use @Const(x) (without the type annotation) or a plain x::T. Is this because, for the @Const macro to work, it needs to know the memory size at compile time, which is not possible when using abstract types?
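For reference, the variant that does compile for me looks like this (the same minimal kernel, just dropping the ::T annotation inside @Const):

@kernel function _softmax!(y::T, @Const(x), @Const(dims::Int)) where {T<:AbstractArray}
    # only the global index is computed in this minimal reproduction
    G_i, G_j = @index(Global, NTuple)
end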
When using CUDA, I get the following error (I have no experience interpreting the IR, so I was not able to make sense of it):
Added necessary packages - CUDA, CUDAKernels, Test, softmax.jl
Fetched the backend - CUDABackend
Benchmarking for - (1, 128, 128)
PTX CompilerJob of gpu__softmax!(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64) in world 32513 for sm_86, maxthreads=32
MethodInstance for gpu__softmax!(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, ::CuDeviceArray{Float32, 3, 1}, ::CuDeviceArray{Float32, 3, 1}, ::Int64)
from gpu__softmax!(__ctx__, y::T, x::T, dims::Int64) where T<:AbstractArray in Main
Static Parameters
T = CuDeviceArray{Float32, 3, 1}
Arguments
#self#::Core.Const(gpu__softmax!)
__ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}
y::CuDeviceArray{Float32, 3, 1}
x@_4::CuDeviceArray{Float32, 3, 1}
dims@_5::Int64
Locals
x@_6::Union{}
@_7::Union{}
G_j::Union{}
G_i::Union{}
dims@_10::Union{}
Body::Union{}
1 ─ %1 = Core.typeassert(x@_4, $(Expr(:static_parameter, 1)))::CuDeviceArray{Float32, 3, 1}
│ %2 = (KernelAbstractions.constify)(%1)::CUDA.Const{Float32, 3, 1}
│ Base.convert($(Expr(:static_parameter, 1)), %2)
│ Core.Const(:(x@_6 = Core.typeassert(%3, $(Expr(:static_parameter, 1)))))
│ Core.Const(:(Core.typeassert(dims@_5, Main.Int)))
│ Core.Const(:((KernelAbstractions.constify)(%5)))
│ Core.Const(Core.NewvarNode(:(@_7)))
│ Core.Const(Core.NewvarNode(:(G_j)))
│ Core.Const(Core.NewvarNode(:(G_i)))
│ Core.Const(:(Base.convert(Main.Int, %6)))
│ Core.Const(:(dims@_10 = Core.typeassert(%10, Main.Int)))
│ Core.Const(:((KernelAbstractions.__validindex)(__ctx__)))
│ Core.Const(:(Core.typeassert(%12, Core.Bool)))
│ Core.Const(:(KernelAbstractions.__index_Global_NTuple(__ctx__)))
│ Core.Const(:(Base.indexed_iterate(%14, 1)))
│ Core.Const(:(G_i = Core.getfield(%15, 1)))
│ Core.Const(:(@_7 = Core.getfield(%15, 2)))
│ Core.Const(:(Base.indexed_iterate(%14, 2, @_7)))
│ Core.Const(:(G_j = Core.getfield(%18, 1)))
└── Core.Const(:(return Main.nothing))
ERROR: LoadError: GPU compilation of gpu__softmax!(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64) in world 32513 failed
KernelError: kernel returns a value of type `Union{}`
Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.
Stacktrace:
[1] check_method(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/validation.jl:54
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:153 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:152 [inlined]
[5] emit_julia(job::GPUCompiler.CompilerJob; validate::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/utils.jl:83
[6] emit_julia
@ ~/.julia/packages/GPUCompiler/anMCs/src/utils.jl:77 [inlined]
[7] compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
@ CUDA ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:105
[8] #203
@ ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:100 [inlined]
[9] JuliaContext(f::CUDA.var"#203#204"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/driver.jl:76
[10] compile
@ ~/.julia/packages/CUDA/LjBYB/src/compiler/compilation.jl:99 [inlined]
[11] actual_compilation(cache::Dict{UInt64, Any}, key::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, world::UInt64, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/cache.jl:184
[12] cached_compilation(cache::Dict{UInt64, Any}, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/anMCs/src/cache.jl:163
[13] macro expansion
@ ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:310 [inlined]
[14] macro expansion
@ ./lock.jl:223 [inlined]
[15] cufunction(f::typeof(gpu__softmax!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 3, 1}, CuDeviceArray{Float32, 3, 1}, Int64}}; kwargs::Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol}, NamedTuple{(:always_inline, :maxthreads), Tuple{Bool, Int64}}})
@ CUDA ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:306
[16] macro expansion
@ ~/.julia/packages/CUDA/LjBYB/src/compiler/execution.jl:104 [inlined]
[17] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.StaticSize{(32,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu__softmax!)})(::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, ::Vararg{Any}; ndrange::Tuple{Int64, Int64, Int64}, workgroupsize::Nothing)
@ CUDA.CUDAKernels ~/.julia/packages/CUDA/LjBYB/src/CUDAKernels.jl:116
[18] macro expansion
@ ~/.julia/packages/GPUCompiler/anMCs/src/reflection.jl:205 [inlined]
[19] softmax(x::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}; dims::Int64)
@ Main ~/julia-work/nnlib/src/native/softmax.jl:24
[20] macro expansion
@ ~/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:32 [inlined]
[21] macro expansion
@ ~/.julia/packages/NVTX/d9htq/src/macro.jl:119 [inlined]
[22] top-level scope
@ ~/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:29
in expression starting at /home/priyammehta/julia-work/nnlib/src/native/test/softmax/softmax_performant_cuda.jl:18