CUDA.jl not inlining "simple" functions

After porting some code from using oneAPI.jl GPU kernels to CUDA.jl I got timings in CUDA which were higher than the ones from running in an Intel GPU.
Something like getting 20 ms on an Nvidia GTX 1080 Ti and 14 ms on an Intel UHD Graphics 620, which was really not expected, given that memory transfers were ruled out and that this is a flagship card against a low-end integrated one.

After some research, I narrowed the potential issue down to inlining optimizations not happening when dual numbers (a number type used for forward-mode automatic differentiation, structurally similar to complex numbers) are used. I’m not sure this is the only issue, though.

I’m sharing an MWE below where a “simple” multiplication function being force-inlined or not has a huge impact.

In my original code I have no access to the functions, so I cannot force them to be inlined. Is there something that can be done? Can this be considered a bug?

The times I’m seeing in the machines I’m trying are the following:

  • oneAPI.jl - Intel UHD Graphics 620:
    • with @inline annotated multiplication function (fmul_1)
      • real numbers: 8.0ms
      • dual numbers: 12.0 ms
    • with not annotated multiplication function (fmul_2)
      • same as above
  • CUDA.jl - Nvidia Gtx 1080 Ti:
    • with @inline annotated multiplication function (fmul_1)
      • real numbers: 0.5ms
      • dual numbers: 0.6ms
    • with not annotated multiplication function (fmul_2)
      • real numbers: 0.5ms
      • dual numbers: 12.6ms (!!!)
using StaticArrays
import Zygote
import CUDA
import oneAPI

# The matrix–vector product under test: both definitions have identical
# bodies, but only fmul_1 is forced inline via @inline, while fmul_2 is left
# to the compiler's own inlining heuristics. Comparing the two shows whether
# the GPU backend inlines the call on its own.
@inline fmul_1(lhs, rhs) = lhs * rhs
        fmul_2(lhs, rhs) = lhs * rhs

# Per-element workload: for one scalar `a`, build an SVector from `a` and each
# element `b` of `arrayvec3`, multiply it by `mat3x3`, and accumulate the sum
# of the product. Does O(length(arrayvec3)) work per call.
@inline function compute_element(arrayvec3, a::T, mat3x3) where T
    acc = zero(T)  # type-stable accumulator matching the element type
    for b in arrayvec3
        u = SVector{3, T}(a, a, b)

        # Switch between the two lines to compare the @inline-annotated and
        # unannotated multiplication (timings observed on a GTX 1080 Ti):
        # v = fmul_1(mat3x3, u)    ### CUDA:  0.5ms with reals,  0.6ms with duals
        v = fmul_2(mat3x3, u)      ### CUDA:  0.5ms with reals, 12.6ms with duals

        acc += sum( v )
    end
    return acc
end

# CUDA backend: launch one thread per element of `arrayvec3`, overwriting each
# element with the result of `compute_element`.
function compute(arrayvec3::CUDA.CuArray, mat3x3)
    function kernel(arrayvec3, mat3x3)
        # Global 1-based thread index from block/thread coordinates.
        i = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
        i > size(arrayvec3,1) && return nothing  # guard threads past the end
        # NOTE(review): compute_element reads the whole array while other
        # threads are overwriting entries, so the resulting values are racy —
        # acceptable for a timing MWE, not for real results.
        arrayvec3[i] = compute_element(arrayvec3, arrayvec3[i], mat3x3)
        return nothing
    end
    threads = 16*16; blocks = cld(length(arrayvec3), threads)  # 256 threads/block
    CUDA.@cuda threads=threads blocks=blocks kernel(arrayvec3, mat3x3)
    nothing
end

# oneAPI backend: one work-item per element, mirroring the CUDA method above.
function compute(arrayvec3::oneAPI.oneArray, mat3x3)
    function kernel(arrayvec3, mat3x3)
        # NOTE(review): oneAPI.jl generally follows Julia's 1-based dimension
        # convention — confirm that get_global_id(0) addresses the intended
        # (first) dimension here.
        i = oneAPI.get_global_id(0)
        i > size(arrayvec3,1) && return nothing  # guard items past the end
        arrayvec3[i] = compute_element(arrayvec3, arrayvec3[i], mat3x3)
        return nothing
    end
    items = 16*16; groups = cld(length(arrayvec3), items)
    # NOTE(review): unlike the CUDA method, this returns the @oneapi launch
    # value instead of `nothing`.
    oneAPI.@oneapi items=items groups=groups kernel(arrayvec3, mat3x3)
end

# CPU fallback: overwrite every element of `arrayvec3` in place with the
# accumulated result of `compute_element` for that element.
function compute(arrayvec3::Array, mat3x3)
    @inbounds for idx in eachindex(arrayvec3)
        arrayvec3[idx] = compute_element(arrayvec3, arrayvec3[idx], mat3x3)
    end
end


# --- Driver script ---------------------------------------------------------
# Build a random 3x3 Float32 matrix plus a Dual-number copy (one partial
# derivative) to mimic a ForwardDiff/Zygote workload.
mat3x3 = rand(SMatrix{3,3,Float32})
mat3x3dual = SMatrix{3,3,Zygote.ForwardDiff.Dual{Nothing,Float32,1}}(mat3x3)

arrayvec3 = rand(Float32, 10000)
# Element-type conversion Float32 -> Dual via the Array constructor.
arrayvec3dual = Array{Zygote.ForwardDiff.Dual{Nothing,Float32,1}}(arrayvec3)

# Toggle between the CPU and GPU code paths.
# gpu = false
gpu = true
if gpu
    # Prefer CUDA when a CUDA device is available, otherwise fall back to oneAPI.
    gpu_arr = CUDA.has_cuda() ? CUDA.CuArray : oneAPI.oneArray
    arrayvec3 = arrayvec3 |> gpu_arr
    arrayvec3dual = arrayvec3dual |> gpu_arr
end

compute(arrayvec3, mat3x3); Array(arrayvec3) # warm up
compute(arrayvec3dual, mat3x3dual); Array(arrayvec3dual) # warm up
# Array(...) copies back to the host, which synchronizes with the device so
# the kernel time is included in the @time measurement.
@time (compute(arrayvec3, mat3x3); ret = Array(arrayvec3) )
@time (compute(arrayvec3dual, mat3x3dual); retdual = Array(arrayvec3dual) )
2 Likes

I’m getting the following timings:

CUDA.jl
  592.662 μs (13 allocations: 78.53 KiB)
  4.978 ms (13 allocations: 78.53 KiB)

oneAPI.jl
  11.427 ms (67 allocations: 81.95 KiB)
  10.438 ms (67 allocations: 81.95 KiB)

If I compare the generated code though, the same happens with oneAPI: the function is not inlined.

using StaticArrays
import Zygote
import CUDA
import oneAPI

using BenchmarkTools

# Identical bodies; only fmul_1 carries @inline. The benchmark below passes
# one or the other as a higher-order argument to select the variant.
@inline fmul_1(m,v) = m * v
fmul_2(m,v) = m * v

# Per-element workload, parameterized on the multiplication function `fmul`
# (fmul_1 or fmul_2) so both variants can be benchmarked without editing code.
@inline function compute_element(fmul, arrayvec3, a::T, mat3x3) where T
    acc = zero(T)  # type-stable accumulator matching the element type
    for b in arrayvec3
        u = SVector{3, T}(a, a, b)
        v = fmul(mat3x3, u)
        acc += sum(v)
    end
    return acc
end

# CUDA backend: one thread per element; `fmul` selects the multiplication
# variant under test.
function compute(fmul, arrayvec3::CUDA.CuArray, mat3x3)
    function kernel(arrayvec3, mat3x3)
        # Global 1-based thread index from block/thread coordinates.
        i = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
        i > size(arrayvec3,1) && return  # guard threads past the end
        # NOTE(review): the whole array is read while other threads write it,
        # so values are racy — fine for a timing MWE only.
        arrayvec3[i] = compute_element(fmul, arrayvec3, arrayvec3[i], mat3x3)
        return
    end
    threads = 16*16; blocks = cld(length(arrayvec3), threads)  # 256 threads/block
    CUDA.@cuda threads=threads blocks=blocks kernel(arrayvec3, mat3x3)
    nothing
end

# oneAPI backend mirroring the CUDA method above.
function compute(fmul, arrayvec3::oneAPI.oneArray, mat3x3)
    function kernel(arrayvec3, mat3x3)
        # NOTE(review): confirm get_global_id(0) addresses the intended (first)
        # dimension — oneAPI.jl generally follows 1-based Julia conventions.
        i = oneAPI.get_global_id(0)
        i > size(arrayvec3,1) && return  # guard items past the end
        arrayvec3[i] = compute_element(fmul, arrayvec3, arrayvec3[i], mat3x3)
        return
    end
    items = 16*16; groups = cld(length(arrayvec3), items)
    oneAPI.@oneapi items=items groups=groups kernel(arrayvec3, mat3x3)
end

# CPU fallback: update each element of `arrayvec3` in place using the supplied
# multiplication variant `fmul`.
function compute(fmul, arrayvec3::Array, mat3x3)
    @inbounds for i in eachindex(arrayvec3)
        arrayvec3[i] = compute_element(fmul, arrayvec3, arrayvec3[i], mat3x3)
    end
end

# Benchmark both fmul variants on CUDA and oneAPI, dumping the generated
# device code for each so inlining can be inspected offline.
function main()
    mat3x3 = rand(SMatrix{3,3,Float32})
    # Dual numbers with one partial derivative, as produced by ForwardDiff.
    mat3x3dual = SMatrix{3,3,Zygote.ForwardDiff.Dual{Nothing,Float32,1}}(mat3x3)

    arrayvec3 = rand(Float32, 10000)
    arrayvec3dual = Array{Zygote.ForwardDiff.Dual{Nothing,Float32,1}}(arrayvec3)

    # `let` rebinds the names to device arrays without clobbering the host copies.
    let arrayvec3 = CUDA.CuArray(arrayvec3), arrayvec3dual = CUDA.CuArray(arrayvec3dual)
        println("CUDA.jl")
        # Dump the compiled device code for each variant to compare inlining.
        CUDA.@device_code dir="/tmp/cuda_inline" compute(fmul_1, arrayvec3dual, mat3x3dual)
        CUDA.@device_code dir="/tmp/cuda_normal" compute(fmul_2, arrayvec3dual, mat3x3dual)
        # Array(...) forces a device->host copy, synchronizing so kernel time
        # is included in the @btime measurement.
        @btime (compute($fmul_1, $arrayvec3dual, $mat3x3dual); ret = Array($arrayvec3dual) )
        @btime (compute($fmul_2, $arrayvec3dual, $mat3x3dual); ret = Array($arrayvec3dual) )
    end

    println()

    let arrayvec3 = oneAPI.oneArray(arrayvec3), arrayvec3dual = oneAPI.oneArray(arrayvec3dual)
        println("oneAPI.jl")
        oneAPI.@device_code dir="/tmp/oneapi_inline" compute(fmul_1, arrayvec3dual, mat3x3dual)
        oneAPI.@device_code dir="/tmp/oneapi_normal" compute(fmul_2, arrayvec3dual, mat3x3dual)
        @btime (compute($fmul_1, $arrayvec3dual, $mat3x3dual); ret = Array($arrayvec3dual) )
        @btime (compute($fmul_2, $arrayvec3dual, $mat3x3dual); ret = Array($arrayvec3dual) )
    end

    return
end

# Run the benchmark when executed as a script, but not when include()d into
# an interactive session.
isinteractive() || main()

One possibility is that Intel’s SPIR-V compiler decides to inline the function afterwards, since the IR is quite a bit higher level than CUDA’s PTX.

1 Like

Um, I thought that oneAPI would have to have everything inlined, or else it wouldn’t work.
I was assuming it was inlining automatically.

In my system, removing @inline from the @inline function compute_element ... line simply crashes the system. Probably that’s a bug worth reporting.