Why is my GPU kernel an order of magnitude slower than my CPU function?

Hi All,

I’m relatively new to Julia and I’ve been trying to benchmark a nested-loop function over 2 arrays, on both the CPU and GPU, however, the CPU version is significantly quicker.

Now, I’m sure that this is due to my inexperience with Julia and writing GPU kernel directly. So, I’d like to ask the Julia community what I’m doing wrong?

I’ve attached a minimal reproducible example script at the bottom of this topic.

I’ve ran a quick benchmark as a function of the size of the arrays, from 10 through 1000, and the walltime of the CPU function is significantly faster than the GPU (near an order of magnitude in some cases).

| Size | CPU walltime | CUDA walltime | CPU Allocations | CUDA Allocations |
| 10 | 10.197 μs | 306.622 μs | 64 (6.88 KiB) | 199 (4.97KiB) |
| 100 | 5.191 ms | 155.692 ms | 64 (6.88 KiB) | 200 (4.98KiB) |
| 1000 | | 118.089 s | 872.761 s | 64 (6.88KiB) | 307 (9.58 KiB) |

Any help is appreciated! Thanks

Here’s the minimal reproducible example,

using Random
using LinearAlgebra
using ProgressMeter
using BenchmarkTools
using CUDA

# numerical constants for our benchmark here
const MAX_INT = 2^12 # max integer in our array

@inline function lookup_cpu(bit::Int64, bitword::Vector{Int64}, coeffs::Vector{Float64})
    insertion_index = searchsortedfirst(bitword, bit)
    # Return 0.0 if index is out of bounds or if the state doesn't match
    if insertion_index > size(bitword,1) || bitword[insertion_index] != bit
        return 0.0
    end
    return coeffs[insertion_index]
end

@inline function calc_multi_factor(i::Int64, j::Int64, k::Int64, l::Int64)::Float64
    """ Function to compute occurance of each 4-index tuple,
        i.e. compute once times by factor """
    if i == j && j == k && k == l
        return 1.0
    elseif (i == j && j == k) || (i == j && j == l) || (i == k && k == l) || (j == k && k == l)
        return 4.0
    elseif (i == j && k == l) || (i == k && j == l) || (i == l && j == k)
        return 6.0
    elseif (i == j) || (i == k) || (i == l) || (j == k) || (j == l) || (k == l)
        return 12.0
    else
        return 24.0
    end
end

@inline function cpu_function(bitword::Vector{Int64}, coeffs::Vector{Float64}) 
    N_states = size(bitword,1)
    n_threads = Threads.nthreads()
    chunk_size = ceil(Int, N_states/n_threads)
    thread_totals = zeros(Float64, n_threads)
        
    Threads.@threads for thread_id in 1:n_threads # separate the outer loop across all threads
        # set a subset of indices in `i` loop to each available thread
        start_i = (thread_id-1)*chunk_size + 1
        end_i = min(thread_id*chunk_size, N_states)
        thread_total = 0.0 # local thread accumulator
        
        for i in start_i:end_i
            for j in i:N_states
                for k in j:N_states 
                    b123 =  bitword[i] ⊻ bitword[j] ⊻ bitword[k]
                    coeff1 = lookup_cpu(b123, bitword, coeffs)
                    if coeff1 == 0.0
                        continue # if coeff is 0. skip rest of calc
                    end 
                    base_coeff = coeffs[i] * coeffs[j] * coeffs[k]
                    for l in k:N_states
                        mult_factor = calc_multi_factor(i,j,k,l)
                        state2 = bitword[i] ⊻ bitword[k] ⊻ bitword[l]
                        coeff2 = lookup_cpu(state2, bitword, coeffs)
                        if coeff2 == 0.0
                            continue
                        end
                        state3 = bitword[j] ⊻ bitword[k] ⊻ bitword[l]
                        coeff3 = lookup_cpu(state3, bitword, coeffs)
                        if coeff3 == 0.0
                            continue
                        end
                        state4 = bitword[i] ⊻ bitword[j] ⊻ bitword[l]
                        coeff4 = lookup_cpu(state4, bitword, coeffs)
                        if coeff4 == 0.0
                            continue
                        end
                        thread_total += mult_factor * base_coeff * coeffs[l] * 
                                          coeff1 * coeff2 * coeff3 * coeff4
                    end
                end
            end
        end
        thread_totals[thread_id] = thread_total  # Store result for this thread
    end
    return -log2(sum(thread_totals))  # Combine results from all threads
end


@inline function lookup_gpu(basis::Int64, bitword::CuDeviceVector{Int64}, coeffs::CuDeviceVector{Float64}, n_configs::Int64)::Float64
     # binary tree 
    @inbounds begin 
        low = 1
        high = n_configs

        while low <= high
            mid = (low + high) >>> 1 
            mid_val = bitword[mid]
            if mid_val < basis
                low = mid + 1
            elseif mid_val > basis
                high = mid - 1
            else
                return coeffs[mid]
            end
        end
    end 
    return 0.0
end

function gpu_kernel(
    bitword::CuDeviceVector{Int64},
    coeffs::CuDeviceVector{Float64},
    N_states::Int,
    result::CuDeviceVector{Float64}
)
    tid = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
    total_threads = blockDim().x * gridDim().x

    acc = 0.0f0

    @inbounds begin
        for i in tid+1:total_threads:N_states
            bi = bitword[i]
            ci = coeffs[i]
            for j in i:N_states
                bj = bitword[j]
                cj = coeffs[j]
                for k in j:N_states
                    bk = bitword[k]
                    ck = coeffs[k]

                    b123 = bi ⊻ bj ⊻ bk
                    coeff1 = lookup_gpu(b123, bitword, coeffs, N_states)
                    if coeff1 == 0.0f0
                        continue
                    end

                    base = ci * cj * ck
                    ik = bi ⊻ bk
                    jk = bj ⊻ bk
                    ij = bi ⊻ bj

                    for l in k:N_states
                        bl = bitword[l]
                        cl = coeffs[l]

                        coeff2 = lookup_gpu(ik ⊻ bl, bitword, coeffs, N_states)
                        if coeff2 == 0.0f0; continue; end

                        coeff3 = lookup_gpu(jk ⊻ bl, bitword, coeffs, N_states)
                        if coeff3 == 0.0f0; continue; end

                        coeff4 = lookup_gpu(ij ⊻ bl, bitword, coeffs, N_states)
                        if coeff4 == 0.0f0; continue; end

                        acc += calc_multi_factor(i,j,k,l) * base * cl * coeff1 * coeff2 * coeff3 * coeff4
                    end
                end
            end
        end
    end

    result[tid+1] = acc  # 1-based in Julia
    return
end

function gpu_function(bitword::Vector{Int64}, coeffs::Vector{Float64})
    N::Int64 = size(bitword,1)

    d_bitword = CuArray(bitword)
    d_coeffs = CuArray(coeffs)

    # Create dummy kernel to get launch configurations
    dummy_kernel = @cuda launch=false gpu_kernel(
        d_bitword, 
        d_coeffs, 
        N, 
        CUDA.zeros(Float64, 1)
    )
    # get number of threads/blocks from dummy kernel
    config = launch_configuration(dummy_kernel.fun)
    threads = min(N, config.threads)
    blocks = cld(N, threads)

    d_result = CUDA.zeros(Float64, threads * blocks)
    @cuda threads=threads blocks=blocks gpu_kernel(
            d_bitword, d_coeffs, N, d_result
    )

    total = sum(d_result)
    value = -log2(total)
    return value
end

# Our script begins here
NITEMS::Int64 = 100

bitword::Vector{Int64} = rand(1:MAX_INT,NITEMS) # array of integers representing binary strings

coeffs::Vector{Float64} = randn(NITEMS) # coeffs for each item in `bitword`
coeffs ./= norm(coeffs) # normalize to 1

sorted_indices = sortperm(bitword) # they need to be sorted for searchsortedfirst to work
bitword = bitword[sorted_indices]
coeffs = coeffs[sorted_indices]

# CPU code
results = @btime cpu_function($bitword, $coeffs)
println("[CPU] Size: ",size(bitword,1)," | value: ", results)

# GPU code
results_gpu = @btime gpu_function(bitword, coeffs)
println("[CUDA] Size: ",size(bitword,1)," | value: ", results_gpu)

The launch latency of CUDA kernels ranges from tens of microseconds to hundreds of microseconds. This is the price that must be paid when using a GPU. Therefore,

  • Increase the arithmetic intensity in order for the GPU to be able to hide the latency of memory accesses. Performance Tips

There is also a cost associated with copying data from the CPU to the GPU, which can only be offset when the amount of computation is substantial. Given that your data size is only between 10 and 1,000, this result is reasonable. Increase the size to over 2e6 and you will see the advantages of the GPU. Otherwise, the CPU is faster.

Double-precision floating-point calculations on GPUs are two to four times slower than single-precision calculations.

  • Use 32 bit types like Float32 and Int32 instead of 64 bit types like Float64 and Int/Int64
5 Likes

Hi @karei, thanks for the prompt response!

So it seems that performance slowdown is expected for such small amounts of data, and the GPU should only beat the CPU at very large array sizes.

Thanks for sharing the performance tips page, I’ll give it a read!

1 Like

Is this still true? I think newer GPUs don’t have this limitation any more…

Newer GPUs are much worse at this. A 5090 has 1:64 fp64 performance compared to fp32

I used AI to generate and did a rough benchmark test. This is for reference only and is not very rigorous.

Benchmark code
using LinearAlgebra, BenchmarkTools, Printf
using CUDA

function benchmark_matmul_cpu(n::Integer)
    A = rand(Float64, n, n)
    B = rand(Float64, n, n)
    t = @belapsed $A * $B
    return t
end

function benchmark_matmul_gpu(n::Integer, T::Type{<:Union{Float32,Float64}})
    A_gpu = CUDA.rand(T, n, n)
    B_gpu = CUDA.rand(T, n, n)
    # warm-up, JIT
    C_gpu = A_gpu * B_gpu
    CUDA.synchronize()

    t = @belapsed begin
        C_gpu = $A_gpu * $B_gpu
        CUDA.synchronize()
    end
    return t
end

function run_benchmarks(sizes::Vector{<:Integer})
    println("===========================================================================")
    println(rpad("Size", 8), rpad("CPU (Float64)", 20), rpad("GPU (Float32)", 20), rpad("GPU (Float64)", 20))
    println("---------------------------------------------------------------------------")
    for n in sizes
        # 1) CPU Float64
        cpu_time = benchmark_matmul_cpu(n)
        # 2) GPU Float32
        gpu_fp32_time = benchmark_matmul_gpu(n, Float32)
        # 3) GPU Float64
        gpu_fp64_time = benchmark_matmul_gpu(n, Float64)

        @printf("%-8d  %-18.6e  %-18.6e  %-18.6e\n", n, cpu_time, gpu_fp32_time, gpu_fp64_time)
    end
    println("===========================================================================")
end

sizes = (10, 100, 2^8, 2^10, 2^12, 2^14)
if !CUDA.has_cuda()
    @error "No available CUDA GPU was detected."
    return
end
println("Start testing the performance of matrix multiplication on CPU/GPU:")
run_benchmarks(sizes)

##
using PlotlyLight
xs = sizes .^ 2
# cpu_time = [9.694415e-08, 2.450000e-05, 1.437000e-04,
#     4.696800e-03, 1.689109e-01, 9.036293e+00]
# gpu_fp32_time = [3.760000e-05, 4.460000e-05, 3.950000e-05,
#     1.562000e-04, 5.876900e-03, 3.957396e-01]
# gpu_fp64_time = [4.210000e-05, 6.630000e-05, 1.096000e-04,
#     5.041500e-03, 3.099891e-01, 1.970480e+01]

trace_cpu = Config(x=xs, y=cpu_time, mode="lines+markers", name="CPU (Float64)", line=(width=2, color=:blue))

trace_gpu32 = Config(x=xs, y=gpu_fp32_time, mode="lines+markers", name="GPU (Float32)", line=(width=2, color=:red))

trace_gpu64 = Config(x=xs, y=gpu_fp64_time, mode="lines+markers", name="GPU (Float64)", line=(width=2, color=:orange))

layout = Config(
    title="NVIDIA GeForce RTX 3080",
    xaxis=Config(
        title     = "Data size",
        type      = "log",
        autorange = true,
        tickmode  = "array",
        tickvals  = xs,
    ),
    yaxis=Config(
        title="Time (s)",
        type="log",
        autorange=true,
    ),
    legend=Config(x=0.07, y=0.95),
    margin=Config(l=60, r=20, t=60, b=60),
)
fig = Plot([trace_cpu, trace_gpu32, trace_gpu64], layout)
display(fig)

I’m not sure, but I only have a 3080 on hand for your reference.

1 Like

I tried to run the code on a RTX 5090Ti (Laptop) and the results are quite similar

1 Like

Intel is the notable exception here. they have 1:8 fp64

Nvidia H series has 1:2 fp32:fp64 performance comparison, but lower total fp32 flops than regular 5000 serries or pro 6000 series (also uber expensive).