Why the Floating-Point Calculation Efficiency of CUDA.jl Does Not Reach the Official Theoretical Value

using CUDA
using BenchmarkTools
using Printf

function benchmark_floating_point(T::Type, size=2048)
    aligned_size = div(size, 8) * 8
    A = CUDA.rand(T, (aligned_size, aligned_size))
    B = CUDA.rand(T, (aligned_size, aligned_size))
    C = similar(A)

    CUDA.@sync A * B  # warm-up: compile kernels and initialise cuBLAS before timing

    elapsed_time = @elapsed CUDA.@sync C = A * B

    operations = 2 * aligned_size^3
    flops = operations / elapsed_time / 1e12  # TFLOPS
    data_transferred = 3 * sizeof(T) * aligned_size^2 / 1e9  # GB
    bandwidth = data_transferred / elapsed_time  # GB/s

    return (elapsed_time, flops, bandwidth)
end

function benchmark_tensor_core(size=2048)
    aligned_size = div(size, 8) * 8

    A = CUDA.rand(Float16, (aligned_size, aligned_size))
    B = CUDA.rand(Float16, (aligned_size, aligned_size))
    C = CUDA.zeros(Float32, (aligned_size, aligned_size))

    CUDA.@sync A * B  # warm-up, as above

    elapsed_time = @elapsed CUDA.@sync C = A * B

    operations = 2 * aligned_size^3
    flops = operations / elapsed_time / 1e12
    return (elapsed_time, flops)
end

function main()
    sizes = [256, 512, 1024, 2048]
    precisions = [Float16, Float32, Float64]

    println("RTX 3080 Ti FP performance (Julia + CUDA.jl)")
    println("=============================================")

    # Test each native precision
    for T in precisions
        println("\n[FP] ", T)
        for size in sizes
            time, flops, bw = benchmark_floating_point(T, size)
            @printf("Size: %4d x %4d | Time: %.4f s | TFLOPS: %6.2f | Bandwidth: %6.1f GB/s\n",
                    size, size, time, flops, bw)
        end
    end

    println("\n[Tensor Core]")
    ENV["JULIA_CUDA_USE_TENSOR_CORES"] = "1"  
    for size in sizes
        time, flops = benchmark_tensor_core(size)
        @printf("Size: %4d x %4d | Time: %.4f s | TFLOPS: %6.2f (Tensor Core)\n",
                size, size, time, flops)
    end
end

main()
Output:

RTX 3080 Ti FP (Julia + CUDA.jl)
=============================================

Float16
Size:  256 x  256 | Time: 0.0000 s | TFLOPS:   1.28 | Bandwidth:   15.0 GB/s
Size:  512 x  512 | Time: 0.0000 s | TFLOPS:  10.09 | Bandwidth:   59.1 GB/s
Size: 1024 x 1024 | Time: 0.0000 s | TFLOPS:  52.76 | Bandwidth:  154.6 GB/s
Size: 2048 x 2048 | Time: 0.0002 s | TFLOPS:  87.74 | Bandwidth:  128.5 GB/s

Float32
Size:  256 x  256 | Time: 0.0000 s | TFLOPS:   1.23 | Bandwidth:   28.9 GB/s
Size:  512 x  512 | Time: 0.0000 s | TFLOPS:   6.74 | Bandwidth:   79.0 GB/s
Size: 1024 x 1024 | Time: 0.0001 s | TFLOPS:  14.82 | Bandwidth:   86.8 GB/s
Size: 2048 x 2048 | Time: 0.0008 s | TFLOPS:  21.84 | Bandwidth:   64.0 GB/s

Float64
Size:  256 x  256 | Time: 0.0001 s | TFLOPS:   0.27 | Bandwidth:   12.5 GB/s
Size:  512 x  512 | Time: 0.0007 s | TFLOPS:   0.37 | Bandwidth:    8.6 GB/s
Size: 1024 x 1024 | Time: 0.0050 s | TFLOPS:   0.43 | Bandwidth:    5.1 GB/s
Size: 2048 x 2048 | Time: 0.0367 s | TFLOPS:   0.47 | Bandwidth:    2.7 GB/s

[Tensor Core]
Size:  256 x  256 | Time: 0.0000 s | TFLOPS:   1.46 (Tensor Core)
Size:  512 x  512 | Time: 0.0000 s | TFLOPS:   8.66 (Tensor Core)
Size: 1024 x 1024 | Time: 0.0001 s | TFLOPS:  21.80 (Tensor Core)
Size: 2048 x 2048 | Time: 0.0002 s | TFLOPS:  74.70 (Tensor Core)

I’m no expert, but I’d imagine that for the first eight values AIDA64 uses benchmarks which are either completely memory-bound or completely compute-bound. Matrix multiplication does not seem an appropriate test for either. Note also that AIDA64’s results will still not match the (official) theoretical values.
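For a concrete sense of what the "official theoretical value" is, here is a rough back-of-the-envelope calculation for the RTX 3080 Ti. The core count, boost clock, and 1/64 FP64 ratio below are NVIDIA's published spec figures; real boost clocks vary with load and temperature, so treat the results as ballpark numbers rather than a hard ceiling.

```julia
# Rough theoretical peak for an RTX 3080 Ti (spec-sheet numbers, approximate).
cuda_cores  = 10_240      # FP32 CUDA cores
boost_clock = 1.665e9     # advertised boost clock in Hz

# One fused multiply-add per core per cycle counts as two floating-point operations.
fp32_peak = 2 * cuda_cores * boost_clock / 1e12   # ≈ 34.1 TFLOPS
fp64_peak = fp32_peak / 64                        # GeForce Ampere runs FP64 at 1/64 rate, ≈ 0.53 TFLOPS

println("FP32 peak ≈ ", round(fp32_peak, digits = 1), " TFLOPS")
println("FP64 peak ≈ ", round(fp64_peak, digits = 2), " TFLOPS")
```

By that yardstick the Float64 result above (≈ 0.47 TFLOPS at 2048 × 2048) is already close to the hardware limit, while the Float32 and Float16 numbers sit further below peak, which is what you would expect when a single @elapsed of a moderately sized GEMM is doing duty as both a compute and a bandwidth benchmark.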

Also, in the benchmark functions the declared C matrix is not used: A * B creates a new matrix, and the assignment merely rebinds the name C to it. You should use something like mul!(C, A, B) from LinearAlgebra.jl instead.
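As a minimal sketch of what that change could look like (assuming Float32 matrices at the 2048 size from the script, and leaning on BenchmarkTools, which it already loads, instead of a single @elapsed):

```julia
using CUDA, LinearAlgebra, BenchmarkTools

n = 2048
A = CUDA.rand(Float32, n, n)
B = CUDA.rand(Float32, n, n)
C = similar(A)

CUDA.@sync mul!(C, A, B)        # warm up: compilation and cuBLAS initialisation

# In-place GEMM: writes into the preallocated C instead of allocating a new matrix.
t = @belapsed CUDA.@sync mul!($C, $A, $B)

tflops = 2 * n^3 / t / 1e12
println("≈ ", round(tflops, digits = 1), " TFLOPS")
```

Besides avoiding the extra allocation, @belapsed runs the multiplication many times and reports the minimum, so one-off launch overheads don't dominate the small sizes.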

I’m not convinced this benchmark is fully (or sufficiently) accurate as it stands. You’re probably better off testing something more straightforward than (optimised) matrix multiplication.
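For instance, a purely memory-bound broadcast over large vectors gives a much cleaner number to compare against the card's spec bandwidth (about 912 GB/s for a 3080 Ti). The helper name, vector length, and scaling factor below are just illustrative choices:

```julia
using CUDA, BenchmarkTools

# Each element reads x and y and writes y: 3 * 4 bytes of traffic per element.
function stream_axpy!(y, x)
    CUDA.@sync y .+= 2f0 .* x
end

n = 2^27                        # ~512 MiB per Float32 vector
x = CUDA.rand(Float32, n)
y = CUDA.rand(Float32, n)

stream_axpy!(y, x)              # warm up
t = @belapsed stream_axpy!($y, $x)

gbps = 3 * sizeof(Float32) * n / t / 1e9
println("effective bandwidth ≈ ", round(gbps, digits = 0), " GB/s")
```

A compute-bound counterpart (e.g. a kernel doing many fused multiply-adds per element) can be built the same way, and together the two give a much better idea of what the hardware can actually deliver.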
