using CUDA
using BenchmarkTools
using Printf

function benchmark_floating_point(T::Type, size=2048)
    aligned_size = div(size, 8) * 8
    A = CUDA.rand(T, (aligned_size, aligned_size))
    B = CUDA.rand(T, (aligned_size, aligned_size))
    C = similar(A)
    CUDA.@sync A * B                                   # warm-up: compilation and cuBLAS initialisation
    elapsed_time = @elapsed CUDA.@sync C = A * B
    operations = 2 * aligned_size^3                    # total flops in one n x n GEMM
    flops = operations / elapsed_time / 1e12           # TFLOPS
    data_transferred = 3 * sizeof(T) * aligned_size^2 / 1e9  # GB (read A and B, write C)
    bandwidth = data_transferred / elapsed_time        # GB/s
    return (elapsed_time, flops, bandwidth)
end

function benchmark_tensor_core(size=2048)
    aligned_size = div(size, 8) * 8
    A = CUDA.rand(Float16, (aligned_size, aligned_size))
    B = CUDA.rand(Float16, (aligned_size, aligned_size))
    C = CUDA.zeros(Float32, (aligned_size, aligned_size))
    CUDA.@sync A * B                                   # warm-up
    elapsed_time = @elapsed CUDA.@sync C = A * B
    operations = 2 * aligned_size^3
    flops = operations / elapsed_time / 1e12
    return (elapsed_time, flops)
end

function main()
    sizes = [256, 512, 1024, 2048]
    precisions = [Float16, Float32, Float64]
    println("RTX 3080 Ti FP performance (Julia + CUDA.jl)")
    println("=============================================")
    # Test native precisions
    for T in precisions
        println("\n[FP] ", T)
        for size in sizes
            time, flops, bw = benchmark_floating_point(T, size)
            @printf("Size: %4d x %4d | Time: %.4f s | TFLOPS: %6.2f | Bandwidth: %6.1f GB/s\n",
                    size, size, time, flops, bw)
        end
    end
    println("\n[Tensor Core]")
    ENV["JULIA_CUDA_USE_TENSOR_CORES"] = "1"
    for size in sizes
        time, flops = benchmark_tensor_core(size)
        @printf("Size: %4d x %4d | Time: %.4f s | TFLOPS: %6.2f (Tensor Core)\n",
                size, size, time, flops)
    end
end

main()
Output:

RTX 3080 Ti FP performance (Julia + CUDA.jl)
=============================================
[FP] Float16
Size: 256 x 256 | Time: 0.0000 s | TFLOPS: 1.28 | Bandwidth: 15.0 GB/s
Size: 512 x 512 | Time: 0.0000 s | TFLOPS: 10.09 | Bandwidth: 59.1 GB/s
Size: 1024 x 1024 | Time: 0.0000 s | TFLOPS: 52.76 | Bandwidth: 154.6 GB/s
Size: 2048 x 2048 | Time: 0.0002 s | TFLOPS: 87.74 | Bandwidth: 128.5 GB/s
[FP] Float32
Size: 256 x 256 | Time: 0.0000 s | TFLOPS: 1.23 | Bandwidth: 28.9 GB/s
Size: 512 x 512 | Time: 0.0000 s | TFLOPS: 6.74 | Bandwidth: 79.0 GB/s
Size: 1024 x 1024 | Time: 0.0001 s | TFLOPS: 14.82 | Bandwidth: 86.8 GB/s
Size: 2048 x 2048 | Time: 0.0008 s | TFLOPS: 21.84 | Bandwidth: 64.0 GB/s
[FP] Float64
Size: 256 x 256 | Time: 0.0001 s | TFLOPS: 0.27 | Bandwidth: 12.5 GB/s
Size: 512 x 512 | Time: 0.0007 s | TFLOPS: 0.37 | Bandwidth: 8.6 GB/s
Size: 1024 x 1024 | Time: 0.0050 s | TFLOPS: 0.43 | Bandwidth: 5.1 GB/s
Size: 2048 x 2048 | Time: 0.0367 s | TFLOPS: 0.47 | Bandwidth: 2.7 GB/s
[Tensor Core]
Size: 256 x 256 | Time: 0.0000 s | TFLOPS: 1.46 (Tensor Core)
Size: 512 x 512 | Time: 0.0000 s | TFLOPS: 8.66 (Tensor Core)
Size: 1024 x 1024 | Time: 0.0001 s | TFLOPS: 21.80 (Tensor Core)
Size: 2048 x 2048 | Time: 0.0002 s | TFLOPS: 74.70 (Tensor Core)
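For reference, the TFLOPS values above follow directly from the 2·n^3 operation count used in the script. A quick back-of-the-envelope check against the Float16 2048 x 2048 row (the 0.0002 s shown is rounded, so it only roughly reproduces the reported 87.74 TFLOPS):

n = 2048
ops = 2 * n^3            # ≈ 1.72e10 floating-point operations for one n x n GEMM
t = 0.0002               # seconds, rounded timing from the Float16 row above
ops / t / 1e12           # ≈ 85.9 TFLOPS, in the ballpark of the reported 87.74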
eldee (February 2, 2025, 3:55pm):
I’m no expert, but I’d imagine that for the first eight values AIDA64 uses benchmarks which are either completely memory-bound or completely compute-bound. Matrix multiplication does not seem an appropriate test for either. Also note that AIDA64’s results will still not match the (official) theoretical peak values.
Also, in the timed line C = A * B the declared C matrix is not used. Instead A * B creates a new matrix, which is then also bound to the name C. You should use something like mul!(C, A, B) from LinearAlgebra.jl.
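For illustration, a minimal sketch of such a timing loop, assuming a 2048 x 2048 Float32 problem (CUDA.@elapsed times the GPU via events, and mul! writes into the preallocated C rather than allocating a new matrix):

using CUDA, LinearAlgebra

A = CUDA.rand(Float32, 2048, 2048)
B = CUDA.rand(Float32, 2048, 2048)
C = similar(A)

mul!(C, A, B)                        # warm-up: compilation and cuBLAS initialisation
t = CUDA.@elapsed mul!(C, A, B)      # in-place GEMM, result stored in C
tflops = 2 * 2048^3 / t / 1e12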
I’m not convinced this is fully (or sufficiently) accurate. You’re probably better off testing something more straightforward than (optimised) matrix multiplication.
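As one possible example of a more straightforward, purely memory-bound test (a hypothetical sketch, not what AIDA64 actually runs), you could time a device-to-device copy and count the bytes read plus written:

using CUDA

n   = 256 * 2^20 ÷ sizeof(Float32)              # elements for a ~256 MiB buffer
src = CUDA.rand(Float32, n)
dst = similar(src)

copyto!(dst, src)                               # warm-up
t = CUDA.@elapsed copyto!(dst, src)
bandwidth = 2 * sizeof(Float32) * n / t / 1e9   # GB/s, one read plus one write per element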