Hi,
I use the following kernel (which artificially adds nin*(nin+1)/2 to each array element) to evaluate the peak performance of my GPU:
# Compute-bound kernel for large enough values of nin
function kernel_compute_bound!(a, nin)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    for i = index:stride:length(a)
        f1 = a[i] / a[i]  # 1 (assuming a[i] != 0)
        f2 = f1 * f1      # 1
        s = a[i] - a[i]   # 0
        f = a[i] - a[i]   # 0
        for j ∈ 1:nin
            f = f * f1 + f1  # fma (2 flops)
            s = f * f2 + s   # fma (2 flops)
        end
        a[i] += s  # a[i] += nin*(nin+1)/2
    end
    return nothing  # mandatory return for kernels!!
end
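For reference, here is a minimal CPU-side sketch of the per-element arithmetic (assuming a[i] != 0, so f1 == 1 and f2 == 1), showing why each element increases by nin*(nin+1)/2:
let nin = 8
    f1 = 1.0f0; f2 = 1.0f0
    s = 0.0f0; f = 0.0f0
    for j in 1:nin
        f = f * f1 + f1  # f counts up: 1, 2, ..., nin
        s = f * f2 + s   # s accumulates 1 + 2 + ... + nin
    end
    @assert s == nin * (nin + 1) / 2  # 36 for nin = 8
end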
Everything looks fine and I can check that the computations produce the expected results, but… the top performance computed with the following formula (3.5 TFLOPS) exceeds the peak performance of my GPU (3.2 TFLOPS for Float32 on a laptop GTX 1650)…
gflops_cpu_bound(t,n::Int,nin) = n*4*nin/(t*1.e9) #2 fma (4 flops) per iter
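To make the formula concrete, a worked example with illustrative numbers (n matches the MWE default below; nin and t are hypothetical values, not measurements):
n, nin = 1024*1024*4, 1024     # array size and inner iteration count
t = 4.9e-3                     # hypothetical kernel time in seconds
flops = n * 4 * nin            # ≈ 1.72e10 floating-point operations
gflops = flops / (t * 1e9)     # ≈ 3506 GFLOPS ≈ 3.5 TFLOPS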
I did use CUDA's synchronization for the time measurement:
function sync_launch_kernel(a_gpu, nthreads, nblocks, gpu_kernel!)
    CUDA.@sync begin
        @cuda blocks=nblocks threads=nthreads gpu_kernel!(a_gpu)
    end
end
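Since this helper calls gpu_kernel!(a_gpu) with a single argument, an extra parameter like nin has to be captured in a closure. A hypothetical invocation (assuming a_gpu and nin are already in scope) would look like:
implem!(a) = sync_launch_kernel(a, 256, 512, x -> kernel_compute_bound!(x, nin))  # nthreads=256, nblocks=512
t = @elapsed implem!(a_gpu)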
I also tried two different timing methods, which give the same results:
function evaluate_best_time(nmeasurements, a, implem)
    timings = zeros(nmeasurements)
    for i in 1:nmeasurements
        timings[i] = @elapsed implem(a)
    end
    minimum(timings)
    # @belapsed $implem($a)  # same problem with this (requires BenchmarkTools)
end
Although I am not very confident in my understanding of them, I think the LLVM IR and SASS output indicate that the compiler does not optimize away the 4 flops per iteration:
@device_code_llvm @cuda blocks = 256 threads = 32 kernel_compute_bound!(a, nin)
@ /home/formation6/Projects/FormationGPU/cuda_julia/CUDAPractice.jl/standalone.jl:58 within `kernel_compute_bound!'
; ┌ @ float.jl:331 within `*'
%58 = fmul float %37, %value_phi8.i.us.epil
; └
; ┌ @ float.jl:325 within `+'
%59 = fadd float %37, %58
; └
; @ /home/formation6/Projects/FormationGPU/cuda_julia/CUDAPractice.jl/standalone.jl:59 within `kernel_compute_bound!'
; ┌ @ float.jl:331 within `*'
%60 = fmul float %38, %59
; └
; ┌ @ float.jl:325 within `+'
%61 = fadd float %value_phi9.i.us.epil, %60
@device_code_sass @cuda blocks = 256 threads = 32 kernel_compute_bound!(a, nin)
.L_422:
FFMA R9, R4, R9, R4 ;
IADD3 R5, P1, R5, 0x10, RZ ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4.reuse ;
IMAD.X R3, RZ, RZ, R3, P1 ;
ISETP.GE.U32.AND P1, PT, R5, -0xc, PT ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
ISETP.GE.AND.EX P1, PT, R3, -0x1, PT, P1 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9, R7 ;
@!P1 BRA `(.L_422) ;
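Counting instructions in the unrolled loop body above seems consistent with the 4-flops-per-iteration assumption:
# Back-of-envelope check on the SASS loop body:
iters_per_body = 16                        # IADD3 ... 0x10 advances the loop counter by 16
ffma_per_body  = 32                        # FFMA instructions in the body
flops_per_iter = 2 * ffma_per_body ÷ iters_per_body  # = 4, matching gflops_cpu_bound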
Here is the MWE that one can copy/paste to reproduce these results (with an NVIDIA GPU):
MWE.jl
using CUDA
using DataFrames
using Plots
# using BenchmarkTools  # needed for the @belapsed alternative below
function evaluate_best_time(nmeasurements, a, implem)
    timings = zeros(nmeasurements)
    for i in 1:nmeasurements
        timings[i] = @elapsed implem(a)
    end
    minimum(timings)
    # @belapsed $implem($a)  # use this for more robust timings
end
function check_diff(a_cpu, a_gpu)
    b = Array(a_gpu)  # copy from GPU to CPU
    max_diff = zero(eltype(b))
    max_index = 1
    for i in 1:length(b)
        diff = abs(a_cpu[i] - b[i])
        if max_diff < diff
            max_diff = diff
            max_index = i
        end
    end
    if max_diff > 1.e-6
        println("max diff=", max_diff, " at index i=", max_index,
                " cpu[$max_index]=", a_cpu[max_index], " gpu[$max_index]=", b[max_index])
    end
    @assert max_diff < 1.e-6
end
function evaluate_performance(T, n, nmeasurements, cpu_implem!, gpu_implem!)
    a_cpu, a_gpu = initialize_cpugpu(T, n)
    check_diff(a_cpu, a_gpu)
    t_cpu = evaluate_best_time(nmeasurements, a_cpu, cpu_implem!)
    t_gpu = evaluate_best_time(nmeasurements, a_gpu, gpu_implem!)
    check_diff(a_cpu, a_gpu)
    t_cpu, t_gpu
end
function sync_launch_kernel(a_gpu, nthreads, nblocks, gpu_kernel!)
    CUDA.@sync begin
        @cuda blocks=nblocks threads=nthreads gpu_kernel!(a_gpu)
    end
end
# Compute-bound kernel that will not saturate the GPU memory
# bandwidth for large enough nin
function kernel_compute_bound!(a, nin)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    for i = index:stride:length(a)
        f1 = a[i] / a[i]  # 1
        f2 = f1 * f1      # 1
        s = a[i] - a[i]   # 0
        f = a[i] - a[i]   # 0
        for j ∈ 1:nin
            f = f * f1 + f1  # fma
            s = f * f2 + s   # fma
        end
        a[i] += s
    end
    return nothing  # mandatory return for kernels!!
end
gflops_cpu_bound(t, n::Int, nin) = n * 4 * nin / (t * 1.e9)  # 2 fma (4 flops) per inner iteration
gbs(t, T, n) = 2n * sizeof(T) / (t * 1.e9)                   # one read + one write per element
sum_of_n(n) = n * (n + 1) / 2                                # closed form of 1 + 2 + ... + n
# CPU reference: uses the closed form instead of the fma loop
function cpu_compute_bound!(a, nin)
    for i in 1:length(a)
        a[i] += sum_of_n(nin)
    end
    nothing
end
initialize_cpu(T, n) = [T(i) for i ∈ 1:n]
function initialize_cpugpu(T, n)
    a_cpu = initialize_cpu(T, n)
    a_gpu = CuArray(a_cpu)
    a_cpu, a_gpu
end
function plotgflops_nin(dfg)
    theme(:dark)
    nthreads = Int(dfg[1, :nthreads])
    nblocks = Int(dfg[1, :nblocks])
    array_size = Int(dfg[1, :array_size])
    title = "nthreads=$nthreads nblocks=$nblocks array_size=$array_size"
    label = ""
    yp2 = Int.(round.(log2.(dfg[!, :GFLOPs])))
    ypextrema = (max(0, minimum(yp2)), maximum(yp2))
    yticks = [2^p for p ∈ ypextrema[1]:ypextrema[2]]
    maxy = Int(round(maximum(dfg[!, :GFLOPs])))
    push!(yticks, maxy)
    yticks = sort(yticks)
    p = plot(dfg[!, :NIN], dfg[!, :GFLOPs], label=label, title=title,
             xticks=(dfg[!, :NIN], Int.(dfg[!, :NIN])), yticks=(yticks, string.(yticks)),
             xaxis=:log10, size=(800, 600), box=:on, legend=:outertopright,
             linewidth=3, xlabel="nin", markershape=:hexagon, markersize=4, ylabel="GFLOPs",
             yscale=:log10)
    p
end
# Test for max GFLOPS (peak performance) as a function of nin
function test(T=Float32, nmax=1024*1024*4, nmeasurements=100)
    n = nmax
    CUDA.allowscalar(false)
    df_gpu = DataFrame(nblocks=[], nthreads=[], array_size=Int[], min_time=Float64[],
                       GFLOPs=Float64[], GBs=Float64[], NIN=Int[])
    minpowernin, maxpowernin = (0, 10)
    nblocks, nthreads = (512, 256)
    for nin ∈ (2^p for p ∈ minpowernin:maxpowernin)
        @show nin
        function gpu_implem!(a_gpu)
            CUDA.synchronize()
            @cuda blocks=nblocks threads=nthreads kernel_compute_bound!(a_gpu, nin)
            CUDA.synchronize()
        end
        cpu_implem!(a_cpu) = cpu_compute_bound!(a_cpu, nin)
        t_cpu, t_gpu = evaluate_performance(T, n, nmeasurements, cpu_implem!, gpu_implem!)
        push!(df_gpu, [nblocks, nthreads, n, t_gpu, gflops_cpu_bound(t_gpu, n, nin), gbs(t_gpu, T, n), nin])
    end
    df_gpu
end
df_gpu = test()
@show df_gpu
plotgflops_nin(df_gpu)
I am stuck and cannot find my mistake. Any ideas?