Hi,
I use the following kernel (which artificially adds nin*(nin+1)/2 to each array element) to evaluate the peak performance of my GPU:
# Compute-bound kernel for large enough values of nin
function kernel_compute_bound!(a, nin)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    for i = index:stride:length(a)
        f1 = a[i] / a[i]  # 1 (assuming a[i] != 0)
        f2 = f1 * f1      # 1
        s = a[i] - a[i]   # 0
        f = a[i] - a[i]   # 0
        for j ∈ 1:nin
            f = f * f1 + f1  # fma (2 flops)
            s = f * f2 + s   # fma (2 flops)
        end
        a[i] += s  # a[i] += nin*(nin+1)/2
    end
    return nothing  # mandatory return for kernels!!
end
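For reference, here is a minimal CPU-side sketch of the per-element arithmetic (assuming a[i] != 0, so f1 == 1 and f2 == 1), showing why each element increases by nin*(nin+1)/2:
let nin = 8
    f1 = 1.0f0; f2 = 1.0f0
    s = 0.0f0; f = 0.0f0
    for j in 1:nin
        f = f * f1 + f1  # f counts up: 1, 2, ..., nin
        s = f * f2 + s   # s accumulates 1 + 2 + ... + nin
    end
    @assert s == nin * (nin + 1) / 2  # 36 for nin = 8
end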
Everything looks fine and I can check that the computations produce the expected results, but… the top performance computed with the following formula (3.5 TFLOPS) exceeds the peak performance of my GPU (3.2 TFLOPS for Float32 on a laptop GTX 1650)…
gflops_cpu_bound(t,n::Int,nin) = n*4*nin/(t*1.e9) #2 fma (4 flops) per iter
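To make the formula concrete, a worked example with illustrative numbers (n matches the MWE default below; nin and t are hypothetical values, not measurements):
n, nin = 1024*1024*4, 1024     # array size and inner iteration count
t = 4.9e-3                     # hypothetical kernel time in seconds
flops = n * 4 * nin            # ≈ 1.72e10 floating-point operations
gflops = flops / (t * 1e9)     # ≈ 3506 GFLOPS ≈ 3.5 TFLOPS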
I did use CUDA's synchronization for the time measurement:
function sync_launch_kernel(a_gpu, nthreads, nblocks, gpu_kernel!)
    CUDA.@sync begin
        @cuda blocks=nblocks threads=nthreads gpu_kernel!(a_gpu)
    end
end
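Since this helper calls gpu_kernel!(a_gpu) with a single argument, an extra parameter like nin has to be captured in a closure. A hypothetical invocation (assuming a_gpu and nin are already in scope) would look like:
implem!(a) = sync_launch_kernel(a, 256, 512, x -> kernel_compute_bound!(x, nin))  # nthreads=256, nblocks=512
t = @elapsed implem!(a_gpu)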
I also tried two different timing methods, which give the same results:
function evaluate_best_time(nmeasurements, a, implem)
    timings = zeros(nmeasurements)
    for i in 1:nmeasurements
        timings[i] = @elapsed implem(a)
    end
    minimum(timings)
    # @belapsed $implem($a)  # same problem with this (requires BenchmarkTools)
end
Although I am not very confident in my understanding of them, I think the LLVM IR and SASS output indicate that the compiler does not optimize away the 4 flops per iteration:
@device_code_llvm @cuda blocks = 256 threads = 32 kernel_compute_bound!(a, nin)
@ /home/formation6/Projects/FormationGPU/cuda_julia/CUDAPractice.jl/standalone.jl:58 within `kernel_compute_bound!'
; ┌ @ float.jl:331 within `*'
%58 = fmul float %37, %value_phi8.i.us.epil
; └
; ┌ @ float.jl:325 within `+'
%59 = fadd float %37, %58
; └
; @ /home/formation6/Projects/FormationGPU/cuda_julia/CUDAPractice.jl/standalone.jl:59 within `kernel_compute_bound!'
; ┌ @ float.jl:331 within `*'
%60 = fmul float %38, %59
; └
; ┌ @ float.jl:325 within `+'
%61 = fadd float %value_phi9.i.us.epil, %60
@device_code_sass @cuda blocks = 256 threads = 32 kernel_compute_bound!(a, nin)
.L_422:
FFMA R9, R4, R9, R4 ;
IADD3 R5, P1, R5, 0x10, RZ ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4.reuse ;
IMAD.X R3, RZ, RZ, R3, P1 ;
ISETP.GE.U32.AND P1, PT, R5, -0xc, PT ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
ISETP.GE.AND.EX P1, PT, R3, -0x1, PT, P1 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9.reuse, R7 ;
FFMA R9, R4, R9, R4 ;
FFMA R7, R0, R9, R7 ;
@!P1 BRA `(.L_422) ;
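Counting instructions in the unrolled loop body above seems consistent with the 4-flops-per-iteration assumption:
# Back-of-envelope check on the SASS loop body:
iters_per_body = 16                        # IADD3 ... 0x10 advances the loop counter by 16
ffma_per_body  = 32                        # FFMA instructions in the body
flops_per_iter = 2 * ffma_per_body ÷ iters_per_body  # = 4, matching gflops_cpu_bound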
Here is the MWE that one can copy/paste to reproduce these results (with an NVIDIA GPU):
MWE.jl
using CUDA
using DataFrames
using Plots
# using BenchmarkTools  # needed for the @belapsed alternative below
function evaluate_best_time(nmeasurements, a, implem)
    timings = zeros(nmeasurements)
    for i in 1:nmeasurements
        timings[i] = @elapsed implem(a)
    end
    minimum(timings)
    # @belapsed $implem($a)  # use this for more robust timings
end
function check_diff(a_cpu, a_gpu)
    b = Array(a_gpu)  # copy from GPU to CPU
    max_diff = zero(eltype(b))
    max_index = 1
    for i in 1:length(b)
        diff = abs(a_cpu[i] - b[i])
        if max_diff < diff
            max_diff = diff
            max_index = i
        end
    end
    if max_diff > 1.e-6
        println("max diff=", max_diff, " at index i=", max_index,
                " cpu[$max_index]=", a_cpu[max_index], " gpu[$max_index]=", b[max_index])
    end
    @assert max_diff < 1.e-6
end
function evaluate_performance(T, n, nmeasurements, cpu_implem!, gpu_implem!)
    a_cpu, a_gpu = initialize_cpugpu(T, n)
    check_diff(a_cpu, a_gpu)
    t_cpu = evaluate_best_time(nmeasurements, a_cpu, cpu_implem!)
    t_gpu = evaluate_best_time(nmeasurements, a_gpu, gpu_implem!)
    check_diff(a_cpu, a_gpu)
    t_cpu, t_gpu
end
function sync_launch_kernel(a_gpu, nthreads, nblocks, gpu_kernel!)
    CUDA.@sync begin
        @cuda blocks=nblocks threads=nthreads gpu_kernel!(a_gpu)
    end
end
# Compute-bound kernel that will not saturate the GPU memory
# bandwidth for large enough nin
function kernel_compute_bound!(a, nin)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    for i = index:stride:length(a)
        f1 = a[i] / a[i]  # 1
        f2 = f1 * f1      # 1
        s = a[i] - a[i]   # 0
        f = a[i] - a[i]   # 0
        for j ∈ 1:nin
            f = f * f1 + f1  # fma
            s = f * f2 + s   # fma
        end
        a[i] += s
    end
    return nothing  # mandatory return for kernels!!
end
gflops_cpu_bound(t, n::Int, nin) = n * 4 * nin / (t * 1.e9)  # 2 fma (4 flops) per inner iteration
gbs(t, T, n) = 2n * sizeof(T) / (t * 1.e9)                   # one read + one write per element
sum_of_n(n) = n * (n + 1) / 2                                # closed form of 1 + 2 + ... + n
# CPU reference: uses the closed form instead of the fma loop
function cpu_compute_bound!(a, nin)
    for i in 1:length(a)
        a[i] += sum_of_n(nin)
    end
    nothing
end
initialize_cpu(T, n) = [T(i) for i ∈ 1:n]
function initialize_cpugpu(T, n)
    a_cpu = initialize_cpu(T, n)
    a_gpu = CuArray(a_cpu)
    a_cpu, a_gpu
end
function plotgflops_nin(dfg)
    theme(:dark)
    nthreads = Int(dfg[1, :nthreads])
    nblocks = Int(dfg[1, :nblocks])
    array_size = Int(dfg[1, :array_size])
    title = "nthreads=$nthreads nblocks=$nblocks array_size=$array_size"
    label = ""
    yp2 = Int.(round.(log2.(dfg[!, :GFLOPs])))
    ypextrema = (max(0, minimum(yp2)), maximum(yp2))
    yticks = [2^p for p ∈ ypextrema[1]:ypextrema[2]]
    maxy = Int(round(maximum(dfg[!, :GFLOPs])))
    push!(yticks, maxy)
    yticks = sort(yticks)
    p = plot(dfg[!, :NIN], dfg[!, :GFLOPs], label=label, title=title,
             xticks=(dfg[!, :NIN], Int.(dfg[!, :NIN])), yticks=(yticks, string.(yticks)),
             xaxis=:log10, size=(800, 600), box=:on, legend=:outertopright,
             linewidth=3, xlabel="nin", markershape=:hexagon, markersize=4, ylabel="GFLOPs",
             yscale=:log10)
    p
end
# Test for max GFLOPS (peak performance) as a function of nin
function test(T=Float32, nmax=1024*1024*4, nmeasurements=100)
    n = nmax
    CUDA.allowscalar(false)
    df_gpu = DataFrame(nblocks=[], nthreads=[], array_size=Int[], min_time=Float64[],
                       GFLOPs=Float64[], GBs=Float64[], NIN=Int[])
    minpowernin, maxpowernin = (0, 10)
    nblocks, nthreads = (512, 256)
    for nin ∈ (2^p for p ∈ minpowernin:maxpowernin)
        @show nin
        function gpu_implem!(a_gpu)
            CUDA.synchronize()
            @cuda blocks=nblocks threads=nthreads kernel_compute_bound!(a_gpu, nin)
            CUDA.synchronize()
        end
        cpu_implem!(a_cpu) = cpu_compute_bound!(a_cpu, nin)
        t_cpu, t_gpu = evaluate_performance(T, n, nmeasurements, cpu_implem!, gpu_implem!)
        push!(df_gpu, [nblocks, nthreads, n, t_gpu, gflops_cpu_bound(t_gpu, n, nin), gbs(t_gpu, T, n), nin])
    end
    df_gpu
end
df_gpu = test()
@show df_gpu
plotgflops_nin(df_gpu)
I am stuck and cannot find my mistake. Any ideas?