You are not synchronizing the kernels, so I believe you are measuring only the kernel launch time, not the computation itself.
You need to wrap the CUDA call in `CUDA.@sync` so that the timing waits for the GPU to finish:
using Pkg
Pkg.add("CUDA")
Pkg.add("BenchmarkTools")
using Plots, CUDA, BenchmarkTools
# Benchmark element-wise vector addition on the CPU and on the GPU for
# problem sizes n = 10^1, 10^2, ..., 10^pMax.
pMax = 8
powerVector = 1:pMax

# Timings are stored as Float64 (seconds). Float16 has only ~3 significant
# digits and cannot represent positive values below roughly 6e-8, so
# nanosecond-scale timings would be badly quantised or rounded to zero,
# which in turn breaks a log-scale plot.
timeVectorCPU = Vector{Float64}(undef, pMax)
timeVectorGPU = Vector{Float64}(undef, pMax)

for p in powerVector
    n = 10^p
    # Build the host arrays as Float32: `cu` converts Float64 input to Float32
    # on the device, so Float64 host arrays would make the CPU and the GPU
    # benchmark different element types (and different memory traffic).
    xCPU, yCPU = (ones(Float32, n), ones(Float32, n))
    xGPU, yGPU = (cu(xCPU), cu(yCPU))
    # `$` interpolation keeps global-variable lookup out of the measured time.
    timeVectorCPU[p] = @belapsed $xCPU + $yCPU
    # CUDA.@sync blocks until the GPU has finished; without it only the
    # asynchronous kernel launch would be timed.
    timeVectorGPU[p] = @belapsed CUDA.@sync $xGPU + $yGPU
end
# Show the raw timings, then plot both curves against the problem size
# with logarithmic x and y axes.
display(timeVectorCPU)
display(timeVectorGPU)
plot(
    10 .^ powerVector,
    hcat(timeVectorCPU, timeVectorGPU);  # one column per series
    label=["CPU" "GPU"],
    title="CPU vs GPU",
    xscale=:log10,
    yscale=:log10,
    ylabel="Elapsed time [s]",
    fmt=:png,
)