I am new to CUDAnative.jl and I am trying to benchmark a very simple computational fluid dynamics model (based on the 1D linear shallow water equations). Essentially, I get 3.228 s when I run the code on a CPU (i7-7700, using only a single core) and 1.324 s on a GPU (GeForce GTX 1080). I am wondering whether I should expect a higher speed-up. When using `nvprof`, 93% of the API-call time is spent in `cuModuleLoadDataEx` and only 5.6% in `cuLaunchKernel`. Could this be a data transfer issue?
The file `shallow_water_gpu.jl`:
using CUDAnative
using CUDAdrv
using Test
using BenchmarkTools
# --- model parameters (single precision where they are passed to the solvers) ---
g = 9.81f0                  # m/s^2
hbar = 30f0                 # m
L = 1.0e5                   # m, right end of the grid built below
imax = 1000                 # number of points in x
nmax = Int32(2_000_000)     # number of time steps handed to the solvers
# for a quick run use: nmax = Int32(200)
a = 2                       # m
b = 5.0e3                   # m, scale of the Gaussian used for the initial η
dt = 1f0                    # time step handed to the solvers

# --- grid ---
x = range(0; stop=L, length=imax)
dx = Float32(x[2] - x[1])   # grid spacing, converted to Float32
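# Staggered grid: u[i] and u[i+1] sit on the two edges (|) of the cell whose
# centre (o) carries η[i].
#
#  u[i]       u[i+1]
#   |-----o-----|-----o-----|-----o-----|
#         η           η
#         i          i+1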
"""
    shallow_water_cpu!(η, u, hbar, g, dx, dt, nmax)

Advance `η` and `u` in place through `nmax` time steps on the CPU.

Within a time step the cells are visited in order `1:length(η)`. For each cell,
`η[i]` is first updated from `u[i+1] - u[i]`, then `u[i]` is updated from the
difference between the new `η[i]` and its left neighbour. For `i == 1` the left
neighbour index is clamped to `1`, so that difference is `η[1] - η[1]`.

`u` is read at index `length(η) + 1`, so it must hold at least one element more
than `η`; that last element is never written. Returns `nothing`.
"""
function shallow_water_cpu!(η, u, hbar, g, dx, dt, nmax)
    ncells = length(η)

    # Constant factors of the two update formulas.
    coef_η = -hbar * dt / dx
    coef_u = -g * dt / dx

    for _ in 1:nmax
        for i in 1:ncells
            η[i] += coef_η * (u[i + 1] - u[i])

            # Left neighbour, clamped at the first cell.
            left = i > 1 ? i - 1 : 1
            u[i] += coef_u * (η[i] - η[left])
        end
    end

    return nothing
end
"""
    shallow_water_gpu(η, u, hbar, g, dx, dt, nmax)

GPU kernel that advances `η` and `u` in place through `nmax` time steps, with
one thread per element of `η`. Launch it with `@cuda`.

Each time step has two phases separated by `sync_threads()`: every thread
updates its own `η[i]` from `u[i+1] - u[i]`, then its own `u[i]` from
`η[i] - η[i-1]` (the left index is clamped to 1 for the first thread).

Threads whose global index exceeds `length(η)` do no memory access, so the
kernel may be launched with more threads than elements. `u` must hold at least
`length(η) + 1` elements.

NOTE: `sync_threads()` is a barrier for the threads of one block only. Because
each thread reads values written by its neighbours, the time loop is only
correctly ordered when all elements are handled by a single block, i.e. when
`length(η)` does not exceed the device's maximum block size. A multi-block
launch has no ordering between blocks and would need one kernel launch per time
step (or a grid-wide barrier) instead.
"""
function shallow_water_gpu(η, u, hbar, g, dx, dt, nmax)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x

    # Surplus threads must not touch memory, but they still have to reach every
    # barrier below, so the work is guarded instead of returning early.
    active = i <= length(η)

    a = -hbar * dt / dx
    b = -g * dt / dx

    for n = 1:nmax
        if active
            η[i] = η[i] + a * (u[i + 1] - u[i])
        end
        # All η updates of this step must be visible before u reads them.
        sync_threads()

        if active
            im1 = max(i - 1, 1)
            u[i] = u[i] + b * (η[i] - η[im1])
        end
        # All u updates must be visible before the next step's η update.
        sync_threads()
    end

    return nothing
end
# Initial state: η is a Gaussian of scale b evaluated on the grid x, and u is
# zero everywhere. Both are stored as single-precision column matrices.
η0 = zeros(Float32, imax, 1)
@. η0 = exp(-(x / b)^2)

u0 = zeros(Float32, imax + 1, 1)
u0[[1, end]] .= 0   # end values set explicitly (they are already zero)

# Working copies, so η0 and u0 keep the initial state.
η = copy(η0)
u = copy(u0)

# Device-side copies of the working arrays.
d_η = CuArray(η)
d_u = CuArray(u)
GPU Benchmark
# Time one kernel launch followed by copying both fields back to the host.
# The kernel writes into d_η and d_u, and the same two arrays are interpolated
# into every sample, so each sample continues from the state left by the last.
bgpu = @benchmark begin
# Launch with length(η) threads; no `blocks` keyword is given, so the grid size
# is whatever @cuda uses by default (presumably a single block; confirm).
@cuda threads=length(η) shallow_water_gpu($d_η, $d_u, $hbar, $g, $dx, $dt, $nmax)
# download data from device
ηg = Array(d_η)
ug = Array(d_u)
end
GPU Result
BenchmarkTools.Trial:
memory estimate: 9.19 KiB
allocs estimate: 39
--------------
minimum time: 1.317 s (0.00% GC)
median time: 1.324 s (0.00% GC)
mean time: 1.323 s (0.00% GC)
maximum time: 1.329 s (0.00% GC)
--------------
samples: 4
evals/sample: 1
CPU Benchmark
# Time the CPU reference solver. η and u are modified in place and the same
# arrays are interpolated into every sample, so samples continue from the
# state left by the previous one.
bcpu = @benchmark shallow_water_cpu!($η, $u, $hbar, $g, $dx, $dt, $nmax)
CPU Result
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 3.220 s (0.00% GC)
median time: 3.228 s (0.00% GC)
mean time: 3.228 s (0.00% GC)
maximum time: 3.235 s (0.00% GC)
--------------
samples: 2
evals/sample: 1
Profiler
sudo nvprof --profile-from-start off /home/abarth/opt/julia-1.0.2/bin/julia
julia> include("shallow_water_gpu.jl");
==30576== NVPROF is profiling process 30576, command: /home/abarth/opt/julia-1.0.2/bin/julia
julia> CUDAdrv.@profile @cuda threads=length(η) shallow_water_gpu(d_η, d_u, hbar, g, dx, dt, nmax)
==30576== Profiling application: /home/abarth/opt/julia-1.0.2/bin/julia
==30576== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 1.36197s 1 1.36197s 1.36197s 1.36197s ptxcall_shallow_water_gpu_1
API calls: 93.48% 999.40us 1 999.40us 999.40us 999.40us cuModuleLoadDataEx
5.67% 60.668us 1 60.668us 60.668us 60.668us cuLaunchKernel
0.27% 2.9000us 3 966ns 330ns 2.0270us cuCtxGetCurrent
0.25% 2.7230us 2 1.3610us 333ns 2.3900us cuDeviceGetAttribute
0.19% 2.0260us 1 2.0260us 2.0260us 2.0260us cuDeviceGetCount
0.07% 800ns 1 800ns 800ns 800ns cuModuleGetFunction
0.06% 616ns 1 616ns 616ns 616ns cuCtxGetDevice