Hello.
I’ve been learning about shared memory from here:
https://pde-on-gpu.vaw.ethz.ch/lecture10/.
I’m going through the motions to understand it, but when I benchmark the kernel I wrote based on the lecture, performance decreases rather than increases.
Here is my code:
using CUDA
using BenchmarkTools

nx = ny = 2^12                            # 4096 x 4096 grid
T  = CUDA.rand(Float64, nx, ny);
T2 = CUDA.rand(Float64, nx, ny);
Ci = CUDA.rand(Float64, nx, ny);
lam = _dx = _dy = dt = rand();            # dummy scalar coefficients
threads = (32, 8)
blocks  = (nx÷threads[1], ny÷threads[2])  # grid covers the arrays exactly
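(Note: this launch configuration assumes nx and ny are exact multiples of the block dimensions, so no thread ever indexes out of bounds; a one-line sanity check, my own addition:)

@assert nx % threads[1] == 0 && ny % threads[2] == 0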
function update_temperature!(T2, T, Ci, lam, dt, _dx, _dy)
    # global indices of this thread
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    # update interior points only (the 5-point stencil needs all four neighbours)
    if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))
        @inbounds T2[ix,iy] = T[ix,iy] + dt*Ci[ix,iy]*(
            - ((-lam*(T[ix+1,iy] - T[ix,iy])*_dx) - (-lam*(T[ix,iy] - T[ix-1,iy])*_dx))*_dx
            - ((-lam*(T[ix,iy+1] - T[ix,iy])*_dy) - (-lam*(T[ix,iy] - T[ix,iy-1])*_dy))*_dy
        )
    end
    return nothing
end
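(For context: both kernels compute the same thing, one explicit Euler step of the 2D heat equation in flux form, T2 = T + dt*Ci*(-(dqx/dx + dqy/dy)) with qx = -lam*dT/dx and qy = -lam*dT/dy, discretized with central differences.)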
function update_temperature_sha!(T2, T, Ci, lam, dt, _dx, _dy)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    # local (shared-memory) indices, offset by 1 for the halo
    tx = threadIdx().x+1
    ty = threadIdx().y+1
    T_l = @cuDynamicSharedMem(eltype(T), (blockDim().x+2, blockDim().y+2))
    @inbounds T_l[tx,ty] = T[ix,iy]
    if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))
        # threads on the block edges also load the halo cells
        @inbounds if (threadIdx().x == 1)            T_l[tx-1,ty] = T[ix-1,iy] end
        @inbounds if (threadIdx().x == blockDim().x) T_l[tx+1,ty] = T[ix+1,iy] end
        @inbounds if (threadIdx().y == 1)            T_l[tx,ty-1] = T[ix,iy-1] end
        @inbounds if (threadIdx().y == blockDim().y) T_l[tx,ty+1] = T[ix,iy+1] end
        sync_threads()  # note: this barrier sits inside the divergent branch (see the variant after this function)
        @inbounds T2[ix,iy] = T_l[tx,ty] + dt*Ci[ix,iy]*(
            - ((-lam*(T_l[tx+1,ty] - T_l[tx,ty])*_dx) - (-lam*(T_l[tx,ty] - T_l[tx-1,ty])*_dx))*_dx
            - ((-lam*(T_l[tx,ty+1] - T_l[tx,ty])*_dy) - (-lam*(T_l[tx,ty] - T_l[tx,ty-1])*_dy))*_dy
        )
    end
    return nothing
end
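One thing I wasn’t sure about while writing this is calling sync_threads() inside the divergent interior branch. A variant that loads the halo and reaches the barrier unconditionally would look roughly like this (untested sketch; update_temperature_sha2! is just my placeholder name):

function update_temperature_sha2!(T2, T, Ci, lam, dt, _dx, _dy)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    tx = threadIdx().x+1
    ty = threadIdx().y+1
    T_l = @cuDynamicSharedMem(eltype(T), (blockDim().x+2, blockDim().y+2))
    @inbounds T_l[tx,ty] = T[ix,iy]
    # halo loads guarded only against the global boundary, not the interior test
    @inbounds if (threadIdx().x == 1            && ix > 1)         T_l[tx-1,ty] = T[ix-1,iy] end
    @inbounds if (threadIdx().x == blockDim().x && ix < size(T,1)) T_l[tx+1,ty] = T[ix+1,iy] end
    @inbounds if (threadIdx().y == 1            && iy > 1)         T_l[tx,ty-1] = T[ix,iy-1] end
    @inbounds if (threadIdx().y == blockDim().y && iy < size(T,2)) T_l[tx,ty+1] = T[ix,iy+1] end
    sync_threads()  # every thread of the block reaches the barrier
    if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))
        @inbounds T2[ix,iy] = T_l[tx,ty] + dt*Ci[ix,iy]*(
            - ((-lam*(T_l[tx+1,ty] - T_l[tx,ty])*_dx) - (-lam*(T_l[tx,ty] - T_l[tx-1,ty])*_dx))*_dx
            - ((-lam*(T_l[tx,ty+1] - T_l[tx,ty])*_dy) - (-lam*(T_l[tx,ty] - T_l[tx,ty-1])*_dy))*_dy
        )
    end
    return nothing
end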
# baseline kernel; no shmem needed since it uses no dynamic shared memory
t_it = @belapsed begin @cuda blocks=$blocks threads=$threads update_temperature!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); synchronize() end
T_eff = (9*1+1)*1/1e9*nx*ny*sizeof(Float64)/t_it
#T_eff = 2913.71 GB/s
# shared-memory kernel; shmem sized for the (blockDim().x+2)*(blockDim().y+2) tile
t_it_s = @belapsed begin @cuda blocks=$blocks threads=$threads shmem=prod($threads.+2)*sizeof(Float64) update_temperature_sha!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); synchronize() end
T_eff_s = (9*1+1)*1/1e9*nx*ny*sizeof(Float64)/t_it_s
#T_eff_s = 2894.07 GB/s
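(For what it’s worth, the (9*1+1) factor credits each grid point with 10 Float64 accesses, i.e. 10*nx*ny*8 B ≈ 1.34 GB of effective traffic per launch, divided by the measured kernel time.)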
As you can see, the kernel that doesn’t use shared memory is a bit faster than the kernel that does. Is there something wrong with my code or with my benchmarking method?
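(In case it’s relevant: here is how I would check that both kernels actually agree, a quick sketch with separate output arrays; T2a/T2b are just my throwaway names.)

T2a = CUDA.zeros(Float64, nx, ny); T2b = CUDA.zeros(Float64, nx, ny)
@cuda blocks=blocks threads=threads update_temperature!(T2a, T, Ci, lam, dt, _dx, _dy)
@cuda blocks=blocks threads=threads shmem=prod(threads.+2)*sizeof(Float64) update_temperature_sha!(T2b, T, Ci, lam, dt, _dx, _dy)
synchronize()
@assert Array(T2a) ≈ Array(T2b)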
Kind regards