Hello,
I am new to Julia and I am trying to use an iterative scheme to solve a force balance equation, concretely the kernel without shared memory is quite simple:
function kernel_comp_v_noshmem!(vn, v, F)
# Indexing (i,j)
i::Int32 = (blockIdx().x - 1f0) * blockDim().x + threadIdx().x
j::Int32 = (blockIdx().y - 1f0) * blockDim().y + threadIdx().y
i_::Int32 = i== 1 ? N : i-1; ip::Int32 = i== N ? 1 : i+1
jm::Int32 = j== 1 ? N : j-1; jp::Int32 = j== N ? 1 : j+1
@inbounds begin
vn[i,j,1] = 1/(1+6/Δ2)*(F[i,j,1] + ( 2*(v[i_,j,1]+v[ip,j,1]) + v[i,jm,1]+v[i,jp,1] + 0.25*(v[ip,jp,2]+v[i_,jm,2]-v[i_,jp,2]-v[ip,jm,2]) )/Δ2)
vn[i,j,2] = 1/(1+6/Δ2)*(F[i,j,2] + ( 2*(v[i,jm,2]+v[i,jp,2]) + v[i_,j,2]+v[ip,j,2] + 0.25*(v[ip,jp,1]+v[i_,jm,1]-v[i_,jp,1]-v[ip,jm,1]) )/Δ2)
end
return nothing
end
To avoid doing lot of global memory access, I wanted to use shared memory:
function kernel_comp_v!(vn, v, F)
tx = threadIdx().x
ty = threadIdx().y
# Global indices (i, j)
i = (blockIdx().x - 1) * blockDim().x + tx
j = (blockIdx().y - 1) * blockDim().y + ty
# Local indices (li, lj)
li = tx+1; lip = li+1; lim=tx
lj = ty+1; ljp = lj+1; ljm=ty
@inbounds begin
v_shared = CuStaticSharedArray(Tf, (T2,T2,2))
# Borders v[1,2:T+1,:], v[T+2,2:T+1,:], v[2:T+1,1,:], v[2:T+1,T+2,:]
if tx==1
i_ = i==1 ? N : i-1
v_shared[1,lj,1] = v[i_,j,1]
v_shared[1,lj,2] = v[i_,j,2]
elseif tx==T
ip = i== N ? 1 : i+1
v_shared[T2,lj,1] = v[ip,j,1]
v_shared[T2,lj,2] = v[ip,j,2]
end
if ty==1
jm = j==1 ? N : j-1
v_shared[li,1,1] = v[i,jm,1]
v_shared[li,1,2] = v[i,jm,2]
elseif ty==T
jp = j==N ? 1 : j+1
v_shared[li,T2,1] = v[i,jp,1]
v_shared[li,T2,2] = v[i,jp,2]
end
# Corners v[1,1,:], v[1,T+2,:], v[T+2,1,:], v[T+2,T+2,:]
# I use tx/y == 2 and tx/y==T-1 to do less operations on tx/y=1 and tx/y=T
if tx == 2 && ty == 2 # -> [1,1,:]
i_ = i<3 ? (i==2 ? N : N-1) : i-2
jm = j<3 ? (j==2 ? N : N-1) : j-2
v_shared[1,1,1] = v[i_,jm,1]
v_shared[1,1,2] = v[i_,jm,2]
elseif tx == 2 && ty == T-1 # -> [1,18,:]
i_ = i<3 ? (i==2 ? N : N-1) : i-2
jp = j>N-2 ? (j==N ? 2 : 1) : j+2
v_shared[1,T2,1] = v[i_,jp,1]
v_shared[1,T2,2] = v[i_,jp,2]
elseif tx == T-1 && ty == 2 # -> [T+2,1,:]
ip = i>N-2 ? (i==N ? 2 : 1) : i+2
jm = j<3 ? (j==2 ? N : N-1) : j-2
v_shared[T2,1,1] = v[ip,jm,1]
v_shared[T2,1,2] = v[ip,jm,2]
elseif tx == T-1 && ty == T-1 # -> [T+2,T+2,:]
ip = i>N-2 ? (i==N ? 2 : 1) : i+2
jp = j>N-2 ? (j==N ? 2 : 1) : j+2
v_shared[T2,T2,1] = v[ip,jp,1]
v_shared[T2,T2,2] = v[ip,jp,2]
end
v_shared[li,lj,1] = v[i,j,1]
v_shared[li,lj,2] = v[i,j,2]
# synchronize threads
sync_threads()
vn[i,j,1] = 1/(1+6/Δ2)*(F[i,j,1] + ( 2*(v_shared[lim,lj,1]+v_shared[lip,lj,1]) + v_shared[li,ljm,1]+v_shared[li,ljp,1] + 0.25*(v_shared[lip,ljp,2]+v_shared[lim,ljm,2]-v_shared[lim,ljp,2]-v_shared[lip,ljm,2]) )/Δ2)
vn[i,j,2] = 1/(1+6/Δ2)*(F[i,j,2] + ( 2*(v_shared[li,ljm,2]+v_shared[li,ljp,2]) + v_shared[lim,lj,2]+v_shared[lip,lj,2] + 0.25*(v_shared[lip,ljp,1]+v_shared[lim,ljm,1]-v_shared[lim,ljp,1]-v_shared[lip,ljm,1]) )/Δ2)
end
return nothing
end
To measure performance I use
@benchmark CUDA.@sync begin
comp_v!($v_temp, $v, $F; threads = block_dim, blocks = grid_dim)
end
Without shared memory: 630µs on A2000 ada, 112µs on A30
With shared memory: 637µs on A2000 ada, 143µs on A30
Every CuArrays are using FP64 precision. Setting the type of all the indices to Int32 doesn’t impact the benchmark or number of registers…
Thank you for your help,
Best