Hello everyone, I tried to test the performance of shared memory in `CUDA.jl` with this code:

```
using CUDA, BenchmarkTools, Plots
# Problem setup: an N×N grid of ones whose first row is set to 10.0.
const N = Int64(1024)
ϕ = fill(1.0, N, N)
fill!(view(ϕ, 1, :), 10.0)
"""
    evolve(ϕ, ϕn, N)

CUDA kernel in which each thread handles one cell `(row, col)` derived from its
2-D block and thread indices. For cells with `2 <= row <= N-1` and
`2 <= col <= N-1` it stores in `ϕn` the average of the four direct neighbours
read from `ϕ`; all other threads do nothing.
"""
function evolve(ϕ, ϕn, N)
    row = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    col = threadIdx().y + (blockIdx().y - 1) * blockDim().y

    # Only interior cells are updated; the outermost rows/columns are skipped.
    interior = 2 <= row <= N - 1 && 2 <= col <= N - 1
    if interior
        @inbounds ϕn[row, col] = 0.25 * (ϕ[row - 1, col] + ϕ[row + 1, col] +
                                         ϕ[row, col - 1] + ϕ[row, col + 1])
    end
    return nothing
end
"""
    evolve_shared(ϕ, ϕn, N)

CUDA kernel computing the same 4-neighbour average as `evolve`, but each thread
block first stages its tile of `ϕ` plus a one-cell halo in shared memory and the
stencil then reads from that tile.

Must be launched with 2-D thread blocks of at most 16×16 threads, because the
shared tile is a fixed 18×18 array. Cells in rows/columns 1 and `N` of `ϕn` are
never written.
"""
function evolve_shared(ϕ, ϕn, N)
    li = threadIdx().x
    lj = threadIdx().y
    i = (blockIdx().x - 1) * blockDim().x + li
    j = (blockIdx().y - 1) * blockDim().y + lj

    # Cell (i, j) lives at tile[li + 1, lj + 1]; index 1 and index blockDim + 2
    # along each axis hold the halo. The first (contiguous) tile index follows
    # threadIdx().x, so neighbouring threads touch neighbouring shared-memory
    # elements instead of elements 18 apart, which avoids bank conflicts.
    tile = CuStaticSharedArray(Float64, (18, 18))

    # Out-of-range threads skip the loads but must not return yet: every thread
    # of the block has to reach sync_threads() below.
    if i <= N && j <= N
        @inbounds begin
            tile[li + 1, lj + 1] = ϕ[i, j]
            # Threads on the tile edge also fetch the adjacent halo cell,
            # unless that cell would fall outside the grid.
            if li == 1 && i != 1
                tile[1, lj + 1] = ϕ[i - 1, j]
            end
            if li == blockDim().x && i != N
                tile[li + 2, lj + 1] = ϕ[i + 1, j]
            end
            if lj == 1 && j != 1
                tile[li + 1, 1] = ϕ[i, j - 1]
            end
            if lj == blockDim().y && j != N
                tile[li + 1, lj + 2] = ϕ[i, j + 1]
            end
        end
    end
    sync_threads()

    # Only interior cells are updated (this also excludes out-of-range threads).
    if 2 <= i <= N - 1 && 2 <= j <= N - 1
        @inbounds ϕn[i, j] = 0.25 * (tile[li, lj + 1] + tile[li + 2, lj + 1] +
                                     tile[li + 1, lj] + tile[li + 1, lj + 2])
    end
    return nothing
end
"""
    run(ϕ, N, kernelfn=evolve)

Copy `ϕ` to the GPU, launch `kernelfn` 100 times with 16×16 thread blocks
(swapping the input and output buffers after every launch), copy the result
back into `ϕ`, and return `ϕ`.

`kernelfn` is launched as `kernelfn(ϕ_d, ϕn_d, N)`; pass `evolve_shared` to run
the shared-memory variant. The host-to-device and device-to-host copies happen
inside this function, so timing it measures those transfers as well as the
kernel launches.
"""
function run(ϕ, N, kernelfn::F=evolve) where {F}
    nthreads = (16, 16)
    nblock = (cld(N, 16), cld(N, 16))
    ϕ_d = CuArray(ϕ)
    # Start the second buffer as a copy so that cells a kernel leaves unwritten
    # (the outermost rows/columns, for `evolve`) hold the same values in both.
    ϕn_d = copy(ϕ_d)
    for _ in 1:100
        @cuda blocks=nblock threads=nthreads kernelfn(ϕ_d, ϕn_d, N)
        # The buffer just written becomes the input of the next launch.
        ϕ_d, ϕn_d = ϕn_d, ϕ_d
    end
    return copyto!(ϕ, ϕ_d)
end
# Interpolate the globals with `$` so the benchmark does not measure access to
# the non-constant global `ϕ`.
@benchmark run($ϕ, $N)
# heatmap(ϕ)
```

With `evolve_shared`, performance should theoretically improve because accesses to global memory are reduced, but my test results show that the shared-memory version is slower. What’s wrong with my code? Thanks in advance.