I don't understand why it is slower with CuStaticSharedArray

Hello,
I am new to Julia and I am trying to use an iterative scheme to solve a force balance equation, concretely the kernel without shared memory is quite simple:

function kernel_comp_v_noshmem!(vn, v, F)
    # One Jacobi-style relaxation step of the force-balance equation on an
    # N×N periodic grid with two field components: reads the previous iterate
    # `v` and the forcing `F`, writes the updated field into `vn`.
    # `N` and `Δ2` come from global scope — presumably `const`; confirm,
    # otherwise the kernel is type-unstable and will not compile efficiently.

    # Map this thread to its global grid cell (i, j).
    # Fix: the original wrote `blockIdx().x - 1f0`, which performs the index
    # arithmetic in Float32 and then converts back to Int32; keep the whole
    # computation in integer arithmetic instead.
    i::Int32 = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    j::Int32 = (blockIdx().y - Int32(1)) * blockDim().y + threadIdx().y

    # Periodic (wrap-around) neighbours at the domain edges.
    i_::Int32 = i == 1 ? N : i - 1; ip::Int32 = i == N ? 1 : i + 1
    jm::Int32 = j == 1 ? N : j - 1; jp::Int32 = j == N ? 1 : j + 1

    # Loop-invariant prefactor of the update, hoisted out of the two
    # assignments below (Float64, matching the FP64 arrays).
    c = 1 / (1 + 6 / Δ2)

    @inbounds begin
        # Anisotropic stencil coupling the two components of v.
        vn[i,j,1] = c*(F[i,j,1] + (   2*(v[i_,j,1]+v[ip,j,1]) + v[i,jm,1]+v[i,jp,1] + 0.25*(v[ip,jp,2]+v[i_,jm,2]-v[i_,jp,2]-v[ip,jm,2])  )/Δ2)
        vn[i,j,2] = c*(F[i,j,2] + (   2*(v[i,jm,2]+v[i,jp,2]) + v[i_,j,2]+v[ip,j,2] + 0.25*(v[ip,jp,1]+v[i_,jm,1]-v[i_,jp,1]-v[ip,jm,1])  )/Δ2)
    end

    return nothing
end

To avoid doing a lot of global memory accesses, I wanted to use shared memory:

"""
    kernel_comp_v!(vn, v, F)

Shared-memory variant of the relaxation kernel: each block stages a
`(T2, T2, 2)` tile of `v` (interior plus a one-cell halo) into shared memory,
synchronizes, and then applies the same stencil as the no-shared-memory
version, writing into `vn`.

Relies on globals assumed to be `const`: `N` (grid size), `Δ2` (grid-spacing
factor), `Tf` (element type, presumably Float64 — confirm), `T` (block tile
width) and `T2` (presumably T+2, tile width including halo — confirm).
"""
function kernel_comp_v!(vn, v, F)
    tx = threadIdx().x
    ty = threadIdx().y
    # Global indices (i, j)
    i = (blockIdx().x - 1) * blockDim().x + tx
    j = (blockIdx().y - 1) * blockDim().y + ty
    # Local indices (li, lj) into the shared tile; the +1 offset leaves
    # row/column 1 and T2 free for the halo.
    li = tx+1; lip = li+1; lim=tx 
    lj = ty+1; ljp = lj+1; ljm=ty

    @inbounds begin
        v_shared = CuStaticSharedArray(Tf, (T2,T2,2))
        # Halo edges: each boundary thread of the block additionally loads the
        # periodic neighbour just outside the tile.
        # Borders v[1,2:T+1,:], v[T+2,2:T+1,:], v[2:T+1,1,:], v[2:T+1,T+2,:]
        if tx==1
            i_ = i==1 ? N : i-1
            v_shared[1,lj,1] = v[i_,j,1]
            v_shared[1,lj,2] = v[i_,j,2]
        elseif tx==T
            ip = i== N ? 1 : i+1
            v_shared[T2,lj,1] = v[ip,j,1]
            v_shared[T2,lj,2] = v[ip,j,2]
        end
        if ty==1
            jm = j==1 ? N : j-1
            v_shared[li,1,1] = v[i,jm,1]
            v_shared[li,1,2] = v[i,jm,2]
        elseif ty==T
            jp = j==N ? 1 : j+1
            v_shared[li,T2,1] = v[i,jp,1]
            v_shared[li,T2,2] = v[i,jp,2]
        end
        # Corners v[1,1,:], v[1,T+2,:], v[T+2,1,:], v[T+2,T+2,:]
        # Each corner is loaded by an *interior* thread (tx/ty ∈ {2, T-1}),
        # which therefore reaches two cells diagonally outward (±2 with
        # periodic wrap) to find the corner's global value.
        # NOTE(review): this elseif chain assumes T ≥ 4 so that the four
        # (tx, ty) pairs are distinct; for T ≤ 3 some corners are never
        # loaded — confirm the actual block size.
        # I use tx/y == 2 and tx/y==T-1 to do less operations on tx/y=1 and tx/y=T
        if tx == 2 && ty == 2 # -> [1,1,:]
            i_ = i<3 ? (i==2 ? N : N-1) : i-2
            jm = j<3 ? (j==2 ? N : N-1) : j-2
            v_shared[1,1,1] = v[i_,jm,1]
            v_shared[1,1,2] = v[i_,jm,2]
        elseif tx == 2 && ty == T-1 # -> [1,T+2,:]
            i_ = i<3 ? (i==2 ? N : N-1) : i-2
            jp = j>N-2 ? (j==N ? 2 : 1) : j+2
            v_shared[1,T2,1] = v[i_,jp,1]
            v_shared[1,T2,2] = v[i_,jp,2]
        elseif tx == T-1 && ty == 2 # -> [T+2,1,:]
            ip = i>N-2 ? (i==N ? 2 : 1) : i+2
            jm = j<3 ? (j==2 ? N : N-1) : j-2
            v_shared[T2,1,1] = v[ip,jm,1]
            v_shared[T2,1,2] = v[ip,jm,2]
        elseif tx == T-1 && ty == T-1 # -> [T+2,T+2,:]
            ip = i>N-2 ? (i==N ? 2 : 1) : i+2
            jp = j>N-2 ? (j==N ? 2 : 1) : j+2
            v_shared[T2,T2,1] = v[ip,jp,1]
            v_shared[T2,T2,2] = v[ip,jp,2]
        end
        # Every thread loads its own interior cell.
        v_shared[li,lj,1] = v[i,j,1]
        v_shared[li,lj,2] = v[i,j,2]

        # synchronize threads: all shared-memory writes above must be visible
        # before any thread reads the tile below.
        sync_threads()

        # Same stencil as kernel_comp_v_noshmem!, but reading the staged tile.
        vn[i,j,1] = 1/(1+6/Δ2)*(F[i,j,1] + (   2*(v_shared[lim,lj,1]+v_shared[lip,lj,1]) + v_shared[li,ljm,1]+v_shared[li,ljp,1] + 0.25*(v_shared[lip,ljp,2]+v_shared[lim,ljm,2]-v_shared[lim,ljp,2]-v_shared[lip,ljm,2])  )/Δ2)
        vn[i,j,2] = 1/(1+6/Δ2)*(F[i,j,2] + (   2*(v_shared[li,ljm,2]+v_shared[li,ljp,2]) + v_shared[lim,lj,2]+v_shared[lip,lj,2] + 0.25*(v_shared[lip,ljp,1]+v_shared[lim,ljm,1]-v_shared[lim,ljp,1]-v_shared[lip,ljm,1])  )/Δ2)
                    
    end

    return nothing
end

To measure performance I use

# Benchmark one launch; CUDA.@sync blocks until the GPU finishes, so the
# timing covers the kernel execution, not just the asynchronous launch.
# Fix: interpolate `block_dim` and `grid_dim` too — BenchmarkTools treats any
# non-interpolated global as untyped, adding dynamic-dispatch overhead to
# every sample and skewing the measurement.
@benchmark CUDA.@sync begin
    comp_v!($v_temp, $v, $F; threads = $block_dim, blocks = $grid_dim)
end

Without shared memory: 630µs on A2000 ada, 112µs on A30
With shared memory: 637µs on A2000 ada, 143µs on A30
All CuArrays use FP64 precision. Setting the type of all the indices to Int32 doesn’t impact the benchmark or the number of registers…

Thank you for your help,
Best

1 Like

Hi,

Could you also post the (const) values of N, Δ2 (1e-4?), Tf (Float64?), T2, T, etc.? In other words, please provide all the necessary code so that others can also run it.

Did you try timing the shared memory loading itself (by returning after the sync_threads())? Given how complicated the code looks, my guess is simply that loading to shared memory currently takes up more time than you gain from using it. Perhaps using 1D indexing (instead of 2D indexing: (i, j)) might be faster?