Suppose I want to calculate the following sum: $C_{nm} = \sum_i A_{in} B_{im}$ (in reality, something more complicated; note that the summation index $i$ is the *first* index of both $A$ and $B$, matching the code below). The corresponding code could be

```
"""
    cpu_add!(C, A, B)

Overwrite `C` with `C[n, m] = Σ_i A[i, n] * B[i, m]` (i.e. `C = Aᵀ B`).

`C` must have size `(size(A, 2), size(B, 2))` and `A`, `B` the same first-axis
length. Generalizes the original hard-coded `1:2` ranges to any output shape.
Returns `nothing`.
"""
function cpu_add!(C, A, B)
    fill!(C, zero(eltype(C)))
    # Innermost loop over i: A[i, n] and B[i, m] are then contiguous
    # (Julia arrays are column-major).
    for m in axes(C, 2), n in axes(C, 1)
        s = zero(eltype(C))  # typed accumulator keeps the loop type-stable
        for i in axes(A, 1)
            s += A[i, n] * B[i, m]
        end
        C[n, m] = s
    end
    return nothing
end
# Small driver: with all-ones inputs, every entry of C equals N.
N = 100
A = fill(1.0, N, 2)
B = fill(1.0, N, 2)
C = zeros(2, 2)
cpu_add!(C, A, B)
```

Now I would like to accelerate this calculation using CUDA. I know how to write the corresponding kernel for a single pair of indices `n` and `m`:

```
using CUDA
"""
    gpu_add_single(A, B, n, m)

Compute `Σ_i A[i, n] * B[i, m]` on the GPU for one pair of column indices.

Launches `kernel_single` with one partial sum per block, then reduces the
per-block partial sums on the device with `sum`. Returns a scalar of
`eltype(A)`.
"""
function gpu_add_single(A, B, n, m)
    T = eltype(A)
    N = size(A, 1)
    # The in-block tree reduction requires a power-of-two thread count.
    nth = min(256, prevpow(2, N))
    nbl = cld(N, nth)
    shmem = sizeof(T) * nth       # one shared-memory slot per thread
    psum = CUDA.zeros(T, nbl)     # one partial sum per block
    # Bug fix: bare `@sync` is Base's *task* synchronizer and does not wait
    # for a GPU kernel; `CUDA.@sync` is the correct macro here.
    CUDA.@sync @cuda threads=nth blocks=nbl shmem=shmem kernel_single(psum, A, B, n, m)
    return sum(psum)              # cross-block reduction
end
# GPU kernel: each thread accumulates a strided partial sum of
# A[i, n] * B[i, m]; the per-block total is then written to
# psum[blockIdx().x] by block_reduction!.
function kernel_single(psum, A, B, n, m)
    tid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    step = blockDim().x * gridDim().x
    acc = zero(eltype(psum))
    i = tid
    while i <= size(A, 1)
        acc += A[i, n] * B[i, m]
        i += step
    end
    block_reduction!(psum, acc)
    return nothing
end
# Reduce one value per thread (`temp`) to a single per-block sum stored in
# psum[blockIdx().x], using dynamic shared memory of blockDim().x elements
# (sized by the `shmem=` argument of the @cuda launch).
function block_reduction!(psum, temp)
shmem = @cuDynamicSharedMem(eltype(psum), blockDim().x)
ishmem = threadIdx().x
# Stage every thread's value in shared memory before the tree reduction.
shmem[ishmem] = temp
sync_threads()
# for reductions, nthreads must be a power of 2:
i = div(blockDim().x, 2)
while i >= 1
# Active half of the threads folds in the upper half's values.
if ishmem <= i
shmem[ishmem] += shmem[ishmem + i]
end
i = div(i, 2)
sync_threads() # all writes of this round must be visible before the next
end
# Thread 1 holds the block total; publish it to global memory.
if ishmem == 1
psum[blockIdx().x] = shmem[1]
end
return nothing
end
# Example: with all-ones inputs, the result is Σ_{i=1}^{N} 1·1 = N = 100.
N = 100
A = CUDA.ones(N, 2)
B = CUDA.ones(N, 2)
C12 = gpu_add_single(A, B, 1,2)
```

However, I do not understand how to write a kernel that does the summation for all `n` and `m` at once, as in the CPU code above. Instead of the 1D array of partial sums `psum`, I should presumably use a higher-dimensional array, but I do not understand how to work with it properly — e.g., how to handle the corresponding shared memory and how to perform the block reduction. Can you please help me with it?