Hi,

From a 2D Array of floats b(nx,ny), I want to compute a(nx,ny-1) on the GPU as:

```
"""
    julia_diff_y(a, b)

Store the forward difference of `b` along its second dimension into `a`:
`a[i, j] = b[i, j+1] - b[i, j]` for `i = 1:size(a, 1)` and `j = 1:size(a, 2)`.

`a` is overwritten in place; the function returns `nothing`.

# Throws
- `DimensionMismatch` if `a` is non-empty and `b` has fewer than `size(a, 1)` rows or
  fewer than `size(a, 2) + 1` columns.
"""
function julia_diff_y(a, b)
    nx, ny = size(a, 1), size(a, 2)
    # The loop body is unchecked (@inbounds), so validate the shapes up front.
    # An empty `a` needs nothing from `b` and is always accepted.
    if nx > 0 && ny > 0 && (size(b, 1) < nx || size(b, 2) < ny + 1)
        msg = "b must have at least $nx rows and $(ny + 1) columns to fill a of size " *
              "$(size(a)); got size(b) = $(size(b))"
        throw(DimensionMismatch(msg))
    end
    # First index in the inner loop: Julia arrays are column-major.
    for j in 1:ny
        for i in 1:nx
            @inbounds a[i, j] = b[i, j + 1] - b[i, j]
        end
    end
    return nothing
end
```

I obtain very good performance with CUDAnative.jl: the speed-up (about x14.8 in the run below) matches the memory-bandwidth ratio between the CPU and the GPU:

```
((nx, ny), (nblocks, thread_per_block)) = ((65536, 256), (512, 128))
CPU time:0.008354672 s
GPU time 0.000565216 s
SpeedUp: x14.781379063901454
CPU Bandwidth:16.064990702208295 GB/s
GPU Bandwidth:237.46271722739323 GB/s
```

Is it possible to implement this computation with CuArrays?

Laurent

Here is the MWE:

```
using Test,BenchmarkTools
using CuArrays,CUDAnative
import CUDAdrv
"""
    julia_diff_y(a, b)

Store the forward difference of `b` along its second dimension into `a`:
`a[i, j] = b[i, j+1] - b[i, j]` for `i = 1:size(a, 1)` and `j = 1:size(a, 2)`.

`a` is overwritten in place; the function returns `nothing`.

# Throws
- `DimensionMismatch` if `a` is non-empty and `b` has fewer than `size(a, 1)` rows or
  fewer than `size(a, 2) + 1` columns.
"""
function julia_diff_y(a, b)
    nx, ny = size(a, 1), size(a, 2)
    # The loop body is unchecked (@inbounds), so validate the shapes up front.
    # An empty `a` needs nothing from `b` and is always accepted.
    if nx > 0 && ny > 0 && (size(b, 1) < nx || size(b, 2) < ny + 1)
        msg = "b must have at least $nx rows and $(ny + 1) columns to fill a of size " *
              "$(size(a)); got size(b) = $(size(b))"
        throw(DimensionMismatch(msg))
    end
    # First index in the inner loop: Julia arrays are column-major.
    for j in 1:ny
        for i in 1:nx
            @inbounds a[i, j] = b[i, j + 1] - b[i, j]
        end
    end
    return nothing
end
"""
    kernel_diff_y(a, b)

GPU kernel computing `a[i, j] = b[i, j+1] - b[i, j]` for `j = 1:size(a, 2)`.

Each thread handles a single first-dimension index `i`, derived from the 1D launch
configuration (`blockIdx().x`, `blockDim().x`, `threadIdx().x`), and walks along the
second dimension. Threads whose `i` exceeds `size(a, 1)` return immediately, so the
launch may contain more threads than `a` has rows.

`b` must provide at least `size(a, 2) + 1` columns for every row of `a`; this is not
checked because all accesses are wrapped in `@inbounds`.

Always returns `nothing`.
"""
function kernel_diff_y(a, b)
    s = size(a)
    nj = s[2]
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Guard: surplus threads (launch larger than the row count) and empty outputs must
    # not touch memory, since every access below is unchecked.
    if i > s[1] || nj < 1
        return nothing
    end
    # Carry the previously loaded value so this thread reads each b[i, j] only once.
    @inbounds bp = b[i, 1]
    for j in 1:nj
        @inbounds bp1 = b[i, j + 1]
        @inbounds a[i, j] = bp1 - bp
        bp = bp1
    end
    return nothing
end
"""
    tst(N; thread_per_block=128)

Benchmark the y-difference on an `N^2 x N` `Float32` array: once on the GPU with
`kernel_diff_y` and once on the CPU with `julia_diff_y`. Prints both timings, the
speed-up and a bandwidth estimate, then checks that both results agree.

`N^2` must be a multiple of `thread_per_block` (enforced with `@assert`), so that
`nblocks * thread_per_block` threads cover the `N^2` rows exactly.

Returns the result of the final `@test`.
"""
function tst(N; thread_per_block=128)
    nx = N^2
    ny = N
    nblocks, rem = divrem(nx, thread_per_block)
    @assert rem == 0

    # Host data: `b` is filled with 1, 2, 3, ... in memory order; `a` receives the result.
    b = ones(Float32, nx, ny)
    copyto!(b, 1:length(b))
    a = ones(Float32, nx, ny - 1)

    # Device copies of both arrays.
    d_a = CuArray(a)
    d_b = CuArray(b)
    @show (nx, ny), (nblocks, thread_per_block)

    # GPU: one untimed warm-up launch, then the timed launch.
    @cuda blocks=nblocks threads=thread_per_block kernel_diff_y(d_a, d_b)
    CUDAdrv.synchronize()
    tgpu = CUDAdrv.@elapsed begin
        @cuda blocks=nblocks threads=thread_per_block kernel_diff_y(d_a, d_b)
    end

    # CPU: same scheme, warm-up call first.
    julia_diff_y(a, b)
    tcpu = @elapsed julia_diff_y(a, b)

    # Bandwidth estimate in GB/s: two Float32 transfers (one read, one write) per
    # element of an nx x ny array.
    function bandwidth_GBs(rows, cols, seconds)
        return rows * cols * sizeof(Float32) * 2 / (seconds * 1.e9)
    end

    println("CPU time:$tcpu s")
    println("GPU time $tgpu s")
    println("SpeedUp: x$(tcpu/tgpu)")
    println("CPU Bandwidth:$(bandwidth_GBs(nx,ny,tcpu)) GB/s")
    println("GPU Bandwidth:$(bandwidth_GBs(nx,ny,tgpu)) GB/s")

    # Recompute the CPU reference and compare it with the array copied back from the GPU.
    julia_diff_y(a, b)
    return @test a ≈ Array(d_a)
end
# Run the benchmark for N = 256, i.e. nx = N^2 = 65536 rows and ny = N = 256 columns.
tst(256)
```