Hi,

Let `x`

and `y`

be 2 3D arrays of Float32 with size (200,200,200)

I would like to perform the following operation

`y[i,:,:]=x[i+1,:,:]-x[i,:,:]`

for `i`

in [1:199]

I have implement this in a function that can be called both with Arrays and cuArrays:

```
function Btmult!(y,x)
s=size(x)
for i=1:s[1]-1
yi=view(y,i,:,:)
xi=view(x, i,:, :)
xip1=view(x, i+1,:, :)
yi.=xip1.-xi
end
end
```

I have also written a function acting on the third dimension, having in mind that it should be better suited to GPU computations:

```
function Btmult_plan!(y,x)
s=size(x)
for k=1:s[3]-1
yk=view(y,:,:,k)
xk=view(x, :, :, k)
xkp1=view(x, :, :, k+1)
yk.=xkp1.-xk
end
end
```

The performance comparison surprised me because I expected the GPU to perform much faster.

```
diff=0.0
Btmult!(Array{Float32,3},Array{Float32,3}): t=0.07029846898
Btmult!(CuArray{Float32,3},CuArray{Float32,3}): t=0.00921697145
SpeedUp GPU/CPU=7.62706810597748
diff=0.0
Btmult_plan!(Array{Float32,3},Array{Float32,3}): t=0.0037557243699999996
Btmult_plan!(CuArray{Float32,3},CuArray{Float32,3}): t=0.0014109958899999999
SpeedUp GPU/CPU=2.661754294691815
```

What would be the most efficient way of implementing this operation on GPU ?

Laurent

Here is the MWE:

```
using Pkg
using CuArrays
# using BenchmarkTools
function Btmult!(y,x)
s=size(x)
for i=1:s[1]-1
yi=view(y,i,:,:)
xi=view(x, i,:, :)
xip1=view(x, i+1,:, :)
yi.=xip1.-xi
# y[i,:,:].=x[i+1,:,:].-x[i,:,:]
end
end
function Btmult_plan!(y,x)
s=size(x)
for k=1:s[3]-1
yk=view(y,:,:,k)
xk=view(x, :, :, k)
xkp1=view(x, :, :, k+1)
yk.=xkp1.-xk
# y[:,:,k].=x[:,:,k+1].-x[:,:,k]
end
end
function permute3D!(y,x)
permutedims!(y,x,(2,3,1))
end
function init3D(n::Int64)
xcpu=zeros(Float32,(n,n,n))
copyto!(xcpu,1:length(xcpu))
ycpu=zeros(Float32,(n,n,n))
xgpu = CuArray{Float32}(n, n,n)
ygpu = CuArray{Float32}(n, n,n)
copyto!(xgpu,xcpu)
copyto!(ygpu,ycpu)
xcpu,ycpu,xgpu,ygpu
end
using LinearAlgebra
function norm_diff_cpugpu(xcpu,xgpu)
xcpubis=similar(xcpu)
copyto!(xcpubis,xgpu)
return norm(xcpubis.-xcpu)
end
function test_binaryf!(y,x,bf)
nsample=100
bf(y,x)#warm up
t=@elapsed for i=1:nsample
bf(y,x)
end
t/=nsample
return t
end
function test_binaryf_cpugpu(n::Int64,bf)
(xcpu,ycpu,xgpu,ygpu)=init3D(n)
tcpu=test_binaryf!(ycpu,xcpu,bf)
tgpu=test_binaryf!(ygpu,xgpu,bf)
println("diff=",norm_diff_cpugpu(ycpu,ygpu))
println(string(bf),"(",string(typeof(ycpu)),",",string(typeof(xcpu)),"): t=", tcpu)
println(string(bf),"(",string(typeof(ygpu)),",",string(typeof(xgpu)),"): t=", tgpu)
println("SpeedUp GPU/CPU=",tcpu/tgpu)
return
end
# test_binaryf_cpugpu(100,permute3D!)
test_binaryf_cpugpu(200,Btmult!)
test_binaryf_cpugpu(200,Btmult_plan!)
```