Hi folks,

The other day I was discussing with a colleague how to speed up Julia code, and I was wondering if anybody could help me improve the following code (so that I can learn from it).

This is a minimal part of a larger code, where one evaluates the Laplacian in Cartesian coordinates of a function discretised on an array. Here I set up a grid, the tabulated function, and the tabulated (analytical) Laplacian:

```
xmax = 4.0
xmin = -xmax
Nx = 2001
x = collect(range(xmin, stop=xmax, length=Nx))
dx = x[2] - x[1];
ymax = 3.0
ymin = -ymax
Ny = 2001
y = collect(range(ymin, stop=ymax, length=Ny))
dy = y[2] - y[1];
# Testing the 2D case: tabulate f(x,y) = exp(-0.1*(x^2 + 2y^2)) and its
# analytical Laplacian on the grid. Broadcasting against the transposed
# y vector (x varies down columns, y' across rows) fills the whole
# Nx-by-Ny matrix in one fused pass, with no explicit loops.
fi = @. exp(-0.1 * (x^2 + 2.0 * (y')^2));
lap_fi = @. fi * (-0.6 + 0.04 * x^2 + 0.16 * (y')^2);
lap_fi
```

Then I wrote a function the good old Fortran way — that is, performing all the operations element-wise:

```
"""
    Lapla(y, dr)

Second-order finite-difference Laplacian of the 2-D array `y` on a uniform
grid with spacings `dr = [dx, dy]`.

Interior points use the central 3-point stencil in each direction; edge and
corner points use the second-order one-sided stencil
`(2, -5, 4, -1)/h^2` toward the interior. Returns a newly allocated array
of the same size as `y`. Requires at least 4 points in each dimension
(the one-sided stencil reaches 4 points deep).
"""
function Lapla(y::Array{T}, dr::Array{T}) where {T}
    Nx, Ny = size(y)
    (Nx >= 4 && Ny >= 4) || throw(DimensionMismatch("Lapla needs at least 4 points per dimension, got ($Nx, $Ny)"))
    idx2, idy2 = 1.0 ./ (dr .* dr)
    d2y = zero(y)
    # Interior: central stencil. j outer / i inner so the innermost loop
    # walks down columns -- Julia arrays are column-major.
    @inbounds for j in 2:Ny-1, i in 2:Nx-1
        d2y[i,j] = idx2*(y[i+1,j] - 2.0*y[i,j] + y[i-1,j]) +
                   idy2*(y[i,j+1] - 2.0*y[i,j] + y[i,j-1])
    end
    # Left/right edges (x boundaries): central in y, one-sided in x.
    @inbounds for j in 2:Ny-1
        d2y[1,j] = idy2*(y[1,j+1] - 2.0*y[1,j] + y[1,j-1]) +
                   idx2*(2.0*y[1,j] - 5.0*y[2,j] + 4.0*y[3,j] - y[4,j])
        d2y[end,j] = idy2*(y[end,j+1] - 2.0*y[end,j] + y[end,j-1]) +
                     idx2*(2.0*y[end,j] - 5.0*y[end-1,j] + 4.0*y[end-2,j] - y[end-3,j])
    end
    # Bottom/top edges (y boundaries): central in x, one-sided in y.
    @inbounds for i in 2:Nx-1
        d2y[i,1] = idx2*(y[i+1,1] - 2.0*y[i,1] + y[i-1,1]) +
                   idy2*(2.0*y[i,1] - 5.0*y[i,2] + 4.0*y[i,3] - y[i,4])
        d2y[i,end] = idx2*(y[i+1,end] - 2.0*y[i,end] + y[i-1,end]) +
                     idy2*(2.0*y[i,end] - 5.0*y[i,end-1] + 4.0*y[i,end-2] - y[i,end-3])
    end
    # Corners: one-sided stencil in both directions.
    d2y[1,1] = idy2*(2.0*y[1,1] - 5.0*y[1,2] + 4.0*y[1,3] - y[1,4]) +
               idx2*(2.0*y[1,1] - 5.0*y[2,1] + 4.0*y[3,1] - y[4,1])
    d2y[1,end] = idy2*(2.0*y[1,end] - 5.0*y[1,end-1] + 4.0*y[1,end-2] - y[1,end-3]) +
                 idx2*(2.0*y[1,end] - 5.0*y[2,end] + 4.0*y[3,end] - y[4,end])
    d2y[end,1] = idy2*(2.0*y[end,1] - 5.0*y[end,2] + 4.0*y[end,3] - y[end,4]) +
                 idx2*(2.0*y[end,1] - 5.0*y[end-1,1] + 4.0*y[end-2,1] - y[end-3,1])
    # BUGFIX: the original dropped the 2.0 coefficient on y[end,end] in the
    # x-direction term, making the last corner inconsistent with the other three.
    d2y[end,end] = idy2*(2.0*y[end,end] - 5.0*y[end,end-1] + 4.0*y[end,end-2] - y[end,end-3]) +
                   idx2*(2.0*y[end,end] - 5.0*y[end-1,end] + 4.0*y[end-2,end] - y[end-3,end])
    return d2y
end;
```

with the result

```
@btime Lapla(fi,[dx,dy])
> 90.100 ms (4 allocations: 30.55 MiB)
```

…so quite fast with only 4 allocations. I’m happy with the result… But one of the reasons why I moved to Julia was to avoid having to write code like this, and use functions that operate on vectors/arrays directly.

I then wrote the following code

```
"""
    Laplacian_1D!(d2y, y, dr)

Compute the second derivative of the vector (or 1-D view) `y` on a uniform
grid of spacing `dr`, storing the result in `d2y` and returning `d2y`
(the standard convention for mutating `!` functions; the original returned
the value of the last assignment instead).

Interior points use the central 3-point stencil; the two endpoints use the
second-order one-sided stencil `(2, -5, 4, -1)/dr^2`. `dr` may be a scalar
or a 1-element container (`prod` collapses either to the scalar spacing).
Requires at least 4 points.
"""
function Laplacian_1D!(d2y, y, dr)
    n = size(y, 1)
    n >= 4 || throw(DimensionMismatch("Laplacian_1D! needs at least 4 points, got $n"))
    ih2 = 1.0 / prod(dr)^2
    @inbounds for i in 2:n-1
        d2y[i] = ih2 * (y[i+1] - 2.0*y[i] + y[i-1])
    end
    d2y[1]   = ih2 * (2.0*y[1] - 5.0*y[2] + 4.0*y[3] - y[4])
    d2y[end] = ih2 * (2.0*y[end] - 5.0*y[end-1] + 4.0*y[end-2] - y[end-3])
    return d2y
end;
"""
    Laplacian_2D!(d2y, y, dr; aux=similar(d2y))

Compute the 2-D Laplacian of `y` on a uniform grid with spacings
`dr = (dx, dy)` by applying `Laplacian_1D!` along each dimension, storing
the result in `d2y` and returning `d2y`.

The keyword `aux` is a scratch buffer the same size as `d2y`. By default it
is allocated per call; pass a preallocated array to make repeated calls
allocation-free (this is the source of the extra allocations versus the
hand-written `Lapla`). `aux` need not be zeroed: `Laplacian_1D!` writes
every element of each column view, so the whole buffer is overwritten.
"""
function Laplacian_2D!(d2y, y, dr; aux=similar(d2y))
    dx, dy = dr
    Nx, Ny = size(y)
    # x-direction second derivative, column by column, into aux.
    # Column views are contiguous in memory (column-major layout).
    for j in 1:Ny
        Laplacian_1D!(view(aux, :, j), view(y, :, j), dx)
    end
    # y-direction second derivative, row by row, into d2y (strided views).
    for i in 1:Nx
        Laplacian_1D!(view(d2y, i, :), view(y, i, :), dy)
    end
    d2y .+= aux
    return d2y
end;
```

which does the laplacian in 1 dimension, and uses this function to compute the laplacian in two dimensions. Then

```
d2y = zero(fi[:,1]);
@btime Laplacian_1D!(d2y,view(fi,:,1),dx)
> 1.688 μs (3 allocations: 80 bytes)
```

with 3 allocations (nice!), and

```
d2xy = zero(fi)
@btime Laplacian_2D!(d2xy,view(fi,:,:),[dx,dy])
> 100.073 ms (8010 allocations: 30.91 MiB)
```

which is similar in terms of computing time, though about 10% worse — probably(?) due to the extra allocations.

Now the question is: can the second way (with Laplacian_1D! and Laplacian_2D!) be modified *slightly* to perform as the Lapla function above? Is the speed & allocation penalty unavoidable without having to re-think the whole problem?

I keep wondering if I must code everything element wise as in Lapla all the time… which I’d like to avoid.

Thanks for your help,

Ferran.