Hi all,

I’m trying to choose the best way of parallelizing my code given the the current options of @spawn and @threads. A toy model for what I’m trying to do is the following:

```
function solve_model_serial!(A, Nu, Nx, Ny)
for j in 1:Ny, i in 1:Nx
A_ij = view(A, :, i, j)
solve_kernel!(A_ij, Nu)
end
nothing
end
function solve_kernel!(A_ij, Nu)
for a in 1:Nu
A_ij[a] = 42*a + rand()
end
nothing
end
```

I’ve experimented with the following options:

```
function solve_model_threads1!(A, Nu, Nx, Ny)
@threads for j in 1:Ny
for i in 1:Nx
A_ij = view(A, :, i, j)
solve_kernel!(A_ij, Nu)
end
end
nothing
end
function solve_model_spawn1!(A, Nu, Nx, Ny)
t = @spawn for j in 1:Ny
for i in 1:Nx
A_ij = view(A, :, i, j)
solve_kernel!(A_ij, Nu)
end
end
wait(t)
nothing
end
function solve_model_spawn2!(A, Nu, Nx, Ny)
t1 = @spawn for j in 1:Ny
@spawn for i in 1:Nx
A_ij = view(A, :, i, j)
solve_kernel!(A_ij, Nu)
end
end
wait(t1)
nothing
end
function solve_model_spawn3!(A, Nu, Nx, Ny)
@sync for j in 1:Ny, i in 1:Nx
@spawn begin
A_ij = view(A, :, i, j)
solve_kernel!(A_ij, Nu)
end
end
nothing
end
```

These are the results I’m getting with 4 threads:

```
Nu = 32
Nx = 600
Ny = 600
A = zeros(Nu, Nx, Ny)
> @time solve_model_serial!(A, Nu, Nx, Ny)
0.146789 seconds
> @time solve_model_threads1!(A, Nu, Nx, Ny)
0.040972 seconds
> @time solve_model_spawn1!(A, Nu, Nx, Ny)
0.080115 seconds
> @time solve_model_spawn2!(A, Nu, Nx, Ny)
0.001354 seconds
> @time solve_model_spawn3!(A, Nu, Nx, Ny)
0.365831 seconds
```

From these, it seems `solve_model_spawn2!`

is clearly the best option, but I don’t fully understand why… It’s actually crazy how much faster it runs when compared with the serial version (and also with the other options). In particular, why is `solve_model_spawn3!`

so slow?

Is `solve_model_spawn2!`

the best way to handle such cases of nested independent loops? Are there any other (potentially better) options I’m forgetting?