The MWE

``````using LinearAlgebra
using Random
using BenchmarkTools

function distance(x, y, p::Real = 2)
norm((x[1] - y[1], x[2] - y[2]), p)
end

function jfa_serial!(grid, sites, p::Real=2)
for (color, (x, y)) in enumerate(sites)
grid[x, y] = convert(eltype(grid), color)
end
gridp = copy(grid)
k = max(size(grid)...)
while k > 1
k = k ÷ 2 + k % 2
for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
for j in (-k, 0, k), i in (-k, 0, k)
checkbounds(Bool, grid, x + i, y + j) || continue
colorq = grid[x + i, y + j]
colorq !== 0 || continue
colorp = grid[x, y]
if colorp === 0
gridp[x, y] = colorq
elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
gridp[x, y] = colorq
end
end
end
grid .= gridp
end
return grid
end

function jfa_parallel!(grid, sites, p::Real=2)
for (color, (x, y)) in enumerate(sites)
grid[x, y] = convert(eltype(grid), color)
end
gridp = copy(grid)
k = max(size(grid)...)
while k > 1
k = k ÷ 2 + k % 2
Threads.@threads for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
for j in (-k, 0, k), i in (-k, 0, k)
checkbounds(Bool, grid, x + i, y + j) || continue
colorq = grid[x + i, y + j]
colorq !== 0 || continue
colorp = grid[x, y]
if colorp === 0
gridp[x, y] = colorq
elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
gridp[x, y] = colorq
end
end
end
grid .= gridp
end
return grid
end

function rand_sites(M, N, K)
idx = [(m, n) for m in 1:M, n in 1:N]
shuffle!(idx)
idx[1:K]
end

@btime jfa_serial!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_parallel!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
``````

yields

``````  7.030 ms (16 allocations: 1.14 MiB) #jfa_serial
55.060 ms (2747567 allocations: 75.80 MiB) #jfa_parallel
``````
• A workaround for this behavior is moving the inner part of the `@threads for` loop into an auxiliary function.
• Is it this issue. If yes, why?
• Is it time to check out `Polyester`?
• I would be most grateful for an explanation or experience report.

Thanks!

Fixing `@threads` with an auxiliary and adding `Polyester` to the mix

``````using LinearAlgebra
using Random
using BenchmarkTools
using Polyester

function distance(x, y, p::Real = 2)
norm((x[1] - y[1], x[2] - y[2]), p)
end

function jfa_serial!(grid, sites, p::Real=2)
for (color, (x, y)) in enumerate(sites)
grid[x, y] = convert(eltype(grid), color)
end
gridp = copy(grid)
k = max(size(grid)...)
while k > 1
k = k ÷ 2 + k % 2
for y in 1:size(grid, 1), x in 1:size(grid, 2)
for j in (-k, 0, k), i in (-k, 0, k)
checkbounds(Bool, grid, x + i, y + j) || continue
colorq = grid[x + i, y + j]
colorq !== 0 || continue
colorp = grid[x, y]
if colorp === 0
gridp[x, y] = colorq
elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
gridp[x, y] = colorq
end
end
end
grid .= gridp
end
return grid
end

function jfa_parallel_aux!(grid, gridp, sites, k, x, y, p)
for j in (-k, 0, k), i in (-k, 0, k)
checkbounds(Bool, grid, x + i, y + j) || continue
colorq = grid[x + i, y + j]
colorq !== 0 || continue
colorp = grid[x, y]
if colorp === 0
gridp[x, y] = colorq
elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
gridp[x, y] = colorq
end
end
end
function jfa_parallel!(grid, sites, p::Real=2)
for (color, (x, y)) in enumerate(sites)
grid[x, y] = convert(eltype(grid), color)
end
gridp = copy(grid)
k = max(size(grid)...)
while k > 1
k = k ÷ 2 + k % 2
Threads.@threads for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
jfa_parallel_aux!(grid, gridp, sites, k, x, y, p)
end
grid .= gridp
end
return grid
end

function jfa_polyester!(grid, sites, p::Real=2)
for (color, (x, y)) in enumerate(sites)
grid[x, y] = convert(eltype(grid), color)
end
gridp = copy(grid)
k = max(size(grid)...)
while k > 1
k = k ÷ 2 + k % 2
@batch for y in 1:size(grid, 1), x in 1:size(grid, 2)
for j in (-k, 0, k), i in (-k, 0, k)
checkbounds(Bool, grid, x + i, y + j) || continue
colorq = grid[x + i, y + j]
colorq !== 0 || continue
colorp = grid[x, y]
if colorp === 0
gridp[x, y] = colorq
elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
gridp[x, y] = colorq
end
end
end
grid .= gridp
end
return grid
end

function rand_sites(M, N, K)
idx = [(m, n) for m in 1:M, n in 1:N]
shuffle!(idx)
idx[1:K]
end

@btime jfa_serial!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_parallel!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_polyester!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
``````

yields

``````  6.647 ms (2 allocations: 78.17 KiB)
2.253 ms (234 allocations: 1.17 MiB)
1.434 ms (9 allocations: 78.83 KiB)
``````

and `Polyester` doesn’t even suffer from the (inference?) problems of `Threads`. How can that be? What are the tradeoffs between `Polyester` and `Threads`?

polyester loses computability. base threads will be relatively intelligent if you nest threaded loops, but polyester will result in over-subscription.

2 Likes

So no recursion with `Polyester`, yes I noticed. Thanks!

If you nest polyester with itself/ libraries using PolyesterWeave, it will not oversubscribe.

doesn’t even suffer from the (inference?) problems of `Threads` . How can that be?

It tries to avoid capturing variables into closures.

1 Like

Thanks a lot for this hint, will try. I’d like to stay with the original solution as it is the explanation for tradeoffs between `@threads` and `@batch`.

As an aside, I’d try to avoid using `collect`, especially in cases like:

``````for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
``````

This was my easy way out w.r.t. syntactic restrictions of the `threads` macro, but I see your point. Thanks again!

1 Like

@Elrod : sorry for bothering you again: my search engine fu left me and I only found this. Do you have a better resource how to evaluate?

What exactly are you trying to evaluate?

Libraries that use PolyesterWeave.jl (this includes, among others, Polyester.jl and LoopVectorization.jl) use `PolyesterWeave.request_threads` to get the threads they are allowed to run on. This will make sure they do oversubscribe.

E.g., if you run `Octavian.matmul` from inside `@batch`, it’ll probably be single threaded. But if you run it from toplevel, it’ll be multithreaded.
At one point, `Polyester` supported reserving threads per threads (e.g., if you have 16 threads total, it’d like you run 4 threads, that each have 3 extra ones reserved the others do not have access to). I didn’t want to keep maintaining that code, so I eventually took it out, but if someone wants something like that, I’d be happy to explain how to add support for this once again.

None of these libraries will work well when mixed with `Threads.@threads` or `Threads.@spawn`, however.

1 Like

Thanks, if I understand you correctly this works similar to `BLAS.set_num_threads`. This will help…

You can set `BLAS.set_num_threads` in an existing Julia session, but there currently isn’t an API for limiting that in Polyester(Weave). Currently, the total number of threads used is based on `Threads.nthreads()`.
However, it’ll normally try and limit itself to one thread per physical core.

2 Likes