Another slowdown when using `Threads.@threads`?

The MWE:

```julia
using LinearAlgebra
using Random
using BenchmarkTools

function distance(x, y, p::Real = 2)
    norm((x[1] - y[1], x[2] - y[2]), p)
end

function jfa_serial!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
            for j in (-k, 0, k), i in (-k, 0, k)
                checkbounds(Bool, grid, x + i, y + j) || continue
                colorq = grid[x + i, y + j]
                colorq !== 0 || continue
                colorp = grid[x, y]
                if colorp === 0
                    gridp[x, y] = colorq
                elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
                    gridp[x, y] = colorq
                end
            end
        end
        grid .= gridp
    end
    return grid
end

function jfa_parallel!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        Threads.@threads for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
            for j in (-k, 0, k), i in (-k, 0, k)
                checkbounds(Bool, grid, x + i, y + j) || continue
                colorq = grid[x + i, y + j]
                colorq !== 0 || continue
                colorp = grid[x, y]
                if colorp === 0
                    gridp[x, y] = colorq
                elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
                    gridp[x, y] = colorq
                end
            end
        end
        grid .= gridp
    end
    return grid
end

function rand_sites(M, N, K)
    idx = [(m, n) for m in 1:M, n in 1:N]
    shuffle!(idx)
    idx[1:K]
end

@btime jfa_serial!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_parallel!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
```

yields:

```
   7.030 ms (16 allocations: 1.14 MiB)         # jfa_serial!
  55.060 ms (2747567 allocations: 75.80 MiB)   # jfa_parallel!
```
- A workaround for this behavior is moving the body of the `@threads` loop into an auxiliary function (shown in the next section).
- Is it this issue? If so, why?
- Is it time to check out Polyester?
- I would be most grateful for an explanation or an experience report.

Thanks!

Fixing `@threads` with an auxiliary function and adding Polyester to the mix

```julia
using LinearAlgebra
using Random
using BenchmarkTools
using Polyester

function distance(x, y, p::Real = 2)
    norm((x[1] - y[1], x[2] - y[2]), p)
end

function jfa_serial!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        for y in 1:size(grid, 2), x in 1:size(grid, 1)  # x indexes the first dimension
            for j in (-k, 0, k), i in (-k, 0, k)
                checkbounds(Bool, grid, x + i, y + j) || continue
                colorq = grid[x + i, y + j]
                colorq !== 0 || continue
                colorp = grid[x, y]
                if colorp === 0
                    gridp[x, y] = colorq
                elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
                    gridp[x, y] = colorq
                end
            end
        end
        grid .= gridp
    end
    return grid
end

function jfa_parallel_aux!(grid, gridp, sites, k, x, y, p)
    for j in (-k, 0, k), i in (-k, 0, k)
        checkbounds(Bool, grid, x + i, y + j) || continue
        colorq = grid[x + i, y + j]
        colorq !== 0 || continue
        colorp = grid[x, y]
        if colorp === 0
            gridp[x, y] = colorq
        elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
            gridp[x, y] = colorq
        end
    end
end
function jfa_parallel!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        Threads.@threads for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
            jfa_parallel_aux!(grid, gridp, sites, k, x, y, p)
        end
        grid .= gridp
    end
    return grid
end

function jfa_polyester!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        @batch for y in 1:size(grid, 2), x in 1:size(grid, 1)  # x indexes the first dimension
            for j in (-k, 0, k), i in (-k, 0, k)
                checkbounds(Bool, grid, x + i, y + j) || continue
                colorq = grid[x + i, y + j]
                colorq !== 0 || continue
                colorp = grid[x, y]
                if colorp === 0
                    gridp[x, y] = colorq
                elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
                    gridp[x, y] = colorq
                end
            end
        end
        grid .= gridp
    end
    return grid
end

function rand_sites(M, N, K)
    idx = [(m, n) for m in 1:M, n in 1:N]
    shuffle!(idx)
    idx[1:K]
end

@btime jfa_serial!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_parallel!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_polyester!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
```

yields:

```
  6.647 ms (2 allocations: 78.17 KiB)     # jfa_serial!
  2.253 ms (234 allocations: 1.17 MiB)    # jfa_parallel!
  1.434 ms (9 allocations: 78.83 KiB)     # jfa_polyester!
```

And Polyester doesn’t even suffer from the (inference?) problems of `Threads`. How can that be? What are the tradeoffs between Polyester and `Threads`?

Polyester loses composability. Base threads will be relatively intelligent if you nest threaded loops, but Polyester will result in oversubscription.

So no recursion with Polyester; yes, I noticed. Thanks!
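
To make the recursion point concrete, here is a generic divide-and-conquer sketch (mine, not from the thread) of the kind of nested, task-based parallelism Base threading supports and `@batch` does not:

```julia
# Parallel sum by recursive splitting: each level spawns a task for one half
# and recurses into the other. Base's scheduler balances the tasks; Polyester
# has no equivalent mechanism for recursively spawned work.
function psum(x, lo = firstindex(x), hi = lastindex(x))
    hi - lo < 10_000 && return sum(view(x, lo:hi))  # serial base case
    mid = (lo + hi) >>> 1
    t = Threads.@spawn psum(x, lo, mid)
    return psum(x, mid + 1, hi) + fetch(t)
end
```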

If you nest Polyester with itself or with libraries using PolyesterWeave, it will not oversubscribe.
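
A hedged sketch of what that means in practice (my example, assuming the behavior described in this thread):

```julia
using Polyester

# The outer @batch requests worker threads from PolyesterWeave. The inner
# @batch then finds no free workers and falls back to running on the current
# thread, so nesting degrades gracefully instead of oversubscribing.
function nested!(out, A)
    @batch for j in axes(A, 2)
        @batch for i in axes(A, 1)   # effectively serial inside the outer loop
            out[i, j] = sin(A[i, j])
        end
    end
    return out
end
```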

> doesn’t even suffer from the (inference?) problems of Threads. How can that be?

It tries to avoid capturing variables into closures.
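
For reference, the capture problem is easy to reproduce. Below is a minimal sketch (my example, not from the thread): a variable that is reassigned in the enclosing scope gets wrapped in a `Core.Box` when the `Threads.@threads` body is turned into a closure, and every read of it becomes type-unstable. This is the same pattern as `k` in `jfa_parallel!` above.

```julia
function boxed(n)
    s = zeros(Int, n)
    k = n
    while k > 1
        k = k ÷ 2                      # reassignment forces `k` into a Core.Box
        Threads.@threads for i in 1:n  # loop body becomes a closure capturing `k`
            s[i] += k                  # type-unstable read through the Box
        end
    end
    return s
end
# `@code_warntype boxed(8)` reports `k::Core.Box`.
```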

Thanks a lot for this hint, I will try it. I’d like to stay with the original solution, as it illustrates the tradeoffs between `@threads` and `@batch`.

As an aside, I’d try to avoid using `collect`, especially in cases like:

```julia
for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
```

This was my easy way out w.r.t. the syntactic restrictions of the `@threads` macro, but I see your point. Thanks again!
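
For the record, one `collect`-free formulation (a hypothetical variant, `jfa_parallel2!`, reusing the auxiliary function from above) is to thread over a plain range of one dimension and loop the other inside, which satisfies the macro's syntactic restrictions without materializing a matrix of tuples:

```julia
function jfa_parallel2!(grid, sites, p::Real = 2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        Threads.@threads for y in 1:size(grid, 2)  # plain range: no collect needed
            for x in 1:size(grid, 1)
                jfa_parallel_aux!(grid, gridp, sites, k, x, y, p)
            end
        end
        grid .= gridp
    end
    return grid
end
```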

@Elrod: sorry to bother you again: my search-engine fu left me and I only found this. Do you have a better resource on how to evaluate this?

What exactly are you trying to evaluate?

Libraries that use PolyesterWeave.jl (this includes, among others, Polyester.jl and LoopVectorization.jl) use `PolyesterWeave.request_threads` to get the threads they are allowed to run on. This makes sure they do not oversubscribe.

E.g., if you run `Octavian.matmul` from inside `@batch`, it’ll probably be single-threaded. But if you run it from the top level, it’ll be multithreaded.

At one point, Polyester supported reserving threads per thread (e.g., if you have 16 threads total, it’d let you run 4 threads that each have 3 extra ones reserved which the others do not have access to). I didn’t want to keep maintaining that code, so I eventually took it out, but if someone wants something like that, I’d be happy to explain how to add support for it once again.
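
A sketch of that behavior (my example, assuming Octavian.jl is installed; illustrative, not benchmarked):

```julia
using Polyester, Octavian

A, B = rand(256, 256), rand(256, 256)

C = Octavian.matmul(A, B)   # top level: multithreaded via the PolyesterWeave pool

Cs = Vector{Matrix{Float64}}(undef, 4)
@batch for i in 1:4
    # @batch already holds the worker threads, so each call here will
    # probably run single-threaded instead of oversubscribing
    Cs[i] = Octavian.matmul(A, B)
end
```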

None of these libraries will work well when mixed with `Threads.@threads` or `Threads.@spawn`, however.

Thanks, if I understand you correctly this works similarly to `BLAS.set_num_threads`. This will help…

You can call `BLAS.set_num_threads` in an existing Julia session, but there currently isn’t an API for limiting the thread count in Polyester(Weave). Currently, the total number of threads used is based on `Threads.nthreads()`.
However, it’ll normally try to limit itself to one thread per physical core.
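
The contrast, sketched:

```julia
using LinearAlgebra

BLAS.set_num_threads(4)   # BLAS: adjustable at runtime
Threads.nthreads()        # Polyester(Weave)'s pool derives from this value,
                          # which is fixed at startup (`julia -t N` or the
                          # JULIA_NUM_THREADS environment variable); there is
                          # no runtime setter
```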

2 Likes