Another slowdown when using `Threads.@threads`?

Fixing @threads with an auxiliary and adding Polyester to the mix

using LinearAlgebra
using Random
using BenchmarkTools
using Polyester

function distance(x, y, p::Real = 2)
    norm((x[1] - y[1], x[2] - y[2]), p)
end

function jfa_serial!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        for y in 1:size(grid, 1), x in 1:size(grid, 2)
            for j in (-k, 0, k), i in (-k, 0, k)
                checkbounds(Bool, grid, x + i, y + j) || continue
                colorq = grid[x + i, y + j]
                colorq !== 0 || continue
                colorp = grid[x, y]
                if colorp === 0
                    gridp[x, y] = colorq
                elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
                    gridp[x, y] = colorq
                end
            end
        end
        grid .= gridp
    end
    return grid
end

function jfa_parallel_aux!(grid, gridp, sites, k, x, y, p)
    for j in (-k, 0, k), i in (-k, 0, k)
        checkbounds(Bool, grid, x + i, y + j) || continue
        colorq = grid[x + i, y + j]
        colorq !== 0 || continue
        colorp = grid[x, y]
        if colorp === 0
            gridp[x, y] = colorq
        elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
            gridp[x, y] = colorq
        end
    end
end
function jfa_parallel!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        Threads.@threads for (x, y) in collect(Iterators.product(1:size(grid, 1), 1:size(grid, 2)))
            jfa_parallel_aux!(grid, gridp, sites, k, x, y, p)
        end
        grid .= gridp
    end
    return grid
end

function jfa_polyester!(grid, sites, p::Real=2)
    for (color, (x, y)) in enumerate(sites)
        grid[x, y] = convert(eltype(grid), color)
    end
    gridp = copy(grid)
    k = max(size(grid)...)
    while k > 1
        k = k ÷ 2 + k % 2
        @batch for y in 1:size(grid, 1), x in 1:size(grid, 2)
            for j in (-k, 0, k), i in (-k, 0, k)
                checkbounds(Bool, grid, x + i, y + j) || continue
                colorq = grid[x + i, y + j]
                colorq !== 0 || continue
                colorp = grid[x, y]
                if colorp === 0
                    gridp[x, y] = colorq
                elseif distance(sites[colorp], (x, y), p) > distance(sites[colorq], (x, y), p)
                    gridp[x, y] = colorq
                end
            end
        end
        grid .= gridp
    end
    return grid
end

function rand_sites(M, N, K)
    idx = [(m, n) for m in 1:M, n in 1:N]
    shuffle!(idx)
    idx[1:K]
end

@btime jfa_serial!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_parallel!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))
@btime jfa_polyester!(grid, sites) setup = (grid = zeros(Int, 100, 100); sites = collect(rand_sites(size(grid)..., 100)))

yields

  6.647 ms (2 allocations: 78.17 KiB)
  2.253 ms (234 allocations: 1.17 MiB)
  1.434 ms (9 allocations: 78.83 KiB)

and Polyester doesn’t even suffer from the (inference?) problems of Threads. How can that be? What are the tradeoffs between Polyester and Threads?