Threading race condition when pushing to arrays

I’m trying to populate a sparse matrix in a multithreaded loop, along the lines of the code below:

using SparseArrays

function build_matrix(fun::Function, ::Type{T}, m, n) where T
    # Allocate thread-local arrays, which we will later concatenate.
    I = [Int[] for i in 1:Threads.nthreads()]
    J = [Int[] for i in 1:Threads.nthreads()]
    V = [T[] for i in 1:Threads.nthreads()]

    Threads.@threads for i = 1:m
        tid = Threads.threadid()
        for j = 1:n
            v = fun(i,j)
            iszero(v) && continue

            push!(I[tid], i)
            push!(J[tid], j)
            push!(V[tid], v)
        end
    end

    sparse(reduce(vcat, I),
           reduce(vcat, J),
           reduce(vcat, V), m, n)
end

This always works in serial calculations, and sometimes when running with many threads, but in the multithreaded case I very often get the error message

ERROR: LoadError: ArgumentError: the first three arguments' lengths must match,
length(I) (=332928) == length(J) (= 332927) == length(V) (= 332925)

where the numbers vary, but are typically close to one another.

What can be the cause of this? I cannot really see why the code should be thread-unsafe. I’m thinking that there may be some data reshuffling in memory as the β€œthread-local” arrays grow, but I thought the vectors of vectors would only hold pointers to the vectors actually storing the data.

I’m sorry that I’m unable to provide a reproducible MWE; I will see if I can cook something up. I’ve seen this behaviour on two different machines (one Intel(R) Xeon(R) CPU E5-2640 v4 @ 2.40GHz and one AMD Ryzen 9 3950X 16-Core Processor).

I have seen this since at least Julia 1.8, maybe already earlier, and it is still present on 1.9.

Does it still break if you use Threads.@threads :static for i = 1:m instead?
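
A minimal sketch of that change, assuming the rest of the function stays exactly as above (only the loop header differs):

# With the :static schedule each chunk of iterations is pinned to a fixed
# thread, so threadid() cannot change in the middle of an iteration.
Threads.@threads :static for i = 1:m
    tid = Threads.threadid()   # stable under :static scheduling
    # ... same inner loop over j as in the original function ...
end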

The likely issue here is task migration: a task started by @threads can be moved to and resumed on a different thread, so the assumption that tid stays constant throughout an iteration is wrong.

One way of solving this is to use a thread-safe data structure such as a Channel.

    Is = Channel{Vector{Int}}(m)
    Js = Channel{Vector{Int}}(m)
    Vs = Channel{Vector{T}}(m)
    Threads.@threads for i = 1:m
        I = Int[]
        J = Int[]
        V = T[]
        for j = 1:n
            v = fun(i,j)
            iszero(v) && continue

            push!(I, i)
            push!(J, j)
            push!(V, v)
        end
        put!(Is, I)
        put!(Js, J)
        put!(Vs, V)
    end
    close(Is); close(Js); close(Vs)
    sparse(reduce(vcat, Is),
           reduce(vcat, Js),
           reduce(vcat, Vs), m, n)

Note the close operation; without it, iterating over the channels’ elements would hang your program.
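
As a small illustration (not from the original post) of why the close is needed: a buffered Channel can be drained by iterating over it once it has been closed, whereas iterating over an open channel blocks while it waits for more items.

    ch = Channel{Int}(3)     # buffered channel with capacity 3
    put!(ch, 1); put!(ch, 2)
    close(ch)                # mark the channel as finished
    collect(ch)              # returns [1, 2]; without close(ch) this would block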


How much slower is this?

using FLoops
using BangBang # for `append!!`
using MicroCollections  # for `EmptyVector` and `SingletonVector`
using SparseArrays

function build_matrix(fun::Function, ::Type{T}, m, n) where T
    # Allocate thread-local arrays, which we will later concatenate.
    I = [Int[] for i in 1:Threads.nthreads()]
    J = [Int[] for i in 1:Threads.nthreads()]
    V = [T[] for i in 1:Threads.nthreads()]

    Threads.@threads for i = 1:m
        tid = Threads.threadid()
        for j = 1:n
            v = fun(i,j)
            iszero(v) && continue

            push!(I[tid], i)
            push!(J[tid], j)
            push!(V[tid], v)
        end
    end

    sparse(reduce(vcat, I),
           reduce(vcat, J),
           reduce(vcat, V), m, n)
end


function build_matrix_floop(fun::Function, ::Type{T}, m, n) where T
    # Allocate thread-local arrays, which we will later concatenate.
    @floop for i = 1:m
        I = Int[]
        J = Int[]
        V = T[]
        for j = 1:n
            v = fun(i,j)
            iszero(v) && continue

            push!(I, i)
            push!(J, j)
            push!(V, v)
        end
        @reduce(Is = append!!(EmptyVector(), I))
        @reduce(Js = append!!(EmptyVector(), J))
        @reduce(Vs = append!!(EmptyVector(), V))
    end
    sparse(Is, Js, Vs, m, n)
end

function build_matrix_channels(fun::Function, ::Type{T}, m, n) where T
    Is = Channel{Vector{Int}}(m)
    Js = Channel{Vector{Int}}(m)
    Vs = Channel{Vector{T}}(m)
    Threads.@threads for i = 1:m
        I = Int[]
        J = Int[]
        V = T[]
        for j = 1:n
            v = fun(i,j)
            iszero(v) && continue

            push!(I, i)
            push!(J, j)
            push!(V, v)
        end
        put!(Is, I)
        put!(Js, J)
        put!(Vs, V)
    end
    close(Is); close(Js); close(Vs)
    sparse(reduce(vcat, Is),
           reduce(vcat, Js),
           reduce(vcat, Vs), m, n)
end

Without threads:

julia> @benchmark build_matrix((i,j)->i*j, Int, 10, 10)
BenchmarkTools.Trial: 10000 samples with 6 evaluations.
 Range (min … max):  5.053 ΞΌs … 357.837 ΞΌs  β”Š GC (min … max): 0.00% … 95.01%
 Time  (median):     5.547 ΞΌs               β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   6.406 ΞΌs Β±  13.773 ΞΌs  β”Š GC (mean Β± Οƒ):  8.91% Β±  4.07%

   β–‚β–„β–‡β–ˆβ–ˆβ–ˆβ–‡β–†β–…β–„β–‚             ▁▁▁▂▁▂▂▂▂▂▁ ▁                      β–‚
  β–†β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–ˆβ–‡β–‡β–‡β–‡β–ˆβ–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–‡β–‡β–‡β–‡β–‡β–…β–…β–…β–†β–…β–„β–ƒβ–ƒβ–ƒβ–…β–„β–…β–ƒ β–ˆ
  5.05 ΞΌs      Histogram: log(frequency) by time      9.69 ΞΌs <

 Memory estimate: 13.36 KiB, allocs estimate: 38.
julia> @benchmark build_matrix_floop((i,j)->i*j, Int, 10, 10)
BenchmarkTools.Trial: 10000 samples with 6 evaluations.
 Range (min … max):  6.175 ΞΌs … 345.855 ΞΌs  β”Š GC (min … max):  0.00% … 96.24%
 Time  (median):     6.743 ΞΌs               β”Š GC (median):     0.00%
 Time  (mean Β± Οƒ):   7.963 ΞΌs Β±  18.642 ΞΌs  β”Š GC (mean Β± Οƒ):  13.00% Β±  5.43%

   β–‚β–„β–†β–ˆβ–ˆβ–ˆβ–ˆβ–‡β–†β–…β–ƒβ–ƒβ–‚β– ▁                                           β–‚
  β–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–†β–†β–‡β–†β–†β–„β–†β–„β–ƒβ–„β–„β–ƒβ–„β–…β–…β–ƒβ–…β–ƒβ–„β–„β–…β–…β–…β–†β–‡β–†β–‡β–‡β–‡β–‡β–†β–†β–…β–…β–†β–‡β–‡β–‡ β–ˆ
  6.18 ΞΌs      Histogram: log(frequency) by time      11.3 ΞΌs <

 Memory estimate: 24.22 KiB, allocs estimate: 120.
julia> @benchmark build_matrix_channels((i,j)->i*j, Int, 10, 10)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  10.680 ΞΌs …  2.335 ms  β”Š GC (min … max):  0.00% … 98.02%
 Time  (median):     12.430 ΞΌs              β”Š GC (median):     0.00%
 Time  (mean Β± Οƒ):   14.909 ΞΌs Β± 65.073 ΞΌs  β”Š GC (mean Β± Οƒ):  12.14% Β±  2.76%

    β–ƒβ–…β–‡β–‡β–ˆβ–ˆβ–ˆβ–ˆβ–‡β–†β–†β–…β–„β–„β–ƒβ–β–β–     ▁ ▁▂▁▂▁▁▂▁▁▂▁▁▁                    β–ƒ
  β–ƒβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–†β–‡β–‡β–…β–†β–…β–„β–†β–…β–ƒβ–…β–…β–„ β–ˆ
  10.7 ΞΌs      Histogram: log(frequency) by time      22.4 ΞΌs <

 Memory estimate: 36.42 KiB, allocs estimate: 203.

I haven’t had time to look at why, and of course this would need an actually useful test.
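
For what it’s worth, a minimal sketch of such a test (the test function f and the dimensions below are made up): since sparse combines the (I, J, V) triplets independently of their order, the threaded variants should match a serial reference even though the rows are processed in an arbitrary order.

using SparseArrays, Test

f(i, j) = (i + j) % 3 == 0 ? 0 : i * j   # hypothetical test function
m, n = 50, 40

# Serial reference built with a plain double loop.
Iref, Jref, Vref = Int[], Int[], Int[]
for i in 1:m, j in 1:n
    v = f(i, j)
    iszero(v) || (push!(Iref, i); push!(Jref, j); push!(Vref, v))
end
Aref = sparse(Iref, Jref, Vref, m, n)

# The threaded variants should agree with the reference.
@test build_matrix_channels(f, Int, m, n) == Aref
@test build_matrix_floop(f, Int, m, n) == Aref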


Not that I have anything useful to add, but welcome back!


It seems the Channel-based approach works, so marking this as solved. Thank you!