The creation of FFTW plans is not thread-safe (see https://discourse.julialang.org/t/fftw-plans-for-multiple-threads/21861), or is it? Executing a pre-allocated plan, however, should be thread-safe. I want to pre-compute a matrix factorization (which builds FFTW plans internally) and then use it from multiple threads for matrix-vector multiplications (which use the FFT internally). I guess the issue is that the multiplication creates another plan internally.
In the example below, pre-computing the factorization speeds up the multiplication by about 2x in a single thread. However, when the pre-computed factorization is used naively in a multi-threaded context, the results are wrong.
The code below performs the same computation with all four combinations of “threaded / not threaded” and “with / without pre-computed factorization”. How can I parallelize the non-threaded code path while still pre-computing the factorization?
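For context, the FFTW pattern that I assumed would be safe, plan once and then execute the pre-built plan from several tasks on separate arrays, looks roughly like this (a minimal sketch, not part of the reproducer):

using FFTW

n = 4096
p = plan_rfft(zeros(Float64, n))   # plan creation: the part that may not be thread-safe
ys = Vector{Vector{ComplexF64}}(undef, Threads.nthreads())
Threads.@threads for i in 1:Threads.nthreads()
    x = rand(n)        # each task works on its own input buffer
    ys[i] = p * x      # execute the pre-built plan; no new plan should be created here
end

The full reproducer follows.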
using Random, SHA, Test
using ToeplitzMatrices, LinearAlgebra, FFTW
using Folds
"""
Compute the matrix-vector product T*x, round to integers, take the result mod 2, and return it as a vector of 0/1 Int8 values.
"""
function toep_mul_mod2(
    T::AbstractMatrix,
    x::Vec,
) where {Vec<:Union{BitVector,SubArray{Bool,N,BitMatrix} where N}}
    # The multiplication is slow for BitVector input, so we convert to a Vector{Bool} first.
    # (Using Vector{Bool} throughout would use too much memory in my application.)
    round.(Int64, T * Vector(x)) .% 2 .% Int8
end
"""
Compute the matrix-vector product T*x from a pre-computed factorization of T, round to integers,
take the result mod 2, and return it as a vector of 0/1 Int8 values.
Note: in a multi-threaded context this gives wrong results!
"""
function toep_mul_mod2(Tfac::ToeplitzMatrices.ToeplitzFactorization, x::Any)
    x = convert(Vector{Float64}, collect(x))  # the multiplication is slow for BitVector input, so convert to a dense Float64 vector
    # pre-allocate result
    result = zeros(Float64, length(Tfac.tmp) - length(x) + 1)   # This uses a dirty hack to get the output size from the Factorization object
    mul!(result, Tfac, x, 1.0, 1.0)  # This use seems intended but is not well-documented
    # Just doing `Tfac*x` is not implemented
    # See https://github.com/JuliaLinearAlgebra/ToeplitzMatrices.jl/issues/101
    round.(Int64, result) .% 2 .% Int8
end
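# --- Hedged usage sketch (not part of the original benchmark) ---------------
# In a single thread, the plain and the pre-factorized method should agree.
# The 3×4 matrix and the input vector below are made up purely for illustration.
let
    Tsmall = Toeplitz{Float64}([1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 1.0])  # 3×4 Toeplitz
    Tsmall_fac = LinearAlgebra.factorize(Tsmall)
    xsmall = BitVector([true, false, true, true])
    # Both calls return a Vector{Int8}; this is expected to print `true`.
    @show toep_mul_mod2(Tsmall, xsmall) == toep_mul_mod2(Tsmall_fac, xsmall)
end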
"""
Split `xbits` into sub-vectors of length `block_size`; for each block compute the matrix-vector
product T*x, round, take mod 2, then concatenate the per-block results into one BitVector.
"""
function toep_mul_mod2_subblocks(xbits, T; prefactorize, use_Folds, block_size = 4096)
    Tfac = LinearAlgebra.factorize(T)
    if use_Folds
        return Folds.mapreduce(
            vcat,
            eachcol(reshape(xbits, block_size, :)),
            init = BitVector(),
        ) do block
            toep_mul_mod2(prefactorize ? Tfac : T, block) .% Bool
        end
    else
        return mapreduce(
            vcat,
            eachcol(reshape(xbits, block_size, :)),
            init = BitVector(),
        ) do block
            toep_mul_mod2(prefactorize ? Tfac : T, block) .% Bool
        end
    end
end
check_hash(x::BitVector) =
    (@show sha256(reinterpret(UInt8, x.chunks)) .|> Char |> String) ==
    "\e\u91i4\u85«Î*Ú½Òhaã\x1eðEvä¡»\t\b§àË.ãr\x1fÅY"
function test_main()
    # generates a random Toeplitz matrix and long BitVector
    rng = MersenneTwister(42)
    T = Toeplitz{Float64}(
        [1.0; rand(rng, [0.0, 1.0], 1759)],
        [1.0; rand(rng, [0.0, 1.0], 4095)],
    )
    x = rand(rng, [0, 1], 4096 * 1000) .% Bool
    for prefactorize in [true, false]
        for use_Folds in [true, false]
            # Exception occurs only when using threading (Folds.jl) AND pre-factorizing the matrix
            # The `InexactError` happens because the result of the matrix-vector multiplication is wrong.
            if prefactorize && use_Folds
                @show @test_throws InexactError toep_mul_mod2_subblocks(
                    x,
                    T;
                    use_Folds = true,
                    prefactorize = true,
                ) |> check_hash
            else
                @test toep_mul_mod2_subblocks(x, T; use_Folds, prefactorize) |> check_hash
            end
        end
    end
end
test_main()
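One workaround I can think of (an untested sketch, assuming the wrong results come from shared mutable state inside the `ToeplitzFactorization`, e.g. its `tmp` buffer and/or the FFT plans it holds): give every task its own factorization, so nothing is shared between threads. The function name and the contiguous chunking below are my own invention; the output order is preserved because each task handles a contiguous range of blocks and the per-task results are concatenated in order.

function toep_mul_mod2_subblocks_tasklocal(xbits, T; block_size = 4096)
    blocks = collect(eachcol(reshape(xbits, block_size, :)))
    # contiguous chunks of blocks, one chunk per task
    ranges = collect(Iterators.partition(eachindex(blocks),
                                         cld(length(blocks), Threads.nthreads())))
    results = Vector{BitVector}(undef, length(ranges))
    Threads.@threads for i in eachindex(ranges)
        Tfac_local = LinearAlgebra.factorize(T)   # one factorization per task
        results[i] = mapreduce(vcat, blocks[ranges[i]]; init = BitVector()) do block
            toep_mul_mod2(Tfac_local, block) .% Bool
        end
    end
    reduce(vcat, results; init = BitVector())
end

If shared state really is the culprit, an alternative would be to keep a single factorization and wrap the call to `toep_mul_mod2(Tfac, block)` in a `ReentrantLock`, at the cost of serializing the multiplications.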