The creation of FFTW plans is not threadsafe https://discourse.julialang.org/t/fftw-plans-for-multiple-threads/21861. (Or is it?) However, if the plan is pre-allocated, that should be thread safe? I want to pre-compute a matrix factorization (that uses FFTW internally) and use it in multiple threads to do multiplications (which use FFT internally). I guess the issue is that the multiplication makes another plan internally.
In this example, it seems that pre-computing the factorization speeds up the multiplication by about 2x (in a single thread). However, when naively used in a multi-threaded context, the results are wrong.
The code below performs the same computation with all combinations of “threaded/not-threaded” and “with/without pre-computing factorization”. How to parallelize the non-threaded code-path with pre-factorization?
using Random, SHA, Test
using ToeplitzMatrices, LinearAlgebra, FFTW
using Folds
"""
Computes matrix vector product T*x, rounds numbers, takes mod2 and returns BitVector
"""
function toep_mul_mod2(
T::AbstractMatrix,
x::Vec,
) where {Vec<:Union{BitVector,SubArray{Bool,N,BitMatrix} where N}}
# slow for BitVector input, so we reallocate a Vector{Bool}.
# Using Vector{Bool} throughout uses too much memory in my application
round.(Int64, T * Vector(x)) .% 2 .% Int8
end
"""
Computes matrix-vector-product T*x, rounds result, take mod2 and returns BitVector
Takes as input a factorization of a matrix.
Note: In a multi-threaded context, this gives wrong results!
"""
function toep_mul_mod2(Tfac::ToeplitzMatrices.ToeplitzFactorization, x::Any)
x = convert(Vector{Float64}, collect(x)) # slow for BitVector input, so we reallocate a Vector{Bool}.
# pre-allocate result
result = zeros(Float64, length(Tfac.tmp) - length(x) + 1) # This uses a dirty hack to get the output size from the Factorization object
mul!(result, Tfac, x, 1.0, 1.0) # This use seems intended but is not well-documented
# Just doing `Tfac*x` is not implemented
# See https://github.com/JuliaLinearAlgebra/ToeplitzMatrices.jl/issues/101
round.(Int64, result) .% 2 .% Int8
end
"""
For each sub-vector of length `block_size`, compute matrix-vector-product T*x,
round result, take mod2, concatenate results again and return BitVector
"""
function toep_mul_mod2_subblocks(xbits, T; prefactorize, use_Folds, block_size = 4096)
Tfac = LinearAlgebra.factorize(T)
if use_Folds
return Folds.mapreduce(
vcat,
eachcol(reshape(xbits, block_size, :)),
init = BitVector(),
) do block
toep_mul_mod2(prefactorize ? Tfac : T, block) .% Bool
end
else
return mapreduce(
vcat,
eachcol(reshape(xbits, block_size, :)),
init = BitVector(),
) do block
toep_mul_mod2(prefactorize ? Tfac : T, block) .% Bool
end
end
end
check_hash(x::BitVector) =
(@show sha256(reinterpret(UInt8, x.chunks)) .|> Char |> String) ==
"\e\u91i4\u85«Î*Ú½Òhaã\x1eðEvä¡»\t\b§à Ë.ãr\x1fÅY"
function test_main()
# generates a random Toeplitz matrix and long BitVector
rng = MersenneTwister(42)
T = Toeplitz{Float64}(
[1.0; rand(rng, [0.0, 1.0], 1759)],
[1.0; rand(rng, [0.0, 1.0], 4095)],
)
x = rand(rng, [0, 1], 4096 * 1000) .% Bool
for prefactorize in [true, false]
for use_Folds in [true, false]
# Exception occurs only when using threading (Folds.jl) AND pre-factorizing the matrix
# The `InexactError` happens because the result of the matrix-vector multiplication is wrong.
if prefactorize && use_Folds
@show @test_throws InexactError toep_mul_mod2_subblocks(
x,
T;
use_Folds = true,
prefactorize = true,
) |> check_hash
else
@test toep_mul_mod2_subblocks(x, T; use_Folds, prefactorize) |> check_hash
end
end
end
end
test_main()