I have a generic implementation of the LU factorization of a rectangular matrix:
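# _swap_col!(X, j1, j2): swap columns j1 and j2 of X in place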
function _swap_col!(X, j1, j2)
@inbounds begin m=size(X,1);
(j1==j2 || m<1) && return nothing;
@simd for i=1:m X[i,j1], X[i,j2] = X[i,j2], X[i,j1] end
end end;
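# _swap_row!(X, i1, i2): swap rows i1 and i2 of X in place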
function _swap_row!(X, i1, i2)
@inbounds begin n=size(X,2);
(i1==i2 || n<1) && return nothing;
@simd for j=1:n X[i1,j], X[i2,j] = X[i2,j], X[i1,j] end
end end;
function _loc_pivot(X, i0, j0)
# in the first nonzero row of submatrix X[i0:end,j0:end], find the location of entry with largest absolute value
@inbounds begin m,n=size(X); d_ = abs(X[i0,j0]);
for i=i0:m j_=j0;
for j=j0:n
X[i,j]==0 && continue;
d = abs(X[i,j]);
if (X[i,j_]==0 || d_<d) j_=j; d_=d; end end;
X[i,j_]≠0 && return i,j_ end;
return i0,j0 end end;
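# factor_LU!(X, σ, τ): overwrite X in place with its LU factors (multipliers of L strictly below
# the diagonal, U on and above), pivoting rows and columns; the swaps are recorded in the
# permutation vectors σ and τ, so that the original matrix satisfies X[σ,τ] ≈ L*U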
function factor_LU!(X, σ, τ)
m,n = size(X)
mn = min(m,n)
for k=1:mn
i,j = _loc_pivot(X,k,k);
X[i,j]==0 && break; # if X[i,j] is zero, then X[k:m,k:n] are zero
if i≠k _swap_row!(X,k,i); σ[k],σ[i] = σ[i],σ[k] end
if j≠k _swap_col!(X,k,j); τ[k],τ[j] = τ[j],τ[k] end
xkk = X[k,k];
@simd for i=k+1:m X[i,k] /= xkk end
for j=k+1:n
xkj = X[k,j];
@simd for i=k+1:m
X[i,j] -= X[i,k] * xkj end end end end;
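# getLU(X): extract L (m×r unit lower trapezoidal) and U (r×n upper trapezoidal), r = min(m,n),
# from a matrix overwritten by factor_LU!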
function getLU(X::Matrix{tw}) where tw
m,n = size(X); r = min(m,n);
L=zeros(tw,m,r); U=zeros(tw,r,n);
for k=1:r L[k,k] = tw(1) end
for i=1:m, j=1:min(i-1,r) L[i,j] = X[i,j] end
for i=1:r, j=i:n U[i,j] = X[i,j] end
return L, U end;
It works efficiently and correctly:
julia> using LinearAlgebra
julia> X = rand(-1:1e-6:1,1000,1000); Y = copy(X); σ,τ = (collect(1:d) for d ∈ size(X));
julia> @time factor_LU!(Y,σ,τ); L,U=getLU(Y); norm(X[σ,τ]-L*U)
0.324735 seconds (1 allocation: 32 bytes)
3.755177227563265e-12
But for large matrices (m,n>10_000), it is still slow. So I’d like to parallelize it.
If I just put @threads in front of the for j=k+1:n loop, there is no improvement, since the threads are spawned for each k, which incurs some time penalty.
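For concreteness, here is a sketch of that naive variant (it reuses the helpers defined above; the name factor_LU_threads! is only for illustration):
function factor_LU_threads!(X, σ, τ)
    m,n = size(X); mn = min(m,n)
    for k=1:mn
        i,j = _loc_pivot(X,k,k)
        X[i,j]==0 && break
        if i≠k _swap_row!(X,k,i); σ[k],σ[i] = σ[i],σ[k] end
        if j≠k _swap_col!(X,k,j); τ[k],τ[j] = τ[j],τ[k] end
        xkk = X[k,k]
        @simd for i=k+1:m X[i,k] /= xkk end
        Threads.@threads for j=k+1:n   # a fresh batch of tasks is scheduled at every k
            xkj = X[k,j]
            for i=k+1:m X[i,j] -= X[i,k]*xkj end
        end
    end
end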
So I’d like to spawn the threads once, outside the outer loop, and reuse them repeatedly in the inner loop. My attempt, written with ChatGPT:
using Base.Threads
function factor_LU_parallel!(X, σ, τ)
@inbounds begin m,n=size(X); mn=min(m,n)
nt = nthreads()
jobs = Channel{Tuple{Int,Int,Int}}(nt)
nactive = Atomic{Int}(0)
workers = [ # spawn threads that wait to receive work
@spawn while true
job = try take!(jobs) catch; break end; k,j0,j1=job;
for j=j0:j1
xkj = X[k,j]
for i=k+1:m X[i,j] -= X[i,k]*xkj end end;
atomic_sub!(nactive,1) # subtract 1 from the number of unfinished jobs
end for _=1:nt];
for k=1:mn
i,j = _loc_pivot(X,k,k);
X[i,j]==0 && break;
if i≠k _swap_row!(X,k,i); σ[k],σ[i] = σ[i],σ[k] end
if j≠k _swap_col!(X,k,j); τ[k],τ[j] = τ[j],τ[k] end
xkk=X[k,k]; @simd for i=k+1:m X[i,k] /= xkk end
bs=max(1,cld(n-k,nt)); # split k+1:n into nt disjoint chunks
for j=k+1:bs:n # at each time, there are at most nt jobs open
atomic_add!(nactive,1);
put!(jobs, (k,j,min(j+bs-1,n))) end
while nactive[]>0 sleep(1e-9) end # wait until all jobs are done
end;
close(jobs); foreach(fetch,workers); end end;
This is slower, has lock conflicts, and gives an incorrect result:
julia> X = rand(-1:1e-6:1,1000,1000); Y = copy(X); σ,τ = (collect(1:d) for d ∈ size(X));
julia> @time factor_LU_parallel!(Y,σ,τ); L,U=getLU(Y); norm(X[σ,τ]-L*U)
2.622632 seconds (3.83 k allocations: 125.250 KiB, 0.40% gc time, 34037 lock conflicts)
3311.8813323932263
I’d appreciate help in making factor_LU! more performant.