I wish to write a function `decompose_diag_over_field!`

that, given an mxn matrix `X`

over a field, decomposes it **in-place** as `X=U*D*V`

, where `D`

is a diagonal mxn matrix and `U,V`

are invertible mxm and nxn matrices, with inverses `Ui,Vi`

.

I don’t need `U,V`

to be unitary, like in singular value decomposition. I don’t need `D[i,i]`

to divide `D[i+1,i+1]`

, like in Smith normal form. I’m just aiming for efficient computation of matrix equivalence.

I would like the type of weights/entries `tw`

to be generic (representing an arbitrary field), assuming only that functions `+, -, *, inv, zero, one`

and pivot selection are defined for that type.

My current solution:

```
using LinearAlgebra
function swap_col!(X ::Matrix{tw}, j1 ::Int, j2 ::Int) ::Nothing where {tw}
@inbounds for i = 1:size(X,1)
X[i,j1], X[i,j2] = X[i,j2], X[i,j1];
end
return nothing
end
function swap_row!(X ::Matrix{tw}, i1 ::Int, i2 ::Int) ::Nothing where {tw}
@inbounds for j = 1:size(X,2)
X[i1,j], X[i2,j] = X[i2,j], X[i1,j];
end
return nothing
end
function swap_colrow!(X ::Tuple{Vararg{Matrix{tw}}}, Y ::Tuple{Vararg{Matrix{tw}}}, v1 ::Int, v2 ::Int) ::Nothing where {tw}
#= Given composable matrices (R-linear maps) R^l <--X-- R^m <--Y-- R^n (several of them, where l,n can vary but m is fixed),
we replace basis elements e_v1,e_v2 ∈ R^m with e_v2,e_v1 and adjust all X,Y accordingly (swap v1,v2-columns in Xi and v1,v2-rows in Yj). =#
for t = 1:length(X)
swap_col!(X[t],v1,v2)
end
for t = 1:length(Y)
swap_row!(Y[t],v1,v2)
end
return nothing
end
function add_col!(X ::Matrix{tw}, a ::tw, j1::Int, j2::Int) ::Nothing where {tw}
@inbounds @simd for i = 1:size(X,1)
X[i,j2] += a*X[i,j1]
end
return nothing
end
function add_row!(X ::Matrix{tw}, a ::tw, i1 ::Int, i2 ::Int) ::Nothing where {tw}
@inbounds @simd for j = 1:size(X,2)
X[i2,j] += a*X[i1,j]
end
return nothing
end
function add_col!(X ::Tuple{Vararg{Matrix{tw}}}, Y ::Tuple{Vararg{Matrix{tw}}}, a ::tw, j1 ::Int, j2 ::Int) ::Nothing where {tw}
#= Given many composable matrices R^l <--X-- R^m <--Y-- R^n (where l,n can vary but m is fixed),
we replace the basis element e_j2 ∈ R^m with the linear combination a*e_i1 + e_i2. =#
@inbounds for t = 1:length(X)
add_col!(X[t], a, j1, j2)
end
@inbounds for t = 1:length(Y)
add_row!(Y[t], -a, j2, j1)
end
return nothing
end
function add_row!(X ::Tuple{Vararg{Matrix{tw}}}, Y ::Tuple{Vararg{Matrix{tw}}}, a ::tw, i1 ::Int, i2 ::Int) ::Nothing where {tw}
#= Given many composable matrices (R^l <--X-- R^m <--Y-- R^n (where l,n can vary but m is fixed),
we replace the basis element e_i1 ∈ R^m with the linear combination e_i1 - a*e_i2. =#
@inbounds for t = 1:length(Y)
add_row!(Y[t], a, i1, i2)
end
@inbounds for t = 1:length(X)
add_col!(X[t],-a, i2, i1)
end
return nothing
end
function elim_col_over_field!(X ::Matrix{tw}, U ::Matrix{tw}, Ui ::Matrix{tw}, i ::Int, j ::Int; prl ::Int =1) ::Nothing where tw
#= After applying this function, the only nonzero entry in j-th column of the altered matrix is X[i,j]. Throughout, U*X is preserved and U*Ui=I. =#
x ::tw = inv(X[i,j] ::tw)
_0, _1 = zero(tw), one(tw)
@assert !iszero(X[i,j])
if prl==0
for i_ = i+1:size(X,1)
X[i_,j]==_0 && continue;
add_row!((U,), (X,Ui), -X[i_,j]*x, i, i_);
X[i_,j] = _0
end
elseif prl==1
Threads.@threads for i_ = i+1:size(X,1)
X[i_,j] == _0 && continue
add_row!((U,), (X,Ui), -X[i_,j]*x, i, i_)
X[i_,j] = _0
end
else error("prl=$prl is not supported")
end
return nothing
end
function elim_row_over_field!(X ::Matrix{tw}, V ::Matrix{tw}, Vi ::Matrix{tw}, i ::Int, j ::Int; prl ::Int =1) ::Nothing where tw
#= After applying this function, the only nonzero entry in i-th row of the altered matrix is X[i,j]. Throughout, X*V is preserved and V*Vi=I. =#
x ::tw = inv(X[i,j] ::tw)
_0, _1 = zero(tw), one(tw)
@assert !iszero(X[i,j])
if prl==0
for j_ = j+1:size(X,2)
X[i,j_] == _0 && continue
add_col!((Vi,X), (V,), -X[i,j_]*x, j, j_)
X[i,j_] = _0
end
elseif prl==1
Threads.@threads for j_ = j+1:size(X,2)
X[i,j_] == _0 && continue
add_col!((Vi,X), (V,), -X[i,j_]*x, j,j_)
X[i,j_]=_0
end
else error("prl=$prl is not supported")
end
return nothing
end
function decompose_diag_over_field!(D ::Matrix{tw}; U ::Matrix{tw} =Matrix{tw}(undef,0,0), V ::Matrix{tw} =Matrix{tw}(undef,0,0),
Ui ::Matrix{tw} =Matrix{tw}(undef,0,0), Vi ::Matrix{tw} =Matrix{tw}(undef,0,0), prl ::Int =1) ::Int where tw
m,n = size(D)
_0, _1 = zero(tw), one(tw)
mn=min(m,n)
r = 0 # this will be the rank of X
@inbounds for k=1:mn # transform X into a diagonal matrix
i,j = loc_max(D,k,k) # i,j=location of pivot
iszero(D[i,j]) && break;
r = k
swap_colrow!((Vi,D), (V,), k, j)
swap_colrow!((U,), (D,Ui), k, i) # move pivot to position k,k
elim_col_over_field!(D, U, Ui, k, k; prl=prl)
elim_row_over_field!(D, V, Vi, k, k; prl=prl)
end
return r
end
function loc_max(X ::Matrix{tw}, i0 ::Int =1, j0 ::Int =1) ::Tuple{Int,Int} where {tw}
#= Return the location in X where the nonzero entry with the smallest value is. This is used to select the best pivot for Gaussian elimination. =#
m,n = size(X);
i_, j_, d_ = i0, j0, abs(X[i0,j0])
@inbounds for i=i0:m, j=j0:n
d = abs(X[i,j])
if (!iszero(X[i,j]) && (iszero(X[i_,j_]) || d_<d))
i_, j_ = i, j;
d_ = d;
end
end
return i_, j_
end
```

Notice that if I am interested only in some of U,V,Ui,Vi, then I can omit those that aren’t needed from the function call and `Matrix{tw}(undef,0,0)`

is used as a dummy argument, so that the computation finishes faster.

Trying out my implementation:

```
function create_matrices(m, n, vals)
X = rand(vals,(m,n));
U = Matrix{eltype(X)}(I,m,m)
V = Matrix{eltype(X)}(I,n,n)
return X, copy(X), U, V, copy(U), copy(V)
end
julia> X,D,U,V,Ui,Vi = create_matrices(10^3, 10^3, -2:0.00001:2);
julia> @time decompose_diag_over_field!(D,U=U,V=V,Ui=Ui,Vi=Vi,prl=0);
12.390055 seconds (3 allocations: 112 bytes)
julia> maximum(abs.(X-U*D*V))
5.262457136723242e-14
julia> X,D,U,V,Ui,Vi = create_matrices(10^3, 10^3, -2:0.00001:2);
julia> @time decompose_diag_over_field!(D,U=U,V=V,Ui=Ui,Vi=Vi,prl=1);
6.160460 seconds (148.09 k allocations: 14.468 MiB)
julia> maximum(abs.(X-U*D*V))
73.91626375378561
julia> X,_,_,_,_,_ = create_matrices(10^3, 10^3, -2:0.00001:2);
julia> @time U,D,V=svd(X); D=diagm(D);
0.354754 seconds (16 allocations: 45.899 MiB, 1.90% gc time)
julia> maximum(abs.(X-U*D*V'))
4.596323321948148e-14
```

**Questions:**

**(1)** There should be 0 allocations caused by my functions, yet there are 2M. Why?? How can I get rid of them?

*Edit:* Since I replaced `Vector{Matrix{tw}}`

with `Tuple{Vararg{Matrix{tw}}}`

, the unparallelized code has minimal allocations.

**(2)** Why is the parallelized version (`prl=1`

) incorrect? All threads read from the same col/row, but write into different cols/rows, so there should be no data race.

**(3)** How is `svd`

so incredibly fast and precise? I tried going through julia/stdlib/LinearAlgebra/src/svd.jl at master · JuliaLang/julia · GitHub but it is a bit overwhelming. I’d appreciate if anyone could give me an outline of how linear algebra operations can be made more efficient. Are there similar tricks (for matrix equivalency) when we compute over finite fields, like \mathbb{Z}_2?

**Note 1:** I wish to keep the above functions in-place/mutating, because in my case, `U,V,Ui,Vi`

will not always start as identity matrices.

**Note 2:** Forgive my condensed style of writing. It is easier for me to comprehend code like that. Also, forgive me that docstrings are inside of functions and `end`

is on the same line. Those are habits from Python that I prefer.