Benchmark code
using Random, LinearAlgebra, BlockBandedMatrices, SparseArrays
using BlockArrays: getblock
using BenchmarkTools
# Parameters
k = 2 # block size
n = 5 # number of blocks
# Create SPD blocks
function make_spd_block(k)
A = randn(k, k)
return A'A + k * I # ensure SPD
end
Bs = [make_spd_block(k) for _ = 1:n] # diagonal blocks
Cs = [randn(k, k) for _ = 1:(n-1)] # off-diagonal blocks
# Build full matrix A
function build_full_block_tridiag(Bs, Cs)
k, n = size(Bs[1], 1), length(Bs)
A = zeros(k * n, k * n)
for i = 1:n
A[(k*(i-1)+1):(k*i), (k*(i-1)+1):(k*i)] .= Bs[i]
if i < n
A[(k*(i-1)+1):(k*i), (k*i+1):(k*(i+1))] .= Cs[i]'
A[(k*i+1):(k*(i+1)), (k*(i-1)+1):(k*i)] .= Cs[i]
end
end
return Symmetric(A)
end
A_dense = build_full_block_tridiag(Bs, Cs)
b = randn(k * n)
function custom_block_cholesky(Bs, Cs, b)
n = length(Bs)
k = size(Bs[1], 1)
Ls = Vector{Matrix{Float64}}(undef, n)
Loffs = Vector{Matrix{Float64}}(undef, n - 1)
# Cholesky factorization
Ls[1] = cholesky(Bs[1]).L
for i = 2:n
# Solve Cᵢ₋₁ = Lᵢ₋₁ Lᵢ₋₁ᵗ ⇒ Loffs[i-1] = Cᵢ₋₁ * Lᵢ₋₁⁻ᵗ
Loffs[i-1] = Cs[i-1] * inv(Ls[i-1]')
S = Bs[i] - Loffs[i-1] * Loffs[i-1]'
Ls[i] = cholesky(S).L
end
# Forward substitution
y = zeros(k * n)
for i = 1:n
idx = (k*(i-1)+1):(k*i)
y[idx] .= b[idx]
if i > 1
prev_idx = (k*(i-2)+1):(k*(i-1))
y[idx] .-= Loffs[i-1] * y[prev_idx]
end
y[idx] .= Ls[i] \ y[idx]
end
# Backward substitution
x = zeros(k * n)
for i = n:-1:1
idx = (k*(i-1)+1):(k*i)
x[idx] .= y[idx]
if i < n
next_idx = (k*i+1):(k*(i+1))
x[idx] .-= Loffs[i]' * x[next_idx]
end
x[idx] .= Ls[i]' \ x[idx]
end
return x
end
function custom_block_lu(As, Bs, Cs, b)
n = length(As) # number of block rows
k = size(As[1], 1) # block size
# Storage for LU factors
Us = Vector{Matrix{Float64}}(undef, n) # Upper diagonal blocks
Ls = Vector{Matrix{Float64}}(undef, n - 1) # Lower off-diagonal blocks
# First block: U₁ = A₁
Us[1] = copy(As[1])
for i = 1:n-1
# Compute lower off-diagonal: Lᵢ = Bᵢ * Uᵢ⁻¹
Ls[i] = Bs[i] / Us[i]
# Update next diagonal block: Uᵢ₊₁ = Aᵢ₊₁ - Lᵢ * Cᵢ
Us[i+1] = As[i+1] - Ls[i] * Cs[i]
end
# Forward Substitution
y = zeros(k * n)
for i = 1:n
idx = (k*(i-1)+1):(k*i)
y[idx] = b[idx]
# Subtract contribution from previous block
if i > 1
prev_idx = (k*(i-2)+1):(k*(i-1))
y[idx] -= Ls[i-1] * y[prev_idx]
end
end
# Backward Substitution
x = zeros(k * n)
for i = n:-1:1
idx = (k*(i-1)+1):(k*i)
x[idx] = y[idx]
# Subtract contribution from next block
if i < n
next_idx = (k*i+1):(k*(i+1))
x[idx] -= Cs[i] * x[next_idx]
end
# Solve with diagonal block
x[idx] = Us[i] \ x[idx]
end
return x
end
## Method 1: Custom Block Cholesky
x_custom = custom_block_cholesky(Bs, Cs, b)
@benchmark custom_block_cholesky($Bs, $Cs, $b)
## Method 2: Custom Block LU
c_prime = [copy(cs') for cs in Cs]
x_custom_lu = custom_block_lu(Bs, Cs, c_prime, b)
@benchmark custom_block_lu($Bs, $Cs, $c_prime, $b)
## Method 3: BlockBandedMatrices.jl
block_sizes = fill(k, n)
A_bbm = BlockBandedMatrix{Float64}(undef, block_sizes, block_sizes, (1, 1))
for i = 1:n
getblock(A_bbm, i, i) .= Bs[i]
end
for i = 1:(n-1)
getblock(A_bbm, i + 1, i) .= Cs[i]
getblock(A_bbm, i, i + 1) .= Cs[i]'
end
x_bbm = A_bbm \ b
@benchmark $A_bbm \ $b
## Method 4: LinearAlgebra.lu(A::SparseMatrixCSC) \ b
A_sparse = sparse(A_dense)
x_sparse = A_sparse \ b
@benchmark $A_sparse \ $b
## Method 5: block_thomas_tridiagonal! that I wrote (lu decomposition)
result_x = zeros(k * n)
c_prime = [copy(cs') for cs in Cs]
Bs_copy = [copy(bs) for bs in Bs]
Cs_copy = [copy(cs) for cs in Cs]
b_copy = copy(b)
block_thomas_tridiagonal!(result_x, Cs, Bs_copy, c_prime, b_copy)
@benchmark block_thomas_tridiagonal!($result_x, $Cs_copy, $Bs_copy, $c_prime, $b_copy) setup = begin
c_prime = [copy(cs') for cs in Cs]
Bs_copy = [copy(bs) for bs in Bs]
Cs_copy = [copy(cs) for cs in Cs]
b_copy = copy(b)
end
## results
# Method 1: Time (mean): 35.675 μs Memory estimate: 7.58 KiB, allocs estimate: 89.
# Method 2: Time (mean): 35.928 μs Memory estimate: 7.66 KiB, allocs estimate: 91.
# Method 3: Time (mean): 86.943 μs Memory estimate: 413.56 KiB, allocs estimate: 211.
# Method 4: Time (mean): 7.275 μs Memory estimate: 7.89 KiB, allocs estimate: 54.
# Method 5: Time (mean): 1.420 μs Memory estimate: 720 bytes, allocs estimate: 9.
using PlotlyLight, PlotlyKaleido
results = [35.675, 35.928, 86.943, 7.275, 1.420]
fig = Config(x=["Custom Block Cholesky", "Custom Block LU", "BlockBandedMatrices.jl",
"SparseMatrix div b", "block_thomas_t!"], y=results, type="bar",
text=results, textposition="outside", texttemplate="%{text:.3f}",
textfont=Config(size=12)
)
layout = Config(xaxis = Config(title=Config(text="Method")),
yaxis = Config(title=Config(text="Mean time (μs)")),
margin = Config(l=60, r=40, t=10, b=40, pad=0),
font = Config(family="Arial", size=12, color="black")
)
(p = Plot(fig, layout)) |> display
PlotlyKaleido.start()
savefig(p, "comparison.png")
I’m not sure what’s going on behind BlockBandedMatrices.jl, but the fact is that its performance is very poor when solving block-tridiagonal.
When A is a sparse array, the multifrontal LU factorization algorithm is called in the background. This algorithm is specifically designed to solve linear equations of sparse matrices.
Of course, the algorithm does not fully utilize the advantages of block-tridiagonal. That is why I wrote a block_thomas_tridiagonal! myself. To maximize performance, the function is written in a very aggressive manner, modifying all input parameters in place except the upper diagonal.
Since your matrix is SPD, it would be better to replace the LU with Cholesky decomposition in block_thomas_tridiagonal!.
