Solvers for Block-Tridiagonal

Benchmark code
using Random, LinearAlgebra, BlockBandedMatrices, SparseArrays
using BlockArrays: getblock
using BenchmarkTools

# Parameters
k = 2       # block size
n = 5       # number of blocks

# Create SPD blocks
function make_spd_block(k)
    A = randn(k, k)
    return A'A + k * I  # ensure SPD
end

Bs = [make_spd_block(k) for _ = 1:n]         # diagonal blocks
Cs = [randn(k, k) for _ = 1:(n-1)]           # off-diagonal blocks

# Build full matrix A
function build_full_block_tridiag(Bs, Cs)
    k, n = size(Bs[1], 1), length(Bs)
    A = zeros(k * n, k * n)
    for i = 1:n
        A[(k*(i-1)+1):(k*i), (k*(i-1)+1):(k*i)] .= Bs[i]
        if i < n
            A[(k*(i-1)+1):(k*i), (k*i+1):(k*(i+1))] .= Cs[i]'
            A[(k*i+1):(k*(i+1)), (k*(i-1)+1):(k*i)] .= Cs[i]
        end
    end
    return Symmetric(A)
end

A_dense = build_full_block_tridiag(Bs, Cs)
b = randn(k * n)

function custom_block_cholesky(Bs, Cs, b)
    n = length(Bs)
    k = size(Bs[1], 1)
    Ls = Vector{Matrix{Float64}}(undef, n)
    Loffs = Vector{Matrix{Float64}}(undef, n - 1)

    # Cholesky factorization
    Ls[1] = cholesky(Bs[1]).L
    for i = 2:n
        # Solve Cᵢ₋₁ = Lᵢ₋₁ Lᵢ₋₁ᵗ ⇒ Loffs[i-1] = Cᵢ₋₁ * Lᵢ₋₁⁻ᵗ
        Loffs[i-1] = Cs[i-1] * inv(Ls[i-1]')
        S = Bs[i] - Loffs[i-1] * Loffs[i-1]'
        Ls[i] = cholesky(S).L
    end

    # Forward substitution 
    y = zeros(k * n)
    for i = 1:n
        idx = (k*(i-1)+1):(k*i)
        y[idx] .= b[idx]
        if i > 1
            prev_idx = (k*(i-2)+1):(k*(i-1))
            y[idx] .-= Loffs[i-1] * y[prev_idx]
        end
        y[idx] .= Ls[i] \ y[idx]
    end

    # Backward substitution 
    x = zeros(k * n)
    for i = n:-1:1
        idx = (k*(i-1)+1):(k*i)
        x[idx] .= y[idx]
        if i < n
            next_idx = (k*i+1):(k*(i+1))
            x[idx] .-= Loffs[i]' * x[next_idx]
        end
        x[idx] .= Ls[i]' \ x[idx]
    end

    return x
end

function custom_block_lu(As, Bs, Cs, b)
    n = length(As)  # number of block rows
    k = size(As[1], 1)  # block size

    # Storage for LU factors
    Us = Vector{Matrix{Float64}}(undef, n)      # Upper diagonal blocks
    Ls = Vector{Matrix{Float64}}(undef, n - 1)    # Lower off-diagonal blocks

    # First block: U₁ = A₁
    Us[1] = copy(As[1])

    for i = 1:n-1
        # Compute lower off-diagonal: Lᵢ = Bᵢ * Uᵢ⁻¹
        Ls[i] = Bs[i] / Us[i]

        # Update next diagonal block: Uᵢ₊₁ = Aᵢ₊₁ - Lᵢ * Cᵢ
        Us[i+1] = As[i+1] - Ls[i] * Cs[i]
    end

    # Forward Substitution
    y = zeros(k * n)
    for i = 1:n
        idx = (k*(i-1)+1):(k*i)
        y[idx] = b[idx]

        # Subtract contribution from previous block
        if i > 1
            prev_idx = (k*(i-2)+1):(k*(i-1))
            y[idx] -= Ls[i-1] * y[prev_idx]
        end
    end

    # Backward Substitution
    x = zeros(k * n)
    for i = n:-1:1
        idx = (k*(i-1)+1):(k*i)
        x[idx] = y[idx]

        # Subtract contribution from next block
        if i < n
            next_idx = (k*i+1):(k*(i+1))
            x[idx] -= Cs[i] * x[next_idx]
        end

        # Solve with diagonal block
        x[idx] = Us[i] \ x[idx]
    end

    return x
end

## Method 1: Custom Block Cholesky
x_custom = custom_block_cholesky(Bs, Cs, b)
@benchmark custom_block_cholesky($Bs, $Cs, $b)

## Method 2: Custom Block LU
c_prime = [copy(cs') for cs in Cs]
x_custom_lu = custom_block_lu(Bs, Cs, c_prime, b)
@benchmark custom_block_lu($Bs, $Cs, $c_prime, $b)

## Method 3: BlockBandedMatrices.jl
block_sizes = fill(k, n)
A_bbm = BlockBandedMatrix{Float64}(undef, block_sizes, block_sizes, (1, 1))
for i = 1:n
    getblock(A_bbm, i, i) .= Bs[i]
end
for i = 1:(n-1)
    getblock(A_bbm, i + 1, i) .= Cs[i]
    getblock(A_bbm, i, i + 1) .= Cs[i]'
end
x_bbm = A_bbm \ b
@benchmark $A_bbm \ $b

## Method 4: LinearAlgebra.lu(A::SparseMatrixCSC) \ b
A_sparse = sparse(A_dense)
x_sparse = A_sparse \ b
@benchmark $A_sparse \ $b

## Method 5: block_thomas_tridiagonal! that I wrote (lu decomposition)
result_x = zeros(k * n)
c_prime = [copy(cs') for cs in Cs]
Bs_copy = [copy(bs) for bs in Bs]
Cs_copy = [copy(cs) for cs in Cs]
b_copy = copy(b)
block_thomas_tridiagonal!(result_x, Cs, Bs_copy, c_prime, b_copy)
@benchmark block_thomas_tridiagonal!($result_x, $Cs_copy, $Bs_copy, $c_prime, $b_copy) setup = begin
    c_prime = [copy(cs') for cs in Cs]
    Bs_copy = [copy(bs) for bs in Bs]
    Cs_copy = [copy(cs) for cs in Cs]
    b_copy = copy(b)
end
## results                                                                          
# Method 1: Time (mean): 35.675 μs Memory estimate: 7.58 KiB, allocs estimate: 89.
# Method 2: Time (mean): 35.928 μs Memory estimate: 7.66 KiB, allocs estimate: 91.
# Method 3: Time (mean): 86.943 μs Memory estimate: 413.56 KiB, allocs estimate: 211.
# Method 4: Time (mean): 7.275 μs Memory estimate: 7.89 KiB, allocs estimate: 54.
# Method 5: Time (mean): 1.420 μs Memory estimate: 720 bytes, allocs estimate: 9.
using PlotlyLight, PlotlyKaleido
results = [35.675, 35.928, 86.943, 7.275, 1.420]
fig = Config(x=["Custom Block Cholesky", "Custom Block LU", "BlockBandedMatrices.jl",
        "SparseMatrix div b", "block_thomas_t!"], y=results, type="bar",
    text=results, textposition="outside", texttemplate="%{text:.3f}",
    textfont=Config(size=12)
)
layout = Config(xaxis  = Config(title=Config(text="Method")),
    yaxis  = Config(title=Config(text="Mean time (μs)")),
    margin = Config(l=60, r=40, t=10, b=40, pad=0),
    font   = Config(family="Arial", size=12, color="black")
)
(p = Plot(fig, layout)) |> display

PlotlyKaleido.start()
savefig(p, "comparison.png")

I’m not sure what’s going on behind BlockBandedMatrices.jl, but the fact is that its performance is very poor when solving block-tridiagonal.

When A is a sparse array, the multifrontal LU factorization algorithm is called in the background. This algorithm is specifically designed to solve linear equations of sparse matrices.

Of course, the algorithm does not fully utilize the advantages of block-tridiagonal. That is why I wrote a block_thomas_tridiagonal! myself. To maximize performance, the function is written in a very aggressive manner, modifying all input parameters in place except the upper diagonal.

Since your matrix is SPD, it would be better to replace the LU with Cholesky decomposition in block_thomas_tridiagonal!.