Block Tridiagonal Linear System Performance: Custom vs Package Implementations

I recently submitted an issue to BlockBandedMatrices.jl about this, but thought it might be interesting to the broader community.

Problem

I have symmetric block tridiagonal linear systems that should theoretically solve in O(T·n³) time, where T is the number of blocks and n is the block dimension. However, when using BlockBandedMatrices.jl, I found that SparseArrays was actually faster. This suggests either:

  1. I’m doing something wrong, or
  2. The block structure isn’t being properly exploited

I implemented custom algorithms based on Golub & Van Loan’s Matrix Computations and found them to be much faster than both approaches.
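
For concreteness, with diagonal blocks $B_i \in \mathbb{R}^{n \times n}$ and sub-diagonal blocks $C_i$, the block Cholesky factorization used in Method 1 below amounts to the recurrences

$$
L_1 L_1^\top = B_1, \qquad
L_{i,i-1} = C_{i-1} L_{i-1}^{-\top}, \qquad
L_i L_i^\top = B_i - L_{i,i-1} L_{i,i-1}^\top, \quad i = 2, \dots, T,
$$

followed by block forward and backward substitution; each step costs $O(n^3)$, giving $O(T \cdot n^3)$ in total.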

Here is an MWE (minimal working example), which I will also include as a Pluto notebook.

Setup

using Random, LinearAlgebra, BlockBandedMatrices, SparseArrays
using BlockArrays: getblock
using BenchmarkTools

# Parameters
k = 2       # block size
n = 5       # number of blocks

# Create SPD blocks
function make_spd_block(k)
    A = randn(k, k)
    return A'A + k*I  # ensure SPD
end

Bs = [make_spd_block(k) for _ in 1:n]         # diagonal blocks
Cs = [randn(k, k) for _ in 1:(n-1)]           # off-diagonal blocks

# Build full matrix A
function build_full_block_tridiag(Bs, Cs)
    k, n = size(Bs[1], 1), length(Bs)
    A = zeros(k*n, k*n)
    for i in 1:n
        A[(k*(i-1)+1):(k*i), (k*(i-1)+1):(k*i)] .= Bs[i]
        if i < n
            A[(k*(i-1)+1):(k*i), (k*i+1):(k*(i+1))] .= Cs[i]'
            A[(k*i+1):(k*(i+1)), (k*(i-1)+1):(k*i)] .= Cs[i]
        end
    end
    return Symmetric(A)
end

A_dense = build_full_block_tridiag(Bs, Cs)
b = randn(k * n)

Method 1: Custom Block Cholesky (brittle but fast)

function custom_block_cholesky(Bs, Cs, b)
    n = length(Bs)
    k = size(Bs[1], 1)
    Ls = Vector{Matrix{Float64}}(undef, n)
    Loffs = Vector{Matrix{Float64}}(undef, n - 1)
    
    # Cholesky factorization
    Ls[1] = cholesky(Bs[1]).L
    for i in 2:n
        # Solve Loffs[i-1] * Lᵢ₋₁ᵗ = Cᵢ₋₁ ⇒ Loffs[i-1] = Cᵢ₋₁ * Lᵢ₋₁⁻ᵗ
        Loffs[i-1] = Cs[i-1] / Ls[i-1]'
        S = Bs[i] - Loffs[i-1] * Loffs[i-1]'
        Ls[i] = cholesky(S).L
    end
    
    # Forward substitution 
    y = zeros(k * n)
    for i in 1:n
        idx = (k*(i-1)+1):(k*i)
        y[idx] .= b[idx]
        if i > 1
            prev_idx = (k*(i-2)+1):(k*(i-1))  
            y[idx] .-= Loffs[i-1] * y[prev_idx]
        end
        y[idx] .= Ls[i] \ y[idx]  
    end
    
    # Backward substitution 
    x = zeros(k * n)
    for i in n:-1:1
        idx = (k*(i-1)+1):(k*i)
        x[idx] .= y[idx]
        if i < n
            next_idx = (k*i+1):(k*(i+1))  
            x[idx] .-= Loffs[i]' * x[next_idx]
        end
        x[idx] .= Ls[i]' \ x[idx] 
    end
    
    return x
end

Method 2: Custom Block LU (more robust)

function custom_block_lu(As, Bs, Cs, b)
    # As = diagonal blocks, Bs = sub-diagonal blocks, Cs = super-diagonal blocks
    n = length(As)      # number of block rows
    k = size(As[1], 1)  # block size
    
    # Storage for LU factors
    Us = Vector{Matrix{Float64}}(undef, n)      # Upper diagonal blocks
    Ls = Vector{Matrix{Float64}}(undef, n-1)    # Lower off-diagonal blocks
    
    # First block: U₁ = A₁
    Us[1] = copy(As[1])
    
    for i = 1:n-1
        # Compute lower off-diagonal: Lᵢ = Bᵢ * Uᵢ⁻¹
        Ls[i] = Bs[i] / Us[i]
        
        # Update next diagonal block: Uᵢ₊₁ = Aᵢ₊₁ - Lᵢ * Cᵢ
        Us[i+1] = As[i+1] - Ls[i] * Cs[i]
    end
    
    # Forward Substitution
    y = zeros(k * n)
    for i = 1:n
        idx = (k*(i-1)+1):(k*i)
        y[idx] = b[idx]
        
        # Subtract contribution from previous block
        if i > 1
            prev_idx = (k*(i-2)+1):(k*(i-1))
            y[idx] -= Ls[i-1] * y[prev_idx]
        end
    end
    
    # Backward Substitution
    x = zeros(k * n)
    for i = n:-1:1
        idx = (k*(i-1)+1):(k*i)
        x[idx] = y[idx]
        
        # Subtract contribution from next block
        if i < n
            next_idx = (k*i+1):(k*(i+1))
            x[idx] -= Cs[i] * x[next_idx]
        end
        
        # Solve with diagonal block
        x[idx] = Us[i] \ x[idx]
    end
    
    return x
end

Method 3: BlockBandedMatrices.jl

block_sizes = fill(k, n)
A_bbm = BlockBandedMatrix{Float64}(undef, block_sizes, block_sizes, (1, 1))

for i in 1:n
    getblock(A_bbm, i, i) .= Bs[i]
end

for i in 1:(n-1)
    getblock(A_bbm, i+1, i) .= Cs[i]
    getblock(A_bbm, i, i+1) .= Cs[i]'
end

x_bbm = A_bbm \ b

Method 4: SparseArrays.jl

A_sparse = sparse(A_dense)
x_sparse = A_sparse \ b

Results

# Method 1: Custom Block Cholesky
x_custom = custom_block_cholesky(Bs, Cs, b)

# Method 2: Custom Block LU (diagonal = Bs, sub-diagonal = Cs, super-diagonal = Csᵗ)
c_prime = [copy(cs') for cs in Cs]
x_custom_lu = custom_block_lu(Bs, Cs, c_prime, b)

# Verify the solutions agree (differences should be at machine-precision level)
println("Custom Cholesky vs BlockBanded: ", norm(x_custom - x_bbm))
println("Custom Cholesky vs Sparse: ", norm(x_custom - x_sparse))
println("Sparse vs BlockBanded: ", norm(x_sparse - x_bbm))
println("Custom Cholesky vs Custom LU: ", norm(x_custom_lu - x_custom))

# Performance comparison
@btime custom_block_cholesky($Bs, $Cs, $b)
@btime $A_bbm \ $b
@btime $A_sparse \ $b
@btime custom_block_lu($Bs, $Cs, $c_prime, $b)

Questions

  1. Am I using BlockBandedMatrices.jl correctly? Is there a better way to construct or solve with these matrices?

  2. Why is SparseArrays faster than BlockBandedMatrices? Shouldn’t the block structure provide advantages?

Any insights or suggestions would be greatly appreciated!


See also previous discussions:


Benchmark code
using Random, LinearAlgebra, BlockBandedMatrices, SparseArrays
using BlockArrays: getblock
using BenchmarkTools

# Parameters
k = 2       # block size
n = 5       # number of blocks

# Create SPD blocks
function make_spd_block(k)
    A = randn(k, k)
    return A'A + k * I  # ensure SPD
end

Bs = [make_spd_block(k) for _ = 1:n]         # diagonal blocks
Cs = [randn(k, k) for _ = 1:(n-1)]           # off-diagonal blocks

# Build full matrix A
function build_full_block_tridiag(Bs, Cs)
    k, n = size(Bs[1], 1), length(Bs)
    A = zeros(k * n, k * n)
    for i = 1:n
        A[(k*(i-1)+1):(k*i), (k*(i-1)+1):(k*i)] .= Bs[i]
        if i < n
            A[(k*(i-1)+1):(k*i), (k*i+1):(k*(i+1))] .= Cs[i]'
            A[(k*i+1):(k*(i+1)), (k*(i-1)+1):(k*i)] .= Cs[i]
        end
    end
    return Symmetric(A)
end

A_dense = build_full_block_tridiag(Bs, Cs)
b = randn(k * n)

function custom_block_cholesky(Bs, Cs, b)
    n = length(Bs)
    k = size(Bs[1], 1)
    Ls = Vector{Matrix{Float64}}(undef, n)
    Loffs = Vector{Matrix{Float64}}(undef, n - 1)

    # Cholesky factorization
    Ls[1] = cholesky(Bs[1]).L
    for i = 2:n
        # Solve Loffs[i-1] * Lᵢ₋₁ᵗ = Cᵢ₋₁ ⇒ Loffs[i-1] = Cᵢ₋₁ * Lᵢ₋₁⁻ᵗ
        Loffs[i-1] = Cs[i-1] / Ls[i-1]'
        S = Bs[i] - Loffs[i-1] * Loffs[i-1]'
        Ls[i] = cholesky(S).L
    end

    # Forward substitution 
    y = zeros(k * n)
    for i = 1:n
        idx = (k*(i-1)+1):(k*i)
        y[idx] .= b[idx]
        if i > 1
            prev_idx = (k*(i-2)+1):(k*(i-1))
            y[idx] .-= Loffs[i-1] * y[prev_idx]
        end
        y[idx] .= Ls[i] \ y[idx]
    end

    # Backward substitution 
    x = zeros(k * n)
    for i = n:-1:1
        idx = (k*(i-1)+1):(k*i)
        x[idx] .= y[idx]
        if i < n
            next_idx = (k*i+1):(k*(i+1))
            x[idx] .-= Loffs[i]' * x[next_idx]
        end
        x[idx] .= Ls[i]' \ x[idx]
    end

    return x
end

function custom_block_lu(As, Bs, Cs, b)
    # As = diagonal blocks, Bs = sub-diagonal blocks, Cs = super-diagonal blocks
    n = length(As)      # number of block rows
    k = size(As[1], 1)  # block size

    # Storage for LU factors
    Us = Vector{Matrix{Float64}}(undef, n)      # Upper diagonal blocks
    Ls = Vector{Matrix{Float64}}(undef, n - 1)    # Lower off-diagonal blocks

    # First block: U₁ = A₁
    Us[1] = copy(As[1])

    for i = 1:n-1
        # Compute lower off-diagonal: Lᵢ = Bᵢ * Uᵢ⁻¹
        Ls[i] = Bs[i] / Us[i]

        # Update next diagonal block: Uᵢ₊₁ = Aᵢ₊₁ - Lᵢ * Cᵢ
        Us[i+1] = As[i+1] - Ls[i] * Cs[i]
    end

    # Forward Substitution
    y = zeros(k * n)
    for i = 1:n
        idx = (k*(i-1)+1):(k*i)
        y[idx] = b[idx]

        # Subtract contribution from previous block
        if i > 1
            prev_idx = (k*(i-2)+1):(k*(i-1))
            y[idx] -= Ls[i-1] * y[prev_idx]
        end
    end

    # Backward Substitution
    x = zeros(k * n)
    for i = n:-1:1
        idx = (k*(i-1)+1):(k*i)
        x[idx] = y[idx]

        # Subtract contribution from next block
        if i < n
            next_idx = (k*i+1):(k*(i+1))
            x[idx] -= Cs[i] * x[next_idx]
        end

        # Solve with diagonal block
        x[idx] = Us[i] \ x[idx]
    end

    return x
end

## Method 1: Custom Block Cholesky
x_custom = custom_block_cholesky(Bs, Cs, b)
@benchmark custom_block_cholesky($Bs, $Cs, $b)

## Method 2: Custom Block LU
c_prime = [copy(cs') for cs in Cs]
x_custom_lu = custom_block_lu(Bs, Cs, c_prime, b)
@benchmark custom_block_lu($Bs, $Cs, $c_prime, $b)

## Method 3: BlockBandedMatrices.jl
block_sizes = fill(k, n)
A_bbm = BlockBandedMatrix{Float64}(undef, block_sizes, block_sizes, (1, 1))
for i = 1:n
    getblock(A_bbm, i, i) .= Bs[i]
end
for i = 1:(n-1)
    getblock(A_bbm, i + 1, i) .= Cs[i]
    getblock(A_bbm, i, i + 1) .= Cs[i]'
end
x_bbm = A_bbm \ b
@benchmark $A_bbm \ $b

## Method 4: SparseArrays.jl (A_sparse \ b)
A_sparse = sparse(A_dense)
x_sparse = A_sparse \ b
@benchmark $A_sparse \ $b

## Method 5: block_thomas_tridiagonal! that I wrote (LU decomposition)
# The function mutates its inputs, so each evaluation needs fresh copies.
# The mutated arguments must NOT be interpolated with `$`, or the setup
# copies would be ignored; `evals = 1` keeps one evaluation per setup.
result_x = zeros(k * n)
block_thomas_tridiagonal!(result_x, [copy(cs) for cs in Cs],
    [copy(bs) for bs in Bs], [copy(cs') for cs in Cs], copy(b))
@benchmark block_thomas_tridiagonal!(result_x, Cs_copy, Bs_copy, c_prime, b_copy) setup = begin
    result_x = zeros(k * n)
    Cs_copy = [copy(cs) for cs in Cs]
    Bs_copy = [copy(bs) for bs in Bs]
    c_prime = [copy(cs') for cs in Cs]
    b_copy = copy(b)
end evals = 1
## Results
# Method 1: Time (mean): 35.675 μs Memory estimate: 7.58 KiB, allocs estimate: 89.
# Method 2: Time (mean): 35.928 μs Memory estimate: 7.66 KiB, allocs estimate: 91.
# Method 3: Time (mean): 86.943 μs Memory estimate: 413.56 KiB, allocs estimate: 211.
# Method 4: Time (mean): 7.275 μs Memory estimate: 7.89 KiB, allocs estimate: 54.
# Method 5: Time (mean): 1.420 μs Memory estimate: 720 bytes, allocs estimate: 9.
using PlotlyLight, PlotlyKaleido
results = [35.675, 35.928, 86.943, 7.275, 1.420]
fig = Config(x=["Custom Block Cholesky", "Custom Block LU", "BlockBandedMatrices.jl",
        "SparseMatrix div b", "block_thomas_t!"], y=results, type="bar",
    text=results, textposition="outside", texttemplate="%{text:.3f}",
    textfont=Config(size=12)
)
layout = Config(xaxis  = Config(title=Config(text="Method")),
    yaxis  = Config(title=Config(text="Mean time (μs)")),
    margin = Config(l=60, r=40, t=10, b=40, pad=0),
    font   = Config(family="Arial", size=12, color="black")
)
(p = Plot(fig, layout)) |> display

PlotlyKaleido.start()
savefig(p, "comparison.png")

I’m not sure what’s going on behind BlockBandedMatrices.jl, but the fact is that its performance is very poor when solving block-tridiagonal systems.

When A is a sparse matrix, the backslash solve dispatches to SuiteSparse in the background: UMFPACK’s multifrontal LU in general, and CHOLMOD’s sparse Cholesky when the matrix is detected to be Hermitian, as it is here. These algorithms are specifically designed for solving sparse linear systems.
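
For repeated solves with the same matrix, one can also factorize explicitly and reuse the factor (a small sketch using the A_sparse and b from the MWE; the Symmetric wrapper routes to CHOLMOD, while lu(A_sparse) would give UMFPACK's multifrontal LU instead):

F = cholesky(Symmetric(A_sparse))  # sparse Cholesky factorization (CHOLMOD)
x = F \ b                          # reuse F to amortize the factorization over many right-hand sides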

Of course, that algorithm does not fully exploit the block-tridiagonal structure. That is why I wrote a block_thomas_tridiagonal! myself. To maximize performance, the function is written very aggressively, modifying all of its inputs in place except the upper-diagonal blocks.

Since your matrix is SPD, it would be better to replace the LU with Cholesky decomposition in block_thomas_tridiagonal!.
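
The definition of block_thomas_tridiagonal! was not included above, so here is a minimal sketch consistent with the call signature (x, sub-diagonal, diagonal, super-diagonal, rhs) and the in-place description. This is a hedged reconstruction, not the original code; the real implementation presumably avoids the temporaries below to reach the reported 9 allocations.

# Block Thomas elimination with LU-style block solves.
# Overwrites As (with multipliers), Bs (with Schur complements) and d;
# leaves the super-diagonal blocks Cs untouched.
function block_thomas_tridiagonal!(x, As, Bs, Cs, d)
    n = length(Bs)
    k = size(Bs[1], 1)
    # forward elimination
    for i in 2:n
        As[i-1] = As[i-1] / Bs[i-1]            # Wᵢ = Aᵢ₋₁ Bᵢ₋₁⁻¹ (block multiplier)
        Bs[i] -= As[i-1] * Cs[i-1]             # Bᵢ <- Bᵢ - Wᵢ Cᵢ₋₁
        di, dp = (k*(i-1)+1):(k*i), (k*(i-2)+1):(k*(i-1))
        d[di] -= As[i-1] * d[dp]               # dᵢ <- dᵢ - Wᵢ dᵢ₋₁
    end
    # back substitution
    idx = (k*(n-1)+1):(k*n)
    x[idx] = Bs[n] \ d[idx]                    # xₙ = Bₙ⁻¹ dₙ
    for i in (n-1):-1:1
        idx, nxt = (k*(i-1)+1):(k*i), (k*i+1):(k*(i+1))
        x[idx] = Bs[i] \ (d[idx] - Cs[i] * x[nxt])  # xᵢ = Bᵢ⁻¹ (dᵢ - Cᵢ xᵢ₊₁)
    end
    return x
end

As suggested, for the SPD case the Bs[i] solves above could go through cholesky factors instead of LU.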


Did you check the precision of the solver?

I addressed this in the issue you filed: LinearSolve of Block-Tridiagonal matrix is slower than Sparse Arrays · Issue #225 · JuliaLinearAlgebra/BlockBandedMatrices.jl · GitHub

Basically, there are performance issues in BlockArrays.jl related to views, caused by calls to axes allocating. A fix would improve the whole suite of block-array packages, but I don’t really have time to work on this right now, so for block arrays to achieve the performance they should, someone would need to take the lead on it.
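
One way to see this kind of overhead on your own machine (a hypothetical micro-benchmark, assuming BlockArrays’ Block indexing and the A_bbm from the MWE):

using BlockArrays, BenchmarkTools
@btime view($A_bbm, Block(2, 1));  # nonzero allocations here hint at the axes issue
@btime axes($A_bbm);               # constructing the blocked axes themselves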


Interestingly, on my machine, the custom solvers I wrote were faster than the sparse solution, by a good margin as well.

  Custom Cholesky: 13.900 μs (178 allocations: 8.08 KiB)
  Custom LU: 17.300 μs (182 allocations: 8.08 KiB)
  BlockBandedMatrices: 77.600 μs (394 allocations: 412.82 KiB)
  SparseArrays: 30.300 μs (60 allocations: 7.42 KiB)

Beside the point, but interesting nonetheless. Thank you for pointing me to your algorithm; I was hoping I just wasn’t using the package properly, but based on the later discussion this seems to be a larger known problem. It would be great to eventually get support for this in a package, since the post history provided by @stevengj suggests that independent minds keep rediscovering this.

Good to know. I’d definitely be interested in contributing, as I was hoping to integrate your package into one I’m currently developing that makes heavy use of block tridiagonal matrices. I’ll read up on the issue history and maybe I can get started on a future PR.

All the solutions I checked were within machine precision of one another. I can’t say I religiously checked across a bunch of conditions, but at least in the few test cases I tried they were all identical.
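
For reference, a quick way to quantify this with the objects from the MWE (a simple check, not what I originally ran):

# pairwise differences and a true residual; both should sit near machine epsilon
println(norm(x_custom - x_sparse) / norm(x_sparse))
println(norm(x_custom_lu - x_sparse) / norm(x_sparse))
println(norm(A_dense * x_custom - b) / norm(b))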

Cool! One solution is to rewrite the algorithms without using views.
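
For example, the custom solvers above already sidestep block views by storing the blocks as plain matrices; in the same spirit, the remaining slice allocations in the forward substitution could be removed with preallocated buffers (a sketch, assuming the Ls, Loffs, b, k and n from Method 1):

yi = [Vector{Float64}(undef, k) for _ in 1:n]             # preallocated block pieces
for i in 1:n
    copyto!(yi[i], @view b[(k*(i-1)+1):(k*i)])
    i > 1 && mul!(yi[i], Loffs[i-1], yi[i-1], -1.0, 1.0)  # yᵢ -= Loffsᵢ₋₁ yᵢ₋₁, in place
    ldiv!(Ls[i], yi[i])                                   # in-place triangular solve
end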

Though I’ve decided that making BlockBandedMatrix a special case of BlockSkylineMatrix was a bad idea. So probably a first task is to move BlockSkylineMatrix to its own package…


It may be related to the CPU and Julia version. I used an AMD Ryzen 9 7950X and Julia 1.10.9.

Okay, yeah, probably the Julia version or CPU. This was all on 1.11.5 and an Intel i9-12900HK for me. Thanks!