Hi! I’m an undergrad and I’m trying to replicate a result from a paper for a fast matrix-matrix multiplication algorithm (specifically the Bernstein lift matrix). Unfortunately, my code seems some 2-3x slower than what’s expected, compared to a naive matrix-matrix multiplication.
Here’s a (somewhat lengthy) MWE with precomputed values:
using SparseArrays
using BenchmarkTools
using LinearAlgebra
const L0 = sprand(36, 36, 0.164)
const tri_offset_table = (
    (0, 2, 3, 3, 2, 0, -3, -7, -12, -18, -25, -33, -42, -52, -63, -75, -88, -102, -117, -133, -150), 
    (0, 3, 5, 6, 6, 5, 3, 0, -4, -9, -15, -22, -30, -39, -49, -60, -72, -85, -99, -114, -130), 
    (0, 4, 7, 9, 10, 10, 9, 7, 4, 0, -5, -11, -18, -26, -35, -45, -56, -68, -81, -95, -110), 
    (0, 5, 9, 12, 14, 15, 15, 14, 12, 9, 5, 0, -6, -13, -21, -30, -40, -51, -63, -76, -90), 
    (0, 6, 11, 15, 18, 20, 21, 21, 20, 18, 15, 11, 6, 0, -7, -15, -24, -34, -45, -57, -70), 
    (0, 7, 13, 18, 22, 25, 27, 28, 28, 27, 25, 22, 18, 13, 7, 0, -8, -17, -27, -38, -50), 
    (0, 8, 15, 21, 26, 30, 33, 35, 36, 36, 35, 33, 30, 26, 21, 15, 8, 0, -9, -19, -30), 
    (0, 9, 17, 24, 30, 35, 39, 42, 44, 45, 45, 44, 42, 39, 35, 30, 24, 17, 9, 0, -10), 
    (0, 10, 19, 27, 34, 40, 45, 49, 52, 54, 55, 55, 54, 52, 49, 45, 40, 34, 27, 19, 10), 
    (0, 11, 21, 30, 38, 45, 51, 56, 60, 63, 65, 66, 66, 65, 63, 60, 56, 51, 45, 38, 30)) 
const l_j_table = (-3.5, 7.0, -8.75, 7.0, -3.5, 1.0, -0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
function reduction_multiply!(out, N, x, offset)
    row = 1
    @inbounds @fastmath for j in 0:N-1
        for i in 0:N-1-j
            k = N-1-i-j
            linear_index1 = i+1 + offset[j+1] + 1
            linear_index2 = i + offset[j+2] + 1
            linear_index3 = i + offset[j+1] + 1
            val = muladd((i+1), x[linear_index1], 
            muladd((j+1), x[linear_index2],
            muladd((k+1), x[linear_index3], 0.0)))
            out[row] = val/N
            row += 1
        end
    end
    return out
end
# E is a preallocated vector to perform some operations. N is the polynomial order.
function matrix_vector_multiply!(out, N, x, E)
    @inbounds @fastmath begin 
        LinearAlgebra.mul!(E, L0, x)
        index1 = div((N + 1) * (N + 2), 2)
        view(out,1:index1) .= E
        reduction_multiply!(E, N, E, tri_offset_table[N])
        for j in 1:N
            diff = div((N + 1 - j) * (N + 2 - j), 2)
            for i in 1:diff
                out[index1 + i] = l_j_table[j] * E[i]
            end
            if j < N
                index1 += diff
                reduction_multiply!(E, N-j, E, tri_offset_table[N-j])
            end
        end
    end
    return out
end
function matrix_matrix_multiply!(out, N, x, E)
    @simd for n in axes(x,2)
        @inbounds matrix_vector_multiply!(view(out,:,n), N, view(x,:,n), E)
    end
    return out
end
N = 7
Np2 = div((N + 1) * (N + 2), 2)
Np3 = div((N + 1) * (N + 2) * (N + 3), 6)
@benchmark matrix_vector_multiply!($(zeros(Np3)), $(N), $(rand(Float64, Np2)), $(zeros(Np2)))
@benchmark matrix_matrix_multiply!($(zeros(Np3, Np3)), $(N), $(rand(Float64, Np2, Np3)), $(zeros(Np2)))
I’m not quite sure if my matrix_matrix_multiply! function is the best way to stitch together the matrix-vector multiplications. When I compared to the mul! function, I found that my code is much faster for the matrix-vector multiplication, but slower for the matrix-matrix multiplication. Is there some better way to get a matrix-matrix multiplication from matrix-vector multiplication?
I’d appreciate any other pointers for where I might look to improve this! Thanks!