Matrix multiplication with Duals

I was wondering if I could make this function faster. XC_dual would come in as a Dual with 10-100 Partials. Here is the MWE:

using LinearAlgebra, Random, ForwardDiff, Test, BenchmarkTools
# Set a seed for reproducibility.
Random.seed!(1234)

# Define dimensions
nb = 4      # number of rows (for IDC[1] and DC[1])
nk = 4      # dimension for DC[2]
ny = 3      # dimension for DC[3]
M = nb * nk * ny  # total number of elements

# Create dummy matrices for DC and IDC.
# DC[1] is nb×nb, DC[2] is nk×nk, and DC[3] is ny×ny.
DC = [rand(nb, nb), rand(nk, nk), rand(ny, ny)]
IDC = [rand(nb, nb)]

# Create a simple identity mapping for compressionIndexes.
compressionIndexes = collect(1:M)

# Create a dummy vector XC of real numbers.
XC = rand(M)
XC_dual = [ForwardDiff.Dual(x, (rand(100)...)) for x in XC]

# ------------------------------------------------------------------
# (1) Original uncompress function (generic version)
function uncompress_generic(compressionIndexes, XC, DC, IDC)
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)

    θ1 = zeros(eltype(XC), nb, nk, ny)
    for j in eachindex(XC)
        θ1[compressionIndexes[j]] = copy(XC[j])
    end
    @views for yy = 1:ny
        θ1[:, :, yy] = IDC[1] * θ1[:, :, yy] * DC[2]
    end
    @views for bb = 1:nb
        θ1[bb, :, :] = θ1[bb, :, :] * DC[3]
    end
    θ = reshape(θ1, nb * nk * ny)
    return θ
end

@btime uncompress_generic($compressionIndexes, $XC_dual, $DC, $IDC);

I tried to split the real part from the partials, but I guess * is quite optimized s.t. it’s not necessary. Any suggestions?

function uncompress_optimized(compressionIndexes, XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)
    θ1 = zeros(eltype(XC), nb, nk, ny)

    # Populate the 3D array using compressionIndexes.
    for j in eachindex(XC)
        θ1[compressionIndexes[j]] = XC[j]
    end

    temp1 = similar(θ1[:, :, 1])
    temp2 = similar(θ1[:, :, 1])

    @inbounds for yy = 1:ny
        # Multiply: temp1 = IDC[1] * θ1[:, :, yy]
        mul!(temp1, IDC[1], θ1[:, :, yy])
        # Multiply: temp2 = temp1 * DC[2]
        mul!(temp2, temp1, DC[2])
        # Write the result back.
        θ1[:, :, yy] .= temp2
    end

    temp = similar(θ1[1, :, :])
    @inbounds for bb = 1:nb
        # Multiply: temp = θ1[bb, :, :] * DC[3]
        mul!(temp, θ1[bb, :, :], DC[3])
        # Write the result back.
        θ1[bb, :, :] .= temp
    end

    return reshape(θ1, nb * nk * ny)
end

In the end, this gave me a pretty good speed up and reduced allocations.

original: 4.547 ms (35182 allocations: 43.06 MiB)

optimized:  335.292 μs (70 allocations: 992.28 KiB)

I’m not sufficiently familiar with the specific implementation of dual numbers in julia to know specific improvements there, but I can point out some general performance hints:

  • If you are benchmarking, try to make sure that you don’t use any variables from global scope. I’ve written everything in a function instead, and the benchmarks come a bit closer together already. (In general, it is better to avoid putting non const variables in global scope).
  • I wrote an uncompress_further_optimized which squeezes out a bit more by using views (as slices allocate).
  • A further improvement may be possible by filling theta1 with undef values (using similar) instead of all zeros if you don’t need those zeros (it feels like you’re populating it in a for-loop afterwards?).
  • iirc, looping over the second index of theta1 should give more efficient memory access (if I recall correctly for 3d arrays), but your loops are over the first and third index. Perhaps you can rewrite things a bit to have a loop over the second axis and one over the first axis?

My version:

using LinearAlgebra, Random, ForwardDiff, Test, BenchmarkTools

# ------------------------------------------------------------------
# (1) Original uncompress function (generic version)
function uncompress_generic(compressionIndexes, XC, DC, IDC)
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)

    θ1 = zeros(eltype(XC), nb, nk, ny)
    for j in eachindex(XC)
        θ1[compressionIndexes[j]] = copy(XC[j])
    end
    @views for yy = 1:ny
        θ1[:, :, yy] = IDC[1] * θ1[:, :, yy] * DC[2]
    end
    @views for bb = 1:nb
        θ1[bb, :, :] = θ1[bb, :, :] * DC[3]
    end
    θ = reshape(θ1, nb * nk * ny)
    return θ
end

function uncompress_optimized(compressionIndexes, XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)
    θ1 = zeros(eltype(XC), nb, nk, ny)

    # Populate the 3D array using compressionIndexes.
    for j in eachindex(XC)
        θ1[compressionIndexes[j]] = XC[j]
    end

    temp1 = similar(θ1[:, :, 1])
    temp2 = similar(θ1[:, :, 1])

    @inbounds for yy = 1:ny
        # Multiply: temp1 = IDC[1] * θ1[:, :, yy]
        mul!(temp1, IDC[1], θ1[:, :, yy])
        # Multiply: temp2 = temp1 * DC[2]
        mul!(temp2, temp1, DC[2])
        # Write the result back.
        θ1[:, :, yy] .= temp2
    end

    temp = similar(θ1[1, :, :])
    @inbounds for bb = 1:nb
        # Multiply: temp = θ1[bb, :, :] * DC[3]
        mul!(temp, θ1[bb, :, :], DC[3])
        # Write the result back.
        θ1[bb, :, :] .= temp
    end

    return reshape(θ1, nb * nk * ny)
end

function uncompress_further_optimized(compressionIndexes, XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)
    θ1 = zeros(eltype(XC), nb, nk, ny)

    # Populate the 3D array using compressionIndexes.
    @inbounds for j in eachindex(XC)
        θ1[compressionIndexes[j]] = XC[j]
    end

    @views temp1 = similar(θ1[:, :, 1])
    @views temp2 = similar(θ1[:, :, 1])

    @inbounds @views for yy = 1:ny
        # Multiply: temp1 = IDC[1] * θ1[:, :, yy]
        mul!(temp1, IDC[1], θ1[:, :, yy])
        # Multiply: temp2 = temp1 * DC[2]
        mul!(temp2, temp1, DC[2])
        # Write the result back.
        θ1[:, :, yy] .= temp2
    end

    @views temp = similar(θ1[1, :, :])
    @inbounds @views for bb = 1:nb
        # Multiply: temp = θ1[bb, :, :] * DC[3]
        mul!(temp, θ1[bb, :, :], DC[3])
        # Write the result back.
        θ1[bb, :, :] .= temp
    end

    return reshape(θ1, nb * nk * ny)
end

function to_test()
    # Set a seed for reproducibility.
    Random.seed!(1234)

    # Define dimensions
    nb = 4      # number of rows (for IDC[1] and DC[1])
    nk = 4      # dimension for DC[2]
    ny = 3      # dimension for DC[3]
    M = nb * nk * ny  # total number of elements

    # Create dummy matrices for DC and IDC.
    # DC[1] is nb×nb, DC[2] is nk×nk, and DC[3] is ny×ny.
    DC = [rand(nb, nb), rand(nk, nk), rand(ny, ny)]
    IDC = [rand(nb, nb)]

    # Create a simple identity mapping for compressionIndexes.
    compressionIndexes = collect(1:M)

    # Create a dummy vector XC of real numbers.
    XC = rand(M)
    XC_dual = [ForwardDiff.Dual(x, (rand(100)...)) for x in XC]
    uncompress_generic(compressionIndexes, XC_dual, DC, IDC);
    uncompress_optimized(compressionIndexes, XC_dual, DC, IDC);
    uncompress_further_optimized(compressionIndexes, XC_dual, DC, IDC);

    display(@benchmark uncompress_generic($compressionIndexes, $XC_dual, $DC, $IDC))
    display(@benchmark uncompress_optimized($compressionIndexes, $XC_dual, $DC, $IDC))
    display(@benchmark uncompress_further_optimized($compressionIndexes, $XC_dual, $DC, $IDC))
end

to_test()

gives

BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  44.333 μs …   4.543 ms  ┊ GC (min … max):  0.00% … 98.35%
 Time  (median):     61.458 μs               ┊ GC (median):     0.00%
 Time  (mean ± σ):   71.112 μs ± 119.149 μs  ┊ GC (mean ± σ):  12.97% ±  7.70%

             ▅█▅▁                                               
  ▂▄▄▃▃▃▂▂▃▄▇████▆▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂ ▃
  44.3 μs         Histogram: frequency by time          125 μs <

 Memory estimate: 303.61 KiB, allocs estimate: 61.
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  40.542 μs …   4.622 ms  ┊ GC (min … max):  0.00% … 98.36%
 Time  (median):     59.375 μs               ┊ GC (median):     0.00%
 Time  (mean ± σ):   70.607 μs ± 130.289 μs  ┊ GC (mean ± σ):  15.08% ±  8.13%

            ▅█▆▃                                                
  ▅▅▃▃▃▃▂▂▃▇████▇▅▄▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▁▂▂ ▃
  40.5 μs         Histogram: frequency by time          135 μs <

 Memory estimate: 335.55 KiB, allocs estimate: 57.
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  38.208 μs …   4.101 ms  ┊ GC (min … max):  0.00% … 98.27%
 Time  (median):     51.917 μs               ┊ GC (median):     0.00%
 Time  (mean ± σ):   60.204 μs ± 110.552 μs  ┊ GC (mean ± σ):  12.14% ±  6.62%

             ▅█▆▆▅▂                                             
  ▃▆▅▅▅▄▃▃▃▅████████▆▅▄▄▄▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂ ▃
  38.2 μs         Histogram: frequency by time         97.2 μs <

 Memory estimate: 222.86 KiB, allocs estimate: 54.
1 Like

Slight (stylistic) improvement:

function uncompress_further_optimized(compressionIndexes, XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)    
    θ1 = Array{eltype(XC)}(undef, nb, nk, ny)

    # Populate the 3D array using compressionIndexes.
    @inbounds for j in eachindex(XC)
        θ1[compressionIndexes[j]] = XC[j]
    end

    @views temp1 = similar(θ1[:, :, 1])
    @views temp2 = similar(θ1[:, :, 1])

    @inbounds @views for yy in axes(θ1, 3)
        # Multiply: temp1 = IDC[1] * θ1[:, :, yy]
        mul!(temp1, IDC[1], θ1[:, :, yy])
        # Multiply: temp2 = temp1 * DC[2]
        mul!(temp2, temp1, DC[2])
        # Write the result back.
        θ1[:, :, yy] .= temp2
    end

    @views temp = similar(θ1[1, :, :])
    @inbounds @views for bb in axes(θ1, 1)
        # Multiply: temp = θ1[bb, :, :] * DC[3]
        mul!(temp, θ1[bb, :, :], DC[3])
        # Write the result back.
        θ1[bb, :, :] .= temp
    end

    return reshape(θ1, nb * nk * ny)
end

This is the best you can get for non-dual types

using LinearAlgebra, Random, ForwardDiff, Test, BenchmarkTools,LoopVectorization

# ------------------------------------------------------------------
# (1) Original uncompress function (generic version)
function uncompress_generic(compressionIndexes, XC, DC, IDC)
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)

    θ1 = zeros(eltype(XC), nb, nk, ny)
    for j in eachindex(XC)
        θ1[compressionIndexes[j]] = copy(XC[j])
    end
    @views for yy = 1:ny
        θ1[:, :, yy] = IDC[1] * θ1[:, :, yy] * DC[2]
    end
    @views for bb = 1:nb
        θ1[bb, :, :] = θ1[bb, :, :] * DC[3]
    end
    θ = reshape(θ1, nb * nk * ny)
    return θ
end

function uncompress_further_optimized(compressionIndexes, XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)
    θ1 = reshape(XC, nb, nk, ny)
    @inbounds IDC1 = IDC[1]
    @inbounds DC2 = DC[2]
    @inbounds DC3 = DC[3]
    @tullio temp[i,j,l] := IDC1[i,r]*θ1[r,k,ll]*DC2[k,j]*DC3[ll,l]
    return vec(temp)
end

function to_test()
    # Set a seed for reproducibility.
    Random.seed!(1234)

    # Define dimensions
    nb = 4      # number of rows (for IDC[1] and DC[1])
    nk = 4      # dimension for DC[2]
    ny = 3      # dimension for DC[3]
    M = nb * nk * ny  # total number of elements

    # Create dummy matrices for DC and IDC.
    # DC[1] is nb×nb, DC[2] is nk×nk, and DC[3] is ny×ny.
    DC = [rand(nb, nb), rand(nk, nk), rand(ny, ny)]
    IDC = [rand(nb, nb)]

    # Create a simple identity mapping for compressionIndexes.
    compressionIndexes = collect(1:M)

    # Create a dummy vector XC of real numbers.
    XC = rand(M)
    XC_dual = [ForwardDiff.Dual(x, (rand(100)...)) for x in XC]
    a = uncompress_generic(compressionIndexes, XC_dual, DC, IDC);
    c = uncompress_further_optimized(compressionIndexes, XC_dual, DC, IDC);
    @info a ≈ c
    display(@benchmark uncompress_generic($compressionIndexes, $XC, $DC, $IDC))
    display(@benchmark uncompress_further_optimized($compressionIndexes, $XC, $DC, $IDC))
    return a
end

to_test();

giving

[ Info: true
BenchmarkTools.Trial: 10000 samples with 10 evaluations per sample.
 Range (min … max):  1.130 μs …  26.900 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     1.810 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   1.906 μs ± 729.199 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

            ▁▅▅█▆█▄▂▃
  ▂▂▂▁▂▃▅▅▅██████████▇▇▅▆▄▄▃▃▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▃  
  1.13 μs         Histogram: frequency by time        3.75 μs <  

 Memory estimate: 2.73 KiB, allocs estimate: 31.
BenchmarkTools.Trial: 10000 samples with 280 evaluations per sample.
 Range (min … max):  247.500 ns … 180.116 μs  ┊ GC (min … max):  0.00% … 99.27%
 Time  (median):     439.286 ns               ┊ GC (median):     0.00%
 Time  (mean ± σ):   539.191 ns ±   2.750 μs  ┊ GC (mean ± σ):  12.34% ±  2.80%

         ▅██▂▂
  █▇▅▃▃▃▄█████▆▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂ ▃
  248 ns           Histogram: frequency by time          1.5 μs <

 Memory estimate: 576 bytes, allocs estimate: 4.

but it will be really slow for Dual types for some reason, I guess you would have to keep the other form for this.

1 Like

One more trick is to avoid using the temporary allocations at all and simply write to theta directly by unfolding in a triple for-loop, accessing the memory in an efficient order. I’ve made this change for the last for-loop, and if I didn’t make a linear algebra mistake, that is again a bit faster; the other loop should allow you to get a similar speed-up (at least by eliminating temp2)

function uncompress_further_optimized(compressionIndexes, XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)
    θ1 = Array{eltype(XC)}(undef, nb, nk, ny)

    # Populate the 3D array using compressionIndexes.
    @inbounds for j in eachindex(XC)
        θ1[compressionIndexes[j]] = XC[j]
    end

    @views temp1 = similar(θ1[:, :, 1])
    @views temp2 = similar(θ1[:, :, 1])

    @inbounds @views for yy in axes(θ1, 3)
        # Multiply: temp1 = IDC[1] * θ1[:, :, yy]
        mul!(temp1, IDC[1], θ1[:, :, yy])
        # Multiply: temp2 = temp1 * DC[2]
        mul!(temp2, temp1, DC[2])
        # Write the result back.
        θ1[:, :, yy] .= temp2
    end

    @inbounds @views for i in axes(θ1, 2), bb in axes(θ1, 1), k in axes(θ1, 3)
        θ1[bb, i, k] = θ1[bb, i, :] ⋅ DC[3][:, k]
    end

    return reshape(θ1, nb * nk * ny)
end

gives

BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  35.625 μs …  4.564 ms  ┊ GC (min … max): 0.00% … 98.41%
 Time  (median):     44.625 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   49.842 μs ± 88.113 μs  ┊ GC (mean ± σ):  9.19% ±  5.42%

              ▃▆██▄▁                                           
  ▂▄▄▄▄▄▃▃▃▄▅███████▇▆▅▄▄▃▂▂▂▂▂▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  35.6 μs         Histogram: frequency by time        71.3 μs <

 Memory estimate: 153.30 KiB, allocs estimate: 33.

@JADekker I can definitely work with this, thanks! @yolhan_mannes since I also need this for non-dual types, I’d also like to give you credit as well. I’ve learned a lot from you both!

Actually, the linear algebra is not correct …

Yeah I probably fumbled with the indices, right? It should be straightforward to fix. I could try to take a look tomorrow if necessary? What helps me is to take all dimensions co-prime, so that julia tells me if I’m messing up the indices somewhere (as no unintended matrix multiplication will “magically work”)

Sorry, linear algebra was correct, just needed to do this and then it worked:

@inbounds @views for bb in 1:nb
    temp_slice = copy(θ1[bb, :, :])
    for i in 1:nk, k in 1:ny
        θ1[bb, i, k] = dot(temp_slice[i, :], DC[3][:, k])
    end
end
1 Like

The problem is the inner-most loop that I wrote, you can’t do that step but should do that one vector by vector. (Which effectively amounts to what you just wrote indeed, but it’d avoid copying)

1 Like

Perhaps you can work from this? Since your implementation is super fast, I tried splitting the Dual into its real part and partials. It’s 5x slower than @JADekker’s last implementation, but perhaps it’s just my inefficient coding:

function uncompress_inner(compressionIndexes, XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)
    θ1 = reshape(XC, nb, nk, ny)
    @inbounds IDC1 = IDC[1]
    @inbounds DC2 = DC[2]
    @inbounds DC3 = DC[3]
    @tullio temp[i,j,l] := IDC1[i,r]*θ1[r,k,ll]*DC2[k,j]*DC3[ll,l]
    return vec(temp)
end

function uncompress_split(compressionIndexes, XC::Vector{ForwardDiff.Dual{T,V,N}}, DC, IDC) where {T,V,N}
    # Extract the real part.
    XC_r = ForwardDiff.value.(XC)
    # For each dual, extract the first (and only) derivative component.
    XC_ds = Vector{Vector{V}}(undef, N)
    Threads.@threads for j in 1:N
        XC_ds[j] = [ForwardDiff.partials(x)[j] for x in XC]
    end

    # Process the real and dual parts using the real method.
    result_r = uncompress_inner(compressionIndexes, XC_r, DC, IDC)
    
    # Process the uncompression for each derivative direction.
    result_ds = Vector{Vector{V}}(undef, N)
    Threads.@threads for j in 1:N
        result_ds[j] = uncompress_inner(compressionIndexes, XC_ds[j], DC, IDC)
    end


    # Reassemble the dual numbers.
    result = Vector{ForwardDiff.Dual{T,V,N}}(undef, length(result_r))
    for i in eachindex(result)
        deriv_tuple = ntuple(j -> result_ds[j][i], N)
        result[i] = ForwardDiff.Dual{T,V,N}(result_r[i], ForwardDiff.Partials{N,V}(deriv_tuple))
    end

    return result
end

Can you try with @ tensor from TensorOperations.jl results are better for large enough matrix and it will be a lot less dependent on the machine, also seems working nice with dual types

using LinearAlgebra, Random, ForwardDiff, Test, BenchmarkTools,TensorOperations
function uncompress_further_optimized( XC::AbstractVector{T}, DC, IDC) where {T<:Real}
    nb = size(DC[1], 1)
    nk = size(DC[2], 1)
    ny = size(DC[3], 1)
    θ1 = reshape(XC, nb, nk, ny)
    @inbounds IDC1 = IDC[1]
    @inbounds DC2 = DC[2]
    @inbounds DC3 = DC[3]
    @tensor temp[i,j,l] := IDC1[i,r]*θ1[r,k,ll]*DC2[k,j]*DC3[ll,l]
    return vec(temp)
end

sorry for the compression_index removal you may wanted to do something special with that, you can add it back if you want.
Trying split function on both it has around the same perf but its unneeded here.
Also don’t forget to add the best backend for your system (MKL for intel cpu and AppleAccelerate for Apple)

1 Like