Reduce allocations in row-by-row dot product

Note that you can fuse two of the loops and hoist the bounds checks out of the inner loop to get another big speed-up:

"""
    add_and_mul3(X1, A, r, s, d)

Return a freshly allocated `Matrix{Float64}` with the same size as `X1`, filled in a
single fused pass:

- columns `1:d` hold `X1[:, 1:d] * s`;
- column `d+1` holds `X1[i, d+1] * r` plus, for every row `i`, the sum over `j in 1:d`
  of `A[i, j] * (X1[i, j] * s - X1[i, j] * r)`.

`X1` and `A` are only read. Columns of the result beyond `d+1` (if `X1` has any) are
never written and therefore contain uninitialised memory.

# Throws
- `BoundsError` if `A` does not cover every row of the result and columns `1:d`, or if
  `X1` does not have columns `1:d+1` (unless the checks are elided by an `@inbounds`
  caller).
"""
function add_and_mul3(X1, A, r, s, d)
    X2 = Matrix{Float64}(undef, size(X1))
    rows = axes(X2, 1)
    acc = d + 1  # index of the column that accumulates the row-wise dot product

    # Validate every index range once, up front, so the hot loop can run unchecked.
    @boundscheck checkbounds(A, rows, 1:d)
    @boundscheck checkbounds(X1, :, 1:acc)

    # Seed the accumulator column; the loop below only adds to it.
    view(X2, :, acc) .= view(X1, :, acc) .* r

    for col in 1:d
        # Rows are independent of each other, so the inner loop is safe to vectorise.
        @inbounds @simd for row in rows
            x = X1[row, col]
            scaled = x * s
            X2[row, col] = scaled
            X2[row, acc] += A[row, col] * (scaled - x * r)
        end
    end
    return X2
end

"""
    RunBench(; N=10_000, d=5)

Draw random scalars `r`, `s` and random inputs `X1` (`N × (d+1)`) and `A` (`N × d`), log
whether `add_and_mul`, `add_and_mul2` and `add_and_mul3` agree (`≈`) on them, then
benchmark and display each of the three implementations in that order, pausing one
second between consecutive benchmarks.
"""
function RunBench(; N=10_000, d=5)
    # Draw order (r, s, X1, A) is kept fixed so a seeded RNG yields the same inputs.
    r, s = rand(), rand()
    X1 = rand(N, d + 1)
    A = rand(N, d)

    # The expression text below becomes the key of the log record, so it is kept verbatim.
    @info "" add_and_mul(X1, A, r, s, d) ≈ add_and_mul2(X1, A, r, s, d) ≈ add_and_mul3(X1, A, r, s, d)

    shown = nothing
    for (k, kernel) in enumerate((add_and_mul, add_and_mul2, add_and_mul3))
        # Pause between runs only, never before the first or after the last.
        k > 1 && sleep(1)
        shown = display(@benchmark $kernel($X1, $A, $r, $s, $d))
    end
    return shown
end

gives me

julia> RunBench()
┌ Info: 
└   add_and_mul(X1, A, r, s, d) ≈ add_and_mul2(X1, A, r, s, d) ≈ add_and_mul3(X1, A, r, s, d) = true
┌ Info: 
│   add_and_mul =
│    BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
│     Range (min … max):   72.898 μs …   1.970 ms  ┊ GC (min … max):  0.00% … 87.09%
│     Time  (median):     113.355 μs               ┊ GC (median):     0.00%
│     Time  (mean ± σ):   199.154 μs ± 214.478 μs  ┊ GC (mean ± σ):  16.48% ± 16.41%
│    
│      ▆█▇▆▅▅▅▄▃▂▁▁                             ▁▁ ▁                 ▂
│      ██████████████▇█▇▇▇▆▅▅▄▅▆▇▇███▆▆▆▇███▇████████▇▇▆▇▇▇▇▆▅▆▆▆▇▆▆ █
│      72.9 μs       Histogram: log(frequency) by time       1.02 ms <
│    
└     Memory estimate: 2.21 MiB, allocs estimate: 24.
┌ Info: 
│   add_and_mul2 =
│    BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
│     Range (min … max):  41.067 μs …  1.736 ms  ┊ GC (min … max):  0.00% … 94.50%
│     Time  (median):     49.454 μs              ┊ GC (median):     0.00%
│     Time  (mean ± σ):   75.996 μs ± 82.370 μs  ┊ GC (mean ± σ):  10.38% ± 10.69%
│    
│      █▇▅▅▃             ▂▃▃▃▁                                     ▂
│      █████▇▇▆▅▆▅▄▄▇▇▇▆▆█████▇▆▅▄▅▅▅▅▄▁▄▁▅▄▅▃▅▅▅▅▅▄▅▆▃▅▄▆▅▅▄▃▅▅▅▅ █
│      41.1 μs      Histogram: log(frequency) by time       483 μs <
│    
└     Memory estimate: 468.83 KiB, allocs estimate: 3.
┌ Info: 
│   add_and_mul3 =
│    BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
│     Range (min … max):  14.006 μs …  1.580 ms  ┊ GC (min … max):  0.00% … 97.28%
│     Time  (median):     23.685 μs              ┊ GC (median):     0.00%
│     Time  (mean ± σ):   46.578 μs ± 76.171 μs  ┊ GC (mean ± σ):  14.22% ± 10.53%
│    
│      █▇▆▆▂                 ▃▃▃▃▂                                 ▂
│      █████▆▄▃▄▁▃▄▅▄▁▃▇█▇▅▄▇██████▆▆▆▆▆▆▆▅▆▄▃▄▄▄▄▃▃▆▇▆▅▅▅▅▄▆▅▅▅▅▆ █
│      14 μs        Histogram: log(frequency) by time       380 μs <
│    
└     Memory estimate: 468.83 KiB, allocs estimate: 3.
4 Likes