Note that you can fuse two of the loops there, and hoist some bounds checks to get another big speed-up:
"""
    add_and_mul3(X1, A, r, s, d)

Return a new `Matrix{Float64}` `X2` of the same size as `X1` where, for every row `i`,

- `X2[i, j] = X1[i, j] * s` for each `j in 1:d`, and
- `X2[i, d+1] = X1[i, d+1] * r + Σⱼ A[i, j] * (X1[i, j] * s - X1[i, j] * r)`,
  with the sum running over `j in 1:d`.

Both results are produced in a single fused pass over columns `1:d`. `X1` and `A`
are only read, never modified.

# Arguments
- `X1`: input matrix; rows `1:size(X1, 1)` and columns `1:d+1` are read.
- `A`: weight matrix; entries `A[i, j]` with `i in 1:size(X1, 1)`, `j in 1:d` are read.
- `r`, `s`: scalar factors.
- `d`: number of leading columns scaled by `s`; column `d+1` is the accumulator.

# Throws
- `BoundsError` if `X1` or `A` does not contain the indices listed above. The checks
  are `@boundscheck` blocks, so a caller's `@inbounds` may elide them.

!!! note
    Columns of `X2` beyond `d+1` (present only when `size(X1, 2) > d+1`) are never
    written and therefore hold uninitialized memory.
"""
function add_and_mul3(X1, A, r, s, d)
    X2 = Matrix{Float64}(undef, size(X1))
    rows = axes(X2, 1)
    acc = d + 1  # index of the accumulator column

    # Hoisted bounds checks: validate *every* index the `@inbounds` loop touches.
    # `X1` is checked against `rows` (not `:`) because the loop indexes it with the
    # row range of `X2`; that is only safe if `X1` really contains those rows.
    @boundscheck checkbounds(A, rows, 1:d)
    @boundscheck checkbounds(X1, rows, 1:acc)

    # Seed the accumulator column before the fused loop adds to it.
    @views X2[:, acc] .= X1[:, acc] .* r

    for j in 1:d
        # Inner loop runs down a column (column-major order); iterations over `i`
        # are independent, so `@simd` is valid.
        @inbounds @simd for i in rows
            X1ij = X1[i, j]
            X2ij = X1ij * s
            X2[i, j] = X2ij
            X2[i, acc] += A[i, j] * (X2ij - X1ij * r)
        end
    end
    return X2
end
"""
    RunBench(; N=10_000, d=5)

Compare `add_and_mul`, `add_and_mul2` and `add_and_mul3` on random inputs.

Draws random scalars `r` and `s`, a random `N × (d+1)` matrix `X1` and a random
`N × d` matrix `A`, logs via `@info` whether the three implementations agree under
`≈`, and then displays a `@benchmark` result for each implementation in turn,
sleeping one second between consecutive benchmarks.
"""
function RunBench(; N=10_000, d=5)
    # Same draw order as always: r, s, X1, A.
    r, s = rand(), rand()
    X1, A = rand(N, d + 1), rand(N, d)

    # The expression is kept as-is: its text is what shows up as the log key.
    @info "" add_and_mul(X1, A, r, s, d) ≈ add_and_mul2(X1, A, r, s, d) ≈ add_and_mul3(X1, A, r, s, d)

    for (k, f) in enumerate((add_and_mul, add_and_mul2, add_and_mul3))
        k == 1 || sleep(1)  # pause between benchmarks, not before the first one
        display(@benchmark $f($X1, $A, $r, $s, $d))
    end
end
gives me
julia> RunBench()
┌ Info:
└ add_and_mul(X1, A, r, s, d) ≈ add_and_mul2(X1, A, r, s, d) ≈ add_and_mul3(X1, A, r, s, d) = true
┌ Info:
│ add_and_mul =
│ BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
│ Range (min … max): 72.898 μs … 1.970 ms ┊ GC (min … max): 0.00% … 87.09%
│ Time (median): 113.355 μs ┊ GC (median): 0.00%
│ Time (mean ± σ): 199.154 μs ± 214.478 μs ┊ GC (mean ± σ): 16.48% ± 16.41%
│
│ ▆█▇▆▅▅▅▄▃▂▁▁ ▁▁ ▁ ▂
│ ██████████████▇█▇▇▇▆▅▅▄▅▆▇▇███▆▆▆▇███▇████████▇▇▆▇▇▇▇▆▅▆▆▆▇▆▆ █
│ 72.9 μs Histogram: log(frequency) by time 1.02 ms <
│
└ Memory estimate: 2.21 MiB, allocs estimate: 24.
┌ Info:
│ add_and_mul2 =
│ BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
│ Range (min … max): 41.067 μs … 1.736 ms ┊ GC (min … max): 0.00% … 94.50%
│ Time (median): 49.454 μs ┊ GC (median): 0.00%
│ Time (mean ± σ): 75.996 μs ± 82.370 μs ┊ GC (mean ± σ): 10.38% ± 10.69%
│
│ █▇▅▅▃ ▂▃▃▃▁ ▂
│ █████▇▇▆▅▆▅▄▄▇▇▇▆▆█████▇▆▅▄▅▅▅▅▄▁▄▁▅▄▅▃▅▅▅▅▅▄▅▆▃▅▄▆▅▅▄▃▅▅▅▅ █
│ 41.1 μs Histogram: log(frequency) by time 483 μs <
│
└ Memory estimate: 468.83 KiB, allocs estimate: 3.
┌ Info:
│ add_and_mul3 =
│ BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
│ Range (min … max): 14.006 μs … 1.580 ms ┊ GC (min … max): 0.00% … 97.28%
│ Time (median): 23.685 μs ┊ GC (median): 0.00%
│ Time (mean ± σ): 46.578 μs ± 76.171 μs ┊ GC (mean ± σ): 14.22% ± 10.53%
│
│ █▇▆▆▂ ▃▃▃▃▂ ▂
│ █████▆▄▃▄▁▃▄▅▄▁▃▇█▇▅▄▇██████▆▆▆▆▆▆▆▅▆▄▃▄▄▄▄▃▃▆▇▆▅▅▅▅▄▆▅▅▅▅▆ █
│ 14 μs Histogram: log(frequency) by time 380 μs <
│
└ Memory estimate: 468.83 KiB, allocs estimate: 3.