I’m trying to optimize a non-allocating matrix update. Here is the simple code.
function update_W_original!(
W::AbstractMatrix,
l::Int,
K::Int,
col_cache::AbstractVector,
row_cache::AbstractVector,
)
inv_WKL = inv(W[K, l])
copyto!(row_cache, view(W, K, :))
row_cache[l] -= 1.0
copyto!(col_cache, view(W, :, l))
rows, cols = axes(W)
@inbounds for j in cols
scalar_Krow_j = row_cache[j]
if scalar_Krow_j != 0
@simd for i in rows
factor_i = col_cache[i] * inv_WKL
W[i, j] -= factor_i * scalar_Krow_j
end
end
end
return nothing
end
This is some code advised by Gemini, and reached best speed on my machine so far. But I’m confused with the if
, since it’s expensive in loops, and my row_cache
is unlikely to be sparse.
I want to remove the if
and see whether it helps with performance.
# Modified update_W! function without the if condition
function update_W_no_if!(
W::AbstractMatrix,
l::Int,
K::Int,
col_cache::AbstractVector,
row_cache::AbstractVector,
)
inv_WKL = inv(W[K, l])
copyto!(row_cache, view(W, K, :))
row_cache[l] -= 1.0
copyto!(col_cache, view(W, :, l))
rows, cols = axes(W)
@inbounds for j in cols
scalar_Krow_j = row_cache[j]
@simd for i in rows
factor_i = col_cache[i] * inv_WKL
W[i, j] -= factor_i * scalar_Krow_j
end
end
return nothing
end
# Setup for benchmarking
function setup_benchmark(matrix_size::Int)
W = rand(ComplexF64, matrix_size, matrix_size)
l = rand(1:matrix_size)
K = rand(1:matrix_size)
col_cache = rand(ComplexF64, matrix_size)
row_cache = rand(ComplexF64, matrix_size)
return W, l, K, col_cache, row_cache
end
matrix_size = 1000 # Example size, adjust as needed
W, l, K, col_cache, row_cache = setup_benchmark(matrix_size)
println("\nOriginal update_W!:")
@btime update_W_original!($W, $l, $K, $col_cache, $row_cache) setup = (W_copy = deepcopy(W))
println("\nupdate_W! without if condition:")
@btime update_W_no_if!($W, $l, $K, $col_cache, $row_cache) setup = (W_copy = deepcopy(W))
To my suprise, I saw a HUGE performance variation:
Original update_W!:
3.716 μs (2 allocations: 32 bytes)
update_W! without if condition:
1.591 ms (2 allocations: 32 bytes)
I tried to investigate and I infiltrate into the loop, where I saw the random row_cache
unexpectedly turns into nearly zero!
infil> row_cache
1000-element Vector{ComplexF64}:
2.7755575615628914e-17 + 0.0im
5.551115123125783e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 1.3877787807814457e-17im
0.0 - 5.551115123125783e-17im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 5.551115123125783e-17im
2.7755575615628914e-17 + 0.0im
0.0 - 2.7755575615628914e-17im
2.0816681711721685e-17 + 0.0im
0.0 - 5.551115123125783e-17im
0.0 - 5.551115123125783e-17im
0.0 - 5.377642775528102e-17im
2.7755575615628914e-17 + 0.0im
0.0 + 0.0im
0.0 - 2.7755575615628914e-17im
2.7755575615628914e-17 + 0.0im
0.0 - 5.551115123125783e-17im
⋮
5.551115123125783e-17 + 0.0im
5.551115123125783e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
5.551115123125783e-17 + 0.0im
2.7755575615628914e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 2.42861286636753e-17im
2.7755575615628914e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 4.7596475372113645e-17im
0.0 + 0.0im
5.551115123125783e-17 + 0.0im
0.0 + 0.0im
0.0 - 5.551115123125783e-17im
That’s quite strange, I couldn’t figure out why this row_cache
changes. And I feel really unsafe on this optimization, though some naive test says it reproduces the same data…
Can anyone help on this? I’m really confused…