I’m trying to optimize a non-allocating matrix update. Here is the simple code.
function update_W_original!(
W::AbstractMatrix,
l::Int,
K::Int,
col_cache::AbstractVector,
row_cache::AbstractVector,
)
inv_WKL = inv(W[K, l])
copyto!(row_cache, view(W, K, :))
row_cache[l] -= 1.0
copyto!(col_cache, view(W, :, l))
rows, cols = axes(W)
@inbounds for j in cols
scalar_Krow_j = row_cache[j]
if scalar_Krow_j != 0
@simd for i in rows
factor_i = col_cache[i] * inv_WKL
W[i, j] -= factor_i * scalar_Krow_j
end
end
end
return nothing
end
This is some code advised by Gemini, and reached best speed on my machine so far. But I’m confused with the if, since it’s expensive in loops, and my row_cache is unlikely to be sparse.
I want to remove the if and see whether it helps with performance.
# Modified update_W! function without the if condition
function update_W_no_if!(
W::AbstractMatrix,
l::Int,
K::Int,
col_cache::AbstractVector,
row_cache::AbstractVector,
)
inv_WKL = inv(W[K, l])
copyto!(row_cache, view(W, K, :))
row_cache[l] -= 1.0
copyto!(col_cache, view(W, :, l))
rows, cols = axes(W)
@inbounds for j in cols
scalar_Krow_j = row_cache[j]
@simd for i in rows
factor_i = col_cache[i] * inv_WKL
W[i, j] -= factor_i * scalar_Krow_j
end
end
return nothing
end
# Setup for benchmarking
function setup_benchmark(matrix_size::Int)
W = rand(ComplexF64, matrix_size, matrix_size)
l = rand(1:matrix_size)
K = rand(1:matrix_size)
col_cache = rand(ComplexF64, matrix_size)
row_cache = rand(ComplexF64, matrix_size)
return W, l, K, col_cache, row_cache
end
matrix_size = 1000 # Example size, adjust as needed
W, l, K, col_cache, row_cache = setup_benchmark(matrix_size)
println("\nOriginal update_W!:")
@btime update_W_original!($W, $l, $K, $col_cache, $row_cache) setup = (W_copy = deepcopy(W))
println("\nupdate_W! without if condition:")
@btime update_W_no_if!($W, $l, $K, $col_cache, $row_cache) setup = (W_copy = deepcopy(W))
To my suprise, I saw a HUGE performance variation:
Original update_W!:
3.716 μs (2 allocations: 32 bytes)
update_W! without if condition:
1.591 ms (2 allocations: 32 bytes)
I tried to investigate and I infiltrate into the loop, where I saw the random row_cache unexpectedly turns into nearly zero!
infil> row_cache
1000-element Vector{ComplexF64}:
2.7755575615628914e-17 + 0.0im
5.551115123125783e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 1.3877787807814457e-17im
0.0 - 5.551115123125783e-17im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 5.551115123125783e-17im
2.7755575615628914e-17 + 0.0im
0.0 - 2.7755575615628914e-17im
2.0816681711721685e-17 + 0.0im
0.0 - 5.551115123125783e-17im
0.0 - 5.551115123125783e-17im
0.0 - 5.377642775528102e-17im
2.7755575615628914e-17 + 0.0im
0.0 + 0.0im
0.0 - 2.7755575615628914e-17im
2.7755575615628914e-17 + 0.0im
0.0 - 5.551115123125783e-17im
⋮
5.551115123125783e-17 + 0.0im
5.551115123125783e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
5.551115123125783e-17 + 0.0im
2.7755575615628914e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 2.42861286636753e-17im
2.7755575615628914e-17 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 + 0.0im
0.0 - 4.7596475372113645e-17im
0.0 + 0.0im
5.551115123125783e-17 + 0.0im
0.0 + 0.0im
0.0 - 5.551115123125783e-17im
That’s quite strange, I couldn’t figure out why this row_cache changes. And I feel really unsafe on this optimization, though some naive test says it reproduces the same data…
Can anyone help on this? I’m really confused…