You can use LoopVectorization.jl
to get somewhat close in performance (at least on my laptop):
using LoopVectorization
# Compute the quadratic form b' * A * b via an explicit double loop.
# `@tturbo` (LoopVectorization.jl) vectorizes AND threads the loop nest,
# so the accumulation order differs from `b' * A * b`; results agree only
# up to floating-point rounding (hence `isapprox` in the checks below).
# NOTE(review): assumes size(A) == (length(b), length(b)) — no size check.
function bAb_loopvectorization(A,b)
# zero(eltype(b)) keeps the accumulator type-stable for any element type.
total = zero(eltype(b))
@tturbo for i in eachindex(b), j in eachindex(b)
total += b[i]*A[i,j]*b[j]
end
return total
end
Full code snippet:
# Set up a throwaway environment so the benchmark is reproducible.
using Pkg  # Pkg must be loaded before Pkg.activate / Pkg.add can be called
Pkg.activate(; temp=true)
Pkg.add(["BenchmarkTools", "LinearAlgebra", "LoopVectorization"])
using BenchmarkTools
using LinearAlgebra
using LoopVectorization
# Small example inputs for interactive use; bench(N) below
# allocates its own A and b and does not use these.
A = randn(10, 10)
b = randn(10)
"""
    bAb(A, b)

Naive quadratic form `b' * A * b`; allocates the intermediate
matrix-vector product.
"""
function bAb(A, b)
    return b' * A * b
end
"""
    bAb_noalloc(A, b)

Compute `b' * A * b` column-by-column without materializing `A * b`:
accumulates `(b' * A[:, j]) * b[j]` over the columns of `A`.
"""
function bAb_noalloc(A, b)
    acc = zero(eltype(b))
    @inbounds for j in eachindex(b)
        # `b' * view(A, :, j)` dispatches to `dot`, so this is the same computation.
        acc += dot(b, view(A, :, j)) * b[j]
    end
    return acc
end
# Compute the quadratic form b' * A * b via an explicit double loop.
# `@tturbo` (LoopVectorization.jl) vectorizes AND threads the loop nest,
# so the accumulation order differs from `b' * A * b`; results agree only
# up to floating-point rounding (hence `isapprox` in bench's checks).
# NOTE(review): assumes size(A) == (length(b), length(b)) — no size check.
function bAb_loopvectorization(A,b)
# zero(eltype(b)) keeps the accumulator type-stable for any element type.
total = zero(eltype(b))
@tturbo for i in eachindex(b), j in eachindex(b)
total += b[i]*A[i,j]*b[j]
end
return total
end
"""
    bAb!(w, A, b)

Compute `b' * A * b` using the preallocated buffer `w` for the
matrix-vector product: first `w .= A' * b` in place, then `b' * w`.
"""
function bAb!(w, A, b)
    mul!(w, A', b)  # mul! returns w, filled with A' * b
    return b' * w
end
"""
    bench(N)

Benchmark every `b' * A * b` implementation on random `N×N` input and
print `@btime` timings. Each variant is first sanity-checked against
`dot(b, A, b)` (with `isapprox`, since the summation orders differ).
"""
function bench(N)
    A = randn(N, N)
    b = randn(N)
    w = similar(b)
    @assert isapprox(bAb(A, b), dot(b, A, b))
    # The original script benchmarked bAb_noalloc without checking it; added here.
    @assert isapprox(bAb_noalloc(A, b), dot(b, A, b))
    @assert isapprox(bAb!(w, A, b), dot(b, A, b))
    @assert isapprox(bAb_loopvectorization(A, b), dot(b, A, b))
    println("Benchmark: $N")
    println("naive")
    @btime bAb($A, $b)
    println("explicit loop")  # fixed typo: was "explicite loop"
    @btime bAb_noalloc($A, $b)
    println("loopvectorization")
    @btime bAb_loopvectorization($A, $b)
    println("dot")
    @btime dot($b, $A, $b)
    println("bAb!")
    @btime bAb!($w, $A, $b)
    println()
end
# Run the benchmark across four orders of magnitude in matrix size.
foreach(bench, (10, 100, 1_000, 10_000))
This gives me the following results:
| matrix size | bAb | bAb_noalloc | dot | bAb_loopvectorization | bAb! |
|---|---|---|---|---|---|
| 1e1 | 108.411 ns | 125.656 ns | 54.433 ns | 19.335 ns | 68.340 ns |
| 1e2 | 1.320 μs | 1.928 μs | 832.520 ns | 745.377 ns | 1.187 μs |
| 1e3 | 17.042 μs | 362.067 μs | 342.302 μs | 69.913 μs | 16.273 μs |
| 1e4 | 20.249 ms | 35.923 ms | 36.475 ms | 19.696 ms | 20.214 ms |
Run with 8 Julia threads and 8 BLAS threads (that is why I used @tturbo, which generates threaded code, as opposed to @turbo).
LV wins for small matrices by quite a bit, but loses out in an intermediate regime and then wins again.
Edit: Also added a version where the temporary is preallocated, but this is of course more annoying to call and only offers gains for small(ish) matrices.
bAb!(w,A,b) = b'*mul!(w,A',b) # note: A' benchmarked faster than A here; for the real inputs used, the scalar b'A'b equals b'Ab