Efficient use of @turbo for linear algebra operations (LoopVectorization.jl)

Removing the branch, I get

julia> using LoopVectorization, LinearAlgebra

julia> function fastmath_dot(x::AbstractVector{Float64}, A::AbstractMatrix{Float64}, y::AbstractVector{Float64})
         s = 0.0;
         @fastmath for j in eachindex(y)
           yj = y[j];
           if !iszero(yj)
             temp = 0.0;
             for i in eachindex(x)
               temp += x[i]*A[i,j];
             end
             s += temp*yj;
           end
         end
         return s;
       end
fastmath_dot (generic function with 1 method)

julia> function turbo_dot(x::AbstractVector{Float64}, A::AbstractMatrix{Float64}, y::AbstractVector{Float64})
         s = 0.0;
         @turbo for j in eachindex(y), i in eachindex(x)
           s += x[i]*A[i,j]*y[j];
         end
         return s;
       end
turbo_dot (generic function with 1 method)

julia> x = rand(100);

julia> y = rand(100);

julia> A = rand(100,100);

julia> y[rand(axes(y,1), length(y) ÷ 2)] .= 0;

julia> @btime turbo_dot($x, $A, $y)
  639.714 ns (0 allocations: 0 bytes)
745.9084558196317

julia> @btime fastmath_dot($x, $A, $y)
  4.253 μs (0 allocations: 0 bytes)
745.9084558196316

julia> @btime dot($x, $A, $y)
  434.628 ns (0 allocations: 0 bytes)
745.9084558196316

julia> @btime turbo_dot($y, $A, $x)
  635.095 ns (0 allocations: 0 bytes)
743.8372158481377

julia> @btime fastmath_dot($y, $A, $x)
  6.860 μs (0 allocations: 0 bytes)
743.8372158481382

julia> @btime dot($y, $A, $x)
  781.343 ns (0 allocations: 0 bytes)
743.837215848138

julia> versioninfo()
Julia Version 1.8.0-DEV.383
Commit eb83c4d25b* (2021-08-20 20:03 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, tigerlake)

So which is faster depends on just how many zeros.
You can also try this, but it was slower for me at this size:

julia> function turbo_dot2(x::AbstractVector{Float64}, A::AbstractMatrix{Float64}, y::AbstractVector{Float64})
         s = 0.0;
         @turbo for j in eachindex(y)
           temp = 0.0;
           for i in eachindex(x)
             temp += x[i]*A[i,j];
           end
           s += temp*y[j];
         end
         return s;
       end
turbo_dot2 (generic function with 1 method)

julia> @btime turbo_dot2($y, $A, $x)
  663.428 ns (3 allocations: 64 bytes)
743.8372158481376

julia> @btime turbo_dot2($x, $A, $y)
  665.138 ns (3 allocations: 64 bytes)
745.9084558196316
1 Like