OpenBLAS is faster than Intel MKL on AMD Hardware (Ryzen)

Cholesky
Moving on to LAPACK, let's create positive definite matrices.

"""
    gen_pd(n)

Draw a `2n × n` matrix `X` of standard-normal entries and return `X' * X`
wrapped in `Symmetric`, i.e. an `n × n` symmetric matrix.
"""
function gen_pd(n)
    X = randn(2n, n)
    return Symmetric(X' * X)
end
# One `gen_pd` matrix per entry of `N` (`N` is defined outside this excerpt —
# presumably the list of matrix sizes; confirm).
D = gen_pd.(N);
"""
    bench_f(A, f, N, info = "")

For every position `i` of `N`, print a `Size: ...` header (the entry of `N`
followed by `info`), then benchmark `f` on a copy of `A[i]` and show the result.
"""
function bench_f(A, f, N, info = "")
    for (i, n) in enumerate(N)
        header = "Size: $n"
        println(header * info)
        Aᵢ = A[i]
        # `f` may mutate its argument, so each sample gets a fresh copy via `setup`;
        # `evals=1` keeps it to one call per sample so that copy is never reused.
        @show @benchmark $f(B) setup=( B = copy($Aᵢ) ) evals=1
    end
end
# Benchmark in-place Cholesky (`cholfact!`) on a copy of each matrix in `D`;
# the last argument is the label printed after the size.
bench_f(D, cholfact!, N, ", OpenBLAS:")

Also, because StaticArrays performs poorly here, I wanted to add a different Julia function for comparison:

"""
    julia_chol!(U::AbstractArray{<:Real,2})

Compute an upper Cholesky factor in place and return `U`.

The diagonal and upper triangle of `U` are taken as the data of a symmetric
positive definite matrix and are overwritten with the factor `R` satisfying
`R'R ≈` that matrix. The strictly lower triangle is neither read nor written.

Throws a `DimensionMismatch` if `U` is not square. If a diagonal pivot becomes
negative (the input is not positive definite), `√` raises a `DomainError`.
"""
function julia_chol!(U::AbstractArray{<:Real,2})
    p = size(U, 1)
    # The loops below run under `@inbounds`, so the shape must be validated first;
    # otherwise a tall matrix would be indexed past its last column unchecked.
    size(U, 2) == p ||
        throw(DimensionMismatch("matrix is not square: dimensions are $(size(U))"))
    @inbounds for i ∈ 1:p
        Uᵢᵢ = U[i,i]
        for j ∈ 1:i-1
            # Entry (j, i) of the factor: subtract the contributions of the rows
            # above, then scale by the already-finished pivot of column j.
            Uⱼᵢ = U[j,i]
            for k ∈ 1:j-1
                Uⱼᵢ -= U[k,i] * U[k,j]
            end
            Uⱼᵢ /= U[j,j]
            U[j,i] = Uⱼᵢ
            # Accumulate the squared column entries needed for the pivot of column i.
            Uᵢᵢ -= abs2(Uⱼᵢ)
        end
        U[i,i] = √Uᵢᵢ
    end
    return U
end

This is a naive algorithm and scales very poorly, but it is still faster for 8x8 matrices and in the same ballpark for 64x64.

8x8, Cholfact

Size: 8, MKL:
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     210.000 ns (0.00% GC)
  median time:      240.000 ns (0.00% GC)
  mean time:        280.039 ns (0.00% GC)
  maximum time:     8.055 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 8, OpenBLAS: 
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     561.000 ns (0.00% GC)
  median time:      571.000 ns (0.00% GC)
  mean time:        605.324 ns (0.00% GC)
  maximum time:     43.101 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 8, julia_chol!
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     170.000 ns (0.00% GC)
  median time:      171.000 ns (0.00% GC)
  mean time:        176.252 ns (0.00% GC)
  maximum time:     4.769 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
StaticArrays
  memory estimate:  14.58 KiB
  allocs estimate:  112
  --------------
  minimum time:     64.371 μs (0.00% GC)
  median time:      66.635 μs (0.00% GC)
  mean time:        72.561 μs (6.66% GC)
  maximum time:     34.846 ms (99.43% GC)
  --------------
  samples:          10000
  evals/sample:     1

64x64, Cholfact

Size: 64, MKL: 
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     15.560 μs (0.00% GC)
  median time:      15.930 μs (0.00% GC)
  mean time:        16.068 μs (0.00% GC)
  maximum time:     54.824 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 64, OpenBLAS: 
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     19.497 μs (0.00% GC)
  median time:      24.216 μs (0.00% GC)
  mean time:        28.747 μs (0.00% GC)
  maximum time:     28.050 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 64, julia_chol!
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     28.744 μs (0.00% GC)
  median time:      29.185 μs (0.00% GC)
  mean time:        29.262 μs (0.00% GC)
  maximum time:     74.320 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

512x512, Cholfact

Size: 512, MKL: 
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     788.878 μs (0.00% GC)
  median time:      842.179 μs (0.00% GC)
  mean time:        858.057 μs (0.00% GC)
  maximum time:     4.506 ms (0.00% GC)
  --------------
  samples:          3553
  evals/sample:     1
Size: 512, OpenBLAS:
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     1.048 ms (0.00% GC)
  median time:      1.098 ms (0.00% GC)
  mean time:        1.114 ms (0.00% GC)
  maximum time:     1.702 ms (0.00% GC)
  --------------
  samples:          3097
  evals/sample:     1
Size: 512, julia_chol!
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     16.201 ms (0.00% GC)
  median time:      16.402 ms (0.00% GC)
  mean time:        16.456 ms (0.00% GC)
  maximum time:     17.533 ms (0.00% GC)
  --------------
  samples:          298
  evals/sample:     1

4096x4096, Cholfact

Size: 4096, MKL: 
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     174.107 ms (0.00% GC)
  median time:      176.916 ms (0.00% GC)
  mean time:        177.837 ms (0.00% GC)
  maximum time:     190.099 ms (0.00% GC)
  --------------
  samples:          24
  evals/sample:     1
Size: 4096, OpenBLAS: 
  memory estimate:  64 bytes
  allocs estimate:  2
  --------------
  minimum time:     99.876 ms (0.00% GC)
  median time:      101.212 ms (0.00% GC)
  mean time:        102.345 ms (0.00% GC)
  maximum time:     107.673 ms (0.00% GC)
  --------------
  samples:          39
  evals/sample:     1
Size: 4096, julia_chol!
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     12.152 s (0.00% GC)
  median time:      12.152 s (0.00% GC)
  mean time:        12.152 s (0.00% GC)
  maximum time:     12.152 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1

Eigendecomposition of Positive Definite Matrices

# Benchmark in-place eigendecomposition (`eigfact!`) on a copy of each matrix in `D`.
bench_f(D, eigfact!, N, ", OpenBLAS:")

8x8, eigfact of positive definite matrix

Size: 8, MKL:
  memory estimate:  4.38 KiB
  allocs estimate:  11
  --------------
  minimum time:     12.203 μs (0.00% GC)
  median time:      12.964 μs (0.00% GC)
  mean time:        13.451 μs (1.92% GC)
  maximum time:     1.319 ms (98.45% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 8, OpenBLAS: 
  memory estimate:  4.41 KiB
  allocs estimate:  11
  --------------
  minimum time:     15.339 μs (0.00% GC)
  median time:      17.763 μs (0.00% GC)
  mean time:        17.936 μs (1.45% GC)
  maximum time:     1.345 ms (97.59% GC)
  --------------
  samples:          10000
  evals/sample:     1
StaticArrays
  memory estimate:  5.59 KiB
  allocs estimate:  12
  --------------
  minimum time:     15.660 μs (0.00% GC)
  median time:      18.745 μs (0.00% GC)
  mean time:        23.590 μs (20.61% GC)
  maximum time:     36.955 ms (99.85% GC)
  --------------
  samples:          10000
  evals/sample:     1

64x64, eigfact of positive definite matrix

Size: 64, MKL: 
  memory estimate:  84.89 KiB
  allocs estimate:  13
  --------------
  minimum time:     312.257 μs (0.00% GC)
  median time:      321.615 μs (0.00% GC)
  mean time:        327.857 μs (1.37% GC)
  maximum time:     1.618 ms (74.54% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 64, OpenBLAS: 
  memory estimate:  85.36 KiB
  allocs estimate:  13
  --------------
  minimum time:     693.786 μs (0.00% GC)
  median time:      736.937 μs (0.00% GC)
  mean time:        740.982 μs (0.46% GC)
  maximum time:     1.668 ms (53.11% GC)
  --------------
  samples:          6677
  evals/sample:     1

512x512, eigfact of positive definite matrix

Size: 512, MKL: 
  memory estimate:  4.16 MiB
  allocs estimate:  13
  --------------
  minimum time:     22.577 ms (0.00% GC)
  median time:      23.149 ms (0.00% GC)
  mean time:        23.257 ms (0.71% GC)
  maximum time:     26.396 ms (3.49% GC)
  --------------
  samples:          212
  evals/sample:     1
Size: 512, OpenBLAS:
  memory estimate:  4.16 MiB
  allocs estimate:  13
  --------------
  minimum time:     44.988 ms (0.00% GC)
  median time:      45.718 ms (0.00% GC)
  mean time:        45.967 ms (0.27% GC)
  maximum time:     50.037 ms (1.41% GC)
  --------------
  samples:          108
  evals/sample:     1

4096x4096, eigfact of positive definite matrix

Size: 4096, MKL: 
  memory estimate:  257.47 MiB
  allocs estimate:  16
  --------------
  minimum time:     6.548 s (0.14% GC)
  median time:      6.548 s (0.14% GC)
  mean time:        6.548 s (0.14% GC)
  maximum time:     6.548 s (0.14% GC)
  --------------
  samples:          1
  evals/sample:     1
Size: 4096, OpenBLAS: 
  memory estimate:  257.28 MiB
  allocs estimate:  16
  --------------
  minimum time:     8.973 s (0.10% GC)
  median time:      8.973 s (0.10% GC)
  mean time:        8.973 s (0.10% GC)
  maximum time:     8.973 s (0.10% GC)
  --------------
  samples:          1
  evals/sample:     1

Here, MKL maintained its edge. I recall reading that some of the LAPACK functions from OpenBLAS are reference, rather than optimized, implementations.

Eigendecomposition of general square matrices

# Benchmark `eigfact!` on a copy of each element of `A` (`A` is defined outside this
# excerpt — presumably the general square matrices; confirm).
bench_f(A, eigfact!, N, ", OpenBLAS:")

8x8, eigfact of general square matrix

Size: 8, MKL:
  memory estimate:  3.64 KiB
  allocs estimate:  19
  --------------
  minimum time:     15.269 μs (0.00% GC)
  median time:      15.960 μs (0.00% GC)
  mean time:        16.593 μs (2.06% GC)
  maximum time:     1.744 ms (97.80% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 8, OpenBLAS: 
  memory estimate:  11.41 KiB
  allocs estimate:  17
  --------------
  minimum time:     15.499 μs (0.00% GC)
  median time:      16.311 μs (0.00% GC)
  mean time:        17.403 μs (3.45% GC)
  maximum time:     1.243 ms (98.43% GC)
  --------------
  samples:          10000
  evals/sample:     1

64x64, eigfact of general square matrix

Size: 64, MKL: 
  memory estimate:  117.66 KiB
  allocs estimate:  23
  --------------
  minimum time:     1.754 ms (0.00% GC)
  median time:      1.787 ms (0.00% GC)
  mean time:        1.796 ms (0.39% GC)
  maximum time:     2.998 ms (39.95% GC)
  --------------
  samples:          2774
  evals/sample:     1
Size: 64, OpenBLAS: 
  memory estimate:  166.25 KiB
  allocs estimate:  25
  --------------
  minimum time:     1.250 ms (0.00% GC)
  median time:      1.265 ms (0.00% GC)
  mean time:        1.281 ms (0.57% GC)
  maximum time:     2.253 ms (38.01% GC)
  --------------
  samples:          3887
  evals/sample:     1

512x512, eigfact of general square matrix

Size: 512, MKL: 
  memory estimate:  6.16 MiB
  allocs estimate:  37
  --------------
  minimum time:     288.296 ms (0.00% GC)
  median time:      292.630 ms (0.00% GC)
  mean time:        300.290 ms (0.10% GC)
  maximum time:     361.709 ms (0.00% GC)
  --------------
  samples:          17
  evals/sample:     1
Size: 512, OpenBLAS:
  memory estimate:  6.54 MiB
  allocs estimate:  31
  --------------
  minimum time:     286.010 ms (0.00% GC)
  median time:      288.243 ms (0.00% GC)
  mean time:        289.080 ms (0.10% GC)
  maximum time:     297.906 ms (0.00% GC)
  --------------
  samples:          18
  evals/sample:     1

4096x4096, eigfact of general square matrix

Size: 4096, MKL: 
  memory estimate:  385.25 MiB
  allocs estimate:  81
  --------------
  minimum time:     43.321 s (0.02% GC)
  median time:      43.321 s (0.02% GC)
  mean time:        43.321 s (0.02% GC)
  maximum time:     43.321 s (0.02% GC)
  --------------
  samples:          1
  evals/sample:     1
Size: 4096, OpenBLAS: 
  memory estimate:  388.28 MiB
  allocs estimate:  69
  --------------
  minimum time:     35.017 s (0.03% GC)
  median time:      35.017 s (0.03% GC)
  mean time:        35.017 s (0.03% GC)
  maximum time:     35.017 s (0.03% GC)
  --------------
  samples:          1
  evals/sample:     1

OpenBLAS ties or wins for general eigen-decomposition at all sizes.

QR Decomposition

# Benchmark in-place QR (`qrfact!`) on a copy of each element of `A` (`A` is defined
# outside this excerpt — presumably the general square matrices; confirm).
bench_f(A, qrfact!, N, ", OpenBLAS:")

8x8, qrfact

Size: 8, MKL:
  memory estimate:  1.28 KiB
  allocs estimate:  4
  --------------
  minimum time:     13.195 μs (0.00% GC)
  median time:      13.405 μs (0.00% GC)
  mean time:        13.669 μs (0.00% GC)
  maximum time:     60.815 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
Size: 8, OpenBLAS: 
  memory estimate:  1.28 KiB
  allocs estimate:  4
  --------------
  minimum time:     16.861 μs (0.00% GC)
  median time:      16.982 μs (0.00% GC)
  mean time:        17.279 μs (0.00% GC)
  maximum time:     56.927 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
StaticArrays
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     310.129 ns (0.00% GC)
  median time:      314.905 ns (0.00% GC)
  mean time:        315.720 ns (0.00% GC)
  maximum time:     503.892 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     241

64x64, qrfact

Size: 64, MKL: 
  memory estimate:  36.22 KiB
  allocs estimate:  6
  --------------
  minimum time:     663.038 μs (0.00% GC)
  median time:      697.192 μs (0.00% GC)
  mean time:        706.418 μs (0.48% GC)
  maximum time:     1.987 ms (62.28% GC)
  --------------
  samples:          6973
  evals/sample:     1
Size: 64, OpenBLAS: 
  memory estimate:  36.22 KiB
  allocs estimate:  6
  --------------
  minimum time:     452.482 μs (0.00% GC)
  median time:      494.570 μs (0.00% GC)
  mean time:        517.678 μs (0.48% GC)
  maximum time:     12.591 ms (0.00% GC)
  --------------
  samples:          9450
  evals/sample:     1

512x512, qrfact

Size: 512, MKL: 
  memory estimate:  288.22 KiB
  allocs estimate:  6
  --------------
  minimum time:     13.664 ms (0.00% GC)
  median time:      14.642 ms (0.00% GC)
  mean time:        14.648 ms (0.00% GC)
  maximum time:     19.407 ms (0.00% GC)
  --------------
  samples:          328
  evals/sample:     1
Size: 512, OpenBLAS:
  memory estimate:  288.22 KiB
  allocs estimate:  6
  --------------
  minimum time:     12.646 ms (0.00% GC)
  median time:      12.867 ms (0.00% GC)
  mean time:        13.025 ms (0.00% GC)
  maximum time:     18.334 ms (0.00% GC)
  --------------
  samples:          368
  evals/sample:     1

4096x4096, qrfact

Size: 4096, MKL: 
  memory estimate:  2.25 MiB
  allocs estimate:  6
  --------------
  minimum time:     2.596 s (0.00% GC)
  median time:      2.679 s (0.00% GC)
  mean time:        2.679 s (0.00% GC)
  maximum time:     2.763 s (0.00% GC)
  --------------
  samples:          2
  evals/sample:     1
Size: 4096, OpenBLAS: 
  memory estimate:  2.25 MiB
  allocs estimate:  6
  --------------
  minimum time:     2.061 s (0.00% GC)
  median time:      2.093 s (0.00% GC)
  mean time:        2.101 s (0.00% GC)
  maximum time:     2.150 s (0.00% GC)
  --------------
  samples:          3
  evals/sample:     1
10 Likes