Understanding why one function is faster than the other

Hi!

I wanted to try to learn a bit more about why some functions are faster than others. To do this I decided to use Distances.jl and calculate some Euclidean distances. The code I have used is:

using Distances
using BenchmarkTools

# Distance between two 3-element vectors.
x1 = [1.5,0,3]
y1 = [2.0,0,3]
@btime r1 = evaluate(Euclidean(), $x1, $y1)
# Result is "0.5" as expected.
# Faster than the naive sqrt(sum((x.-y).^2)), which allocates temporaries.

# Column-wise distance between two one-column matrices.
# Small test to understand if colwise works; the vectors have to be
# reshaped into 3x1 matrices (the data is still "1D", just a new shape).
X1 = reshape(x1,(length(x1)),1)
Y1 = reshape(y1,(length(y1)),1)
# NOTE(review): X1/Y1 are not $-interpolated here, so this timing also
# includes the cost of accessing non-const globals.
@btime r2 = colwise(Euclidean(), X1, Y1)
# Testing with a larger 3x100 matrix - insert X1 and Y1 as the first
# columns for an easy accuracy check.
X1L = rand(3,100)
X1L[:,1] = X1
Y1L = rand(3,100)
Y1L[:,1] = Y1
@btime r2l = colwise(Euclidean(), $X1L, $Y1L)
# Test if a manual loop is faster.
# It was not - it made many more allocations; using views fixed that slightly.
# Column-wise Euclidean distances between matching columns of X and Y.
#
# Returns a Vector `arr` with arr[i] = distance between X[:, i] and Y[:, i].
# `@views` makes the column slices non-allocating SubArrays, which is what
# keeps this loop down to a single allocation (the output vector).
function calc_euclidean(X, Y)
    # Validate up front: the @inbounds loop below skips bounds checks, so a
    # column-count mismatch must not be allowed to reach it.
    size(X, 2) == size(Y, 2) ||
        throw(DimensionMismatch("X and Y must have the same number of columns"))
    N = size(X, 2)
    # Use a promoted element type instead of hard-coding Float64, so the
    # function also works for Float32, mixed, etc. inputs.
    arr = zeros(promote_type(eltype(X), eltype(Y)), N)
    @inbounds for i = 1:N
        arr[i] = @views evaluate(Euclidean(), X[:, i], Y[:, i])
    end

    return arr
end
# Benchmark the manual loop for comparison against colwise above.
@btime r2l_calc_euclidean = calc_euclidean($X1L,$Y1L)

Where I use x1 and y1 as a sanity check etc. My main question is why:

@btime r2l = colwise(Euclidean(), $X1L, $Y1L);
  229.648 ns (1 allocation: 896 bytes)

While the loop I made:

@btime r2l_calc_euclidean = calc_euclidean($X1L,$Y1L);
  1.120 μs (1 allocation: 896 bytes)

So basically why is my function calc_euclidean slower?

EDIT: Please don’t mind the comments, just a way for me to take some notes - I know some of the math terminology is not good :slight_smile:

Kind regards

I really don’t understand why it is faster. I used @code_llvm and @Meta.lower:

colwise

@code_llvm @Meta.lower colwise(Euclidean(), X1L, Y1L)

;  @ meta.jl:132 within `@lower'
; Function Attrs: uwtable
define nonnull %jl_value_t* @"julia_@lower_1638"({ i64, %jl_value_t* }* nocapture nonnull readonly dereferenceable(16), %jl_value_t* nonnull align 8 dereferenceable(16), %jl_value_t* nonnull align 8 dereferenceable(16)) #0 {
top:
  %3 = alloca %jl_value_t*, i32 4
  %gcframe = alloca %jl_value_t*, i32 3, align 16
  %4 = bitcast %jl_value_t** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* align 16 %4, i8 0, i32 24, i1 false)
  %5 = call %jl_value_t*** inttoptr (i64 1720679008 to %jl_value_t*** ()*)() #6
;  @ meta.jl:133 within `@lower'
  %6 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 0
  %7 = bitcast %jl_value_t** %6 to i64*
  store i64 4, i64* %7
  %8 = getelementptr %jl_value_t**, %jl_value_t*** %5, i32 0
  %9 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %10 = bitcast %jl_value_t** %9 to %jl_value_t***
  %11 = load %jl_value_t**, %jl_value_t*** %8
  store %jl_value_t** %11, %jl_value_t*** %10
  %12 = bitcast %jl_value_t*** %8 to %jl_value_t***
  store %jl_value_t** %gcframe, %jl_value_t*** %12
  %13 = bitcast %jl_value_t*** %5 to i8*
  %14 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %13, i32 1400, i32 16) #2
  %15 = bitcast %jl_value_t* %14 to %jl_value_t**
  %16 = getelementptr %jl_value_t*, %jl_value_t** %15, i64 -1
  store %jl_value_t* inttoptr (i64 170668448 to %jl_value_t*), %jl_value_t** %16
  %17 = bitcast %jl_value_t* %14 to %jl_value_t**
  store %jl_value_t* %2, %jl_value_t** %17, align 8
  %18 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %14, %jl_value_t** %18
  %19 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 0
  store %jl_value_t* inttoptr (i64 145976776 to %jl_value_t*), %jl_value_t** %19
  %20 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 1
  store %jl_value_t* inttoptr (i64 145767352 to %jl_value_t*), %jl_value_t** %20
  %21 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 2
  store %jl_value_t* %1, %jl_value_t** %21
  %22 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 3
  store %jl_value_t* %14, %jl_value_t** %22
  %23 = call nonnull %jl_value_t* @jl_f__expr(%jl_value_t* null, %jl_value_t** %3, i32 4)
  %24 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %25 = load %jl_value_t*, %jl_value_t** %24
  %26 = getelementptr %jl_value_t**, %jl_value_t*** %5, i32 0
  %27 = bitcast %jl_value_t*** %26 to %jl_value_t**
  store %jl_value_t* %25, %jl_value_t** %27
  ret %jl_value_t* %23
}

calc_euclidean

@code_llvm @Meta.lower calc_euclidean(X1L,Y1L)

;  @ meta.jl:132 within `@lower'
; Function Attrs: uwtable
define nonnull %jl_value_t* @"julia_@lower_1639"({ i64, %jl_value_t* }* nocapture nonnull readonly dereferenceable(16), %jl_value_t* nonnull align 8 dereferenceable(16), %jl_value_t* nonnull align 8 dereferenceable(16)) #0 {
top:
  %3 = alloca %jl_value_t*, i32 4
  %gcframe = alloca %jl_value_t*, i32 3, align 16
  %4 = bitcast %jl_value_t** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* align 16 %4, i8 0, i32 24, i1 false)
  %5 = call %jl_value_t*** inttoptr (i64 1720679008 to %jl_value_t*** ()*)() #6
;  @ meta.jl:133 within `@lower'
  %6 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 0
  %7 = bitcast %jl_value_t** %6 to i64*
  store i64 4, i64* %7
  %8 = getelementptr %jl_value_t**, %jl_value_t*** %5, i32 0
  %9 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %10 = bitcast %jl_value_t** %9 to %jl_value_t***
  %11 = load %jl_value_t**, %jl_value_t*** %8
  store %jl_value_t** %11, %jl_value_t*** %10
  %12 = bitcast %jl_value_t*** %8 to %jl_value_t***
  store %jl_value_t** %gcframe, %jl_value_t*** %12
  %13 = bitcast %jl_value_t*** %5 to i8*
  %14 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %13, i32 1400, i32 16) #2
  %15 = bitcast %jl_value_t* %14 to %jl_value_t**
  %16 = getelementptr %jl_value_t*, %jl_value_t** %15, i64 -1
  store %jl_value_t* inttoptr (i64 170668448 to %jl_value_t*), %jl_value_t** %16
  %17 = bitcast %jl_value_t* %14 to %jl_value_t**
  store %jl_value_t* %2, %jl_value_t** %17, align 8
  %18 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %14, %jl_value_t** %18
  %19 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 0
  store %jl_value_t* inttoptr (i64 145976776 to %jl_value_t*), %jl_value_t** %19
  %20 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 1
  store %jl_value_t* inttoptr (i64 145767352 to %jl_value_t*), %jl_value_t** %20
  %21 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 2
  store %jl_value_t* %1, %jl_value_t** %21
  %22 = getelementptr %jl_value_t*, %jl_value_t** %3, i32 3
  store %jl_value_t* %14, %jl_value_t** %22
  %23 = call nonnull %jl_value_t* @jl_f__expr(%jl_value_t* null, %jl_value_t** %3, i32 4)
  %24 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %25 = load %jl_value_t*, %jl_value_t** %24
  %26 = getelementptr %jl_value_t**, %jl_value_t*** %5, i32 0
  %27 = bitcast %jl_value_t*** %26 to %jl_value_t**
  store %jl_value_t* %25, %jl_value_t** %27
  ret %jl_value_t* %23
}

I can’t understand the time difference if the output is so similar?

EDIT: Comparing line by line, I find them to be exactly the same. I used ctrl+F to check line by line.

Kind regards

Some more benchmarks:

@benchmark colwise($Euclidean(),$X1L,$Y1L)
BenchmarkTools.Trial: 
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     224.064 ns (0.00% GC)
  median time:      252.075 ns (0.00% GC)
  mean time:        319.950 ns (4.31% GC)
  maximum time:     5.854 μs (88.58% GC)
  --------------
  samples:          10000
  evals/sample:     482
 @benchmark calc_euclidean($X1L,$Y1L)
BenchmarkTools.Trial: 
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     1.140 μs (0.00% GC)
  median time:      1.360 μs (0.00% GC)
  mean time:        1.352 μs (0.35% GC)
  maximum time:     48.980 μs (96.10% GC)
  --------------
  samples:          10000
  evals/sample:     10

Check the @code_typed of each function; they are very different. One is invoking Distances.colwise!, and if I am not mistaken, that will call a (loop-vectorized) BLAS routine under the hood.

3 Likes

Thanks! Didn’t know about that macro. It is probably doing something like that internally. That explains why I could never get the exact same performance, without delving into more difficult parts.

Kind regards

1 Like

If you’d like a faster function:

using LoopVectorization

# Allocating front end for `distances_avx!`: builds an output vector whose
# element type is wide enough to hold results from both inputs (promoted to
# at least Float32), then delegates to the in-place kernel.
function distances_avx(X::AbstractMatrix{XT}, Y::AbstractMatrix{YT}) where {XT,YT}
    T = promote_type(XT, YT, Float32)
    out = Vector{T}(undef, size(X, 1))
    return distances_avx!(out, X, Y)
end

# In-place kernel: d[n] = Euclidean distance between row n of X and row n
# of Y.  Note the layout: points are stored in ROWS here (X and Y are
# N x dim), i.e. the transpose of what Distances.colwise expects.
function distances_avx!(d, X, Y)
    # @avx (LoopVectorization) unrolls and SIMD-vectorizes the loop nest;
    # the macro is sensitive to the exact loop form, so keep it simple.
    @avx for n ∈ axes(X,1)
        # Accumulate in the output's element type for type stability.
        dₙ = zero(eltype(d))
        for p ∈ axes(X,2)
            dₙ += (X[n,p] - Y[n,p])^2
        end
        d[n] = sqrt(dₙ)
    end
    d
end

This yields:

julia> distances_avx(X1L', Y1L') ≈ colwise(Euclidean(), X1L, Y1L)
true

julia> X1Lt = copy(X1L'); Y1Lt = copy(Y1L');

julia> @benchmark distances_avx($X1L', $Y1L')
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     234.650 ns (0.00% GC)
  median time:      244.064 ns (0.00% GC)
  mean time:        263.833 ns (5.22% GC)
  maximum time:     2.707 μs (86.20% GC)
  --------------
  samples:          10000
  evals/sample:     409

julia> @benchmark distances_avx($X1Lt, $Y1Lt)
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     99.239 ns (0.00% GC)
  median time:      105.655 ns (0.00% GC)
  mean time:        121.963 ns (10.68% GC)
  maximum time:     1.207 μs (83.88% GC)
  --------------
  samples:          10000
  evals/sample:     944

julia> @benchmark colwise(Euclidean(), $X1L, $Y1L)
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     310.889 ns (0.00% GC)
  median time:      356.742 ns (0.00% GC)
  mean time:        376.197 ns (3.68% GC)
  maximum time:     4.532 μs (87.48% GC)
  --------------
  samples:          10000
  evals/sample:     244

julia> @benchmark calc_euclidean($X1L,$Y1L)
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     796.737 ns (0.00% GC)
  median time:      803.463 ns (0.00% GC)
  mean time:        834.667 ns (1.82% GC)
  maximum time:     11.567 μs (88.25% GC)
  --------------
  samples:          10000
  evals/sample:     95

You can get a bit more performance from evaluating it inplace:

julia> d = similar(X1L, size(X1L,2));

julia> @benchmark distances_avx!($d, $X1L', $Y1L')
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     212.117 ns (0.00% GC)
  median time:      212.466 ns (0.00% GC)
  mean time:        212.701 ns (0.00% GC)
  maximum time:     258.545 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     506

julia> @benchmark distances_avx!($d, $X1Lt, $Y1Lt)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     91.683 ns (0.00% GC)
  median time:      91.764 ns (0.00% GC)
  mean time:        91.873 ns (0.00% GC)
  maximum time:     114.600 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     957

Also:

@code_llvm @Meta.lower colwise(Euclidean(), X1L, Y1L)

Just do @code_llvm. You’re not seeing the correct output because of the `@Meta.lower`.

6 Likes

Thank you very much for your code!

I had to spend some time looking at it, but I feel I understand it mostly now. I have one small request though. I want to use this on a particle simulation I am programming in which I have to do an operation, where I calculate the Euclidean distance from one particle to all its neighbouring particles. My total particle array is given as:

 pg
3883-element Array{SArray{Tuple{3},Float64,1,3},1}:
 [-0.9789985766900218, 0.0, -0.311561860399199]
 [-0.979596039492844, 0.0, -0.267781833918858]
 [-0.978974440663915, 0.0, -0.20583967405597176]
 [-0.9786126278489403, 0.0, -0.16411719504891698]
 [-0.9796960890206647, 0.0, -0.12325297602150954]
 [-0.9781927539671196, 0.0, -0.08150932543540486]
 [-0.9789525516873936, 0.0, -0.04326742871806638]
...

And one element, p_i, is then:

p_i = pg[1]
3-element SArray{Tuple{3},Float64,1,3} with indices SOneTo(3):
 -0.9789985766900218
  0.0
 -0.311561860399199

Then based on your initial code I have programmed as such:

# Allocating wrapper: Euclidean distances from one point `x` to every point
# in `X` (a vector of 3-element points, e.g. SVector{3,Float64}).
#
# The signature is kept generic on purpose: constraining the arguments to
# the concrete types SArray{Tuple{3},Float64,1,3} / Array{...,1} adds no
# speed (Julia specializes on the runtime types anyway) and only prevents
# reuse with other element types or array wrappers.
function distances_avx(x::AbstractVector, X::AbstractVector)
    # Output element type wide enough for both inputs, promoted to a float
    # so taking sqrt of an integer accumulator still fits.
    T = float(promote_type(eltype(x), eltype(eltype(X))))
    return distances_avx!(Vector{T}(undef, length(X)), x, X)
end

# In-place kernel: d[n] = Euclidean distance from point `x` to point X[n],
# where X is a vector of 3-element points (e.g. SVector{3,Float64}).
function distances_avx!(d, x, X)
    # LoopVectorization.check_args returns false for a vector of SVectors
    # (see discussion below), so @avx falls back to @inbounds @fastmath
    # here rather than SIMD-transforming the loop itself.
    @avx for n ∈ axes(X,1)
        dₙ = zero(eltype(d))
        y  = X[n]
        # Hard-coded 3 coordinates: assumes 3-element points, matching the
        # SVector{3} data used in this thread.
        for p = 1:3
            dₙ += (x[p] - y[p])^2
        end
        d[n] = sqrt(dₙ)
    end
    d
end

Which gives:

@btime distances_avx($pg[1],$pg);
  4.214 μs (2 allocations: 30.45 KiB)

And the resulting output would be a vector with Euclidean distances from pg[1] to pg[1],pg[2],pg[3] etc…

I feel like I have “butchered” your original code. Would it be possible to write such a code for my case using StaticArrays? I often struggle working with StaticArrays, but I have always been told they are the most efficient to use for calculations etc.

Kind regards

It’ll be a while before that works. For now,

julia> using StaticArrays, LoopVectorization

julia> pg = [@SVector(rand(3)) for _ ∈ 1:3883];

julia> LoopVectorization.check_args(pg)
false

LoopVectorization uses check_args to decide whether or not to try and optimize the loop. If it returns false, it will not try. Instead, it’ll use @fastmath @inbounds, which works really well here.

But it doesn’t seem like that really matters:

julia> pg = [@SVector(rand(3)) for _ ∈ 1:3883];

julia> d = similar(pg, Float64); r = rand(length(d));

julia> @benchmark distances_avx!($d, $pg[1], $pg)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.718 μs (0.00% GC)
  median time:      2.719 μs (0.00% GC)
  mean time:        2.724 μs (0.00% GC)
  maximum time:     6.104 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark $d .= Base.FastMath.sqrt_fast.($r)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.728 μs (0.00% GC)
  median time:      2.733 μs (0.00% GC)
  mean time:        2.738 μs (0.00% GC)
  maximum time:     6.453 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

Just taking the square root of 3883 scalars takes about as long as calculating the 3883 distances.

4 Likes

Hi Elrod!

Thanks for taking time do look at it. I suppose you used the version of distances_avx! I had modified in my previous answer. Then I did the same test as you did in your answers above:

@benchmark distances_avx!($d, $pg[1], $pg)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.800 μs (0.00% GC)
  median time:      3.900 μs (0.00% GC)
  mean time:        3.987 μs (0.00% GC)
  maximum time:     10.988 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     8
@benchmark $d .= Base.FastMath.sqrt_fast.($r)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.589 μs (0.00% GC)
  median time:      2.644 μs (0.00% GC)
  mean time:        2.667 μs (0.00% GC)
  maximum time:     5.867 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

A bit fascinating how you are not seeing a difference between these two but I am. Using Julia 1.5.0

Also thanks for the explanation of loop_vectorization and checking if it applies

Kind regards

That difference is probably architecture related.
Skylake-X (and Cascadelake-X, which is almost identical) CPUs have AVX512, which doubles the size of the SIMD units, meaning each instruction can operate on 8 Float64s in parallel, instead of 4 Float64s like in CPUs with AVX(2).
Normally, how many clock cycles an instruction takes is independent of how wide it is.
However, on Skylake-X, the square root instruction actually takes twice as long for 8 Float64 as it does for 4 Float64.
So basically everything except the square roots are now twice as fast. Thanks to Out of Order (OoO) execution, the CPU can finish the operations required for the next square root while the last one is finishing. Hence, with AVX512, the CPU spends almost 100% of its time calculating square roots, while everything else happens in parallel.

Without AVX512, square roots are more or less equally fast, but the everything else now takes twice as long – and now takes longer than the square roots.

Also, testing that I see the same results on a tigerlake laptop (a newer architecture, with some differences from Skylake-X [the most striking one being that tigerlake has half the FMA performance…]):

julia> using StaticArrays, BenchmarkTools

julia> function distances_fm!(d, x, X)
           @inbounds @fastmath for n ∈ axes(X,1)
               dₙ = zero(eltype(d))
               y  = X[n]
               for p = 1:3
                   dₙ += (x[p] - y[p])^2
               end
               d[n] = sqrt(dₙ)
           end
           d
       end
distances_fm! (generic function with 1 method)

julia> pg = [@SVector(rand(3)) for _ ∈ 1:3883];

julia> d = similar(pg, Float64); r = rand(length(d));

julia> @benchmark distances_fm!($d, $pg[1], $pg)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.545 μs (0.00% GC)
  median time:      2.602 μs (0.00% GC)
  mean time:        2.590 μs (0.00% GC)
  maximum time:     5.024 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark $d .= Base.FastMath.sqrt_fast.($r)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.559 μs (0.00% GC)
  median time:      2.561 μs (0.00% GC)
  mean time:        2.568 μs (0.00% GC)
  maximum time:     3.647 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> versioninfo()
Julia Version 1.6.0-DEV.1324
Commit ae789ca1df (2020-10-24 01:55 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-11.0.0 (ORCJIT, tigerlake)

and I again see the same thing: the square roots by themselves take just as long as all the distances.

This of course makes it a little harder to optimize for your architecture.
If you want to speed up calculating the distances specifically, you can try this

# Repack the vector of 3-element points into an N x 3 matrix so that all
# x-coordinates (likewise all y's and z's) are contiguous in memory - the
# layout SIMD loads want.  The copy is required to actually change the
# memory layout: reinterpret/reshape and ' (adjoint) are all lazy.
pgmat = copy(reshape(reinterpret(Float64, pg), (3, length(pg)))'); # length(pg) x 3 matrix
# d[n] = Euclidean distance from point `x` (3 scalars) to row n of X (N x 3).
function distances_fm!(d, x, X::AbstractMatrix)
    # @nexprs unrolls to x_1 = x[1]; x_2 = x[2]; x_3 = x[3], hoisting the
    # loads from the (possibly wrapped/view) `x` out of the hot loop - see
    # note 2 later in this post on why wrapped arrays are loaded up front.
    @inbounds Base.Cartesian.@nexprs 3 p -> x_p = x[p];
    @inbounds @fastmath for n ∈ axes(X,1)
        dₙ = zero(eltype(d))
        # Unrolled over the 3 coordinates; assumes X has exactly 3 columns.
        Base.Cartesian.@nexprs 3 p -> dₙ += (x_p - X[n,p])^2
        d[n] = sqrt(dₙ)
    end
    d
end
# `view` passes row 1 of pgmat as the reference point without copying it.
@benchmark distances_fm!($d, view($pgmat, 1, :), $pgmat)

Benchmark:

julia> @benchmark distances_fm!($d, view($pgmat, 1, :), $pgmat)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.617 μs (0.00% GC)
  median time:      2.619 μs (0.00% GC)
  mean time:        2.624 μs (0.00% GC)
  maximum time:     4.654 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

Note that this requires a different memory layout than you had earlier: the vector of SVectors is basically a 3 x N matrix, which we’ve now transposed and copied into a new array (the copy is necessary to actually change the memory layout; transpose/adjoint/' are lazy) that is N x 3.

The code uses SIMD instructions, which act on multiple data elements in parallel, doing the same operation on each of them. That means it is calculating the distances in parallel.
If we call the elements of your SVector x, y, and z, that requires it load many xes, yes, and zes in parallel.

Loading contiguous numbers in parallel is faster. If you’re loading a bunch of numbers in parallel, you want them to be right next to each other.
That is, because you want to load

x_vec = [x_1, x_2, x_3, x_4, x_5, x_6, x_7, x_8]; # 8 per vector with AVX512
y_vec = [y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8]; # 8 per vector with AVX512
z_vec = [z_1, z_2, z_3, z_4, z_5, z_6, z_7, z_8]; # 8 per vector with AVX512

it helps if x_1, x_2, x_3, etc… are side by side in memory. Ditto for ys and zs.
They aren’t side by side in memory when you have a vector of SVectors. Instead, your memory looks like

[x_1, y_1, z_1, x_2, y_2, z_2, x_3,...]

It’s not that big a deal because they’re close to each other and the pattern is regular: LLVM will still load them contiguously, getting

v_1 = [x_1, y_1, z_1, x_2, y_2, z_2, x_3, y_3];
v_2 = [z_3, x_4, y_4, z_4, x_5, y_5, z_5, x_6];
v_3 = [y_6, z_6, x_7, y_7, z_7, x_8, y_8, z_8];

and then it shuffles their contents until it gets x_vec, y_vec, z_vec, as above. The shuffling takes a few operations, but it is nowhere near as bad as if it had to gather them from more random or much more distant regions of memory.
On your CPU, where that part of the code (i.e., not the square roots) is the bottleneck, cutting out the shuffles may make the difference.

It may also be the case that this shuffling is way faster on AVX512; the actual instruction it uses vpermt2pd is AVX512 only; it could even be the case that without this instruction LLVM doesn’t think SIMD is worth it and doesn’t even try. Mind showing me the results of @code_native debuginfo=:none syntax=:intel distances_fm!(d, pg[1], pg)?

A couple other notes here:

  1. When it comes to changing memory layouts, you have to consider the rest of your code. Maybe the rest benefits from the vector of SVectors. If you have to pay a price to switch memory layouts, you’ll have to consider whether it is worth it. One reason I went into the influence of memory layouts above is to help you understand the tradeoffs and make decisions about 3xN vs Nx3 in the rest of your code.
  2. What’s up with Base.Cartesian.@nexprs? There’s an issue where wrapping arrays in structs doesn’t get handled well in loops, so I do this to load from the view before the loop, and then reference those scalars inside of it.

I’m curious if/by how much this version on a matrix is faster on your computer.

6 Likes

Wait: tiger lake has half the fma performance? Why on earth would they do that?

I’m not sure, but it’s been a “thing” historically.
Many early Xeons, like Xeon 4110, also had just 1 FMA unit, just like icelake-client and tigerlake do.
I don’t know why. Maybe it is to save die space (“who does gemm on a laptop?”). I doubt it’s for the sake of market differentiation. Icelake-server will have 2 fma units, but I think the fact that one of them has 4 cores and consumes 15-28 watts and the other 28 with ~200 watts is the giveaway that they serve different purposes and markets.

To be precise, the fma isn’t any slower on icelake-client/tigerlake, it’s just that the CPU is only capable of executing 1 per clock cycle instead of 2 per clock cycle; fmas are less parallel. So as long as your code doesn’t demand more than 1 fma per cycle, you should be fine.

My tigerlake laptop is a little faster than my 10980XE in many singlethreaded AVX512 benchmarks*, like the distances calculation above (which does have a few fmas in it) and SIMD RNGs (no fmas).

*caveat

*The major caveat here being that the laptop will slow down after a short period doing work, and immediately if using more threads; the desktop CPU is configured to run at the same speed regardless of number of active cores or how long it’s been running. A single threaded benchmark on that desktop is representative of a long running multithreaded job, but not so for the laptop which will throttle significantly.
Given similar clock speeds between a laptop and desktop, the desktop will be much faster for serious work.

But when it comes to code dominated by fmas like gemm (and lots of other linear algebra), tigerlake:

julia> function AmulB!(C, A, B)
           @avx for m ∈ axes(A,1), n ∈ axes(B,2)
               Cmn = zero(eltype(C))
               for k ∈ axes(A,2)
                   Cmn += A[m,k] * B[k,n]
               end
               C[m,n] = Cmn
           end
       end
AmulB! (generic function with 1 method)

julia> M = K = N = 144; C = Matrix{Float64}(undef, M, N); A = rand(M, K); B = rand(K, N);

julia> @benchmark AmulB!($C, $A, $B)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     83.458 μs (0.00% GC)
  median time:      89.555 μs (0.00% GC)
  mean time:        88.861 μs (0.00% GC)
  maximum time:     193.471 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> 2e-9M*K*N / 83.458e-6 #calculate GFLOPS
71.55656737520671

julia> versioninfo()
Julia Version 1.5.3-pre.0
Commit 5905edebb1 (2020-09-24 05:09 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, tigerlake)

cascadelake-x:

julia> using LoopVectorization, BenchmarkTools

julia> function AmulB!(C, A, B)
           @avx for m ∈ axes(A,1), n ∈ axes(B,2)
               Cmn = zero(eltype(C))
               for k ∈ axes(A,2)
                   Cmn += A[m,k] * B[k,n]
               end
               C[m,n] = Cmn
           end
       end
AmulB! (generic function with 1 method)

julia> M = K = N = 144; C = Matrix{Float64}(undef, M, N); A = rand(M, K); B = rand(K, N);

julia> @benchmark AmulB!($C, $A, $B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     47.726 μs (0.00% GC)
  median time:      47.899 μs (0.00% GC)
  mean time:        47.964 μs (0.00% GC)
  maximum time:     115.973 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> 2e-9M*K*N / 47.726e-6 #calculate GFLOPS
125.13028537903871

julia> versioninfo()
Julia Version 1.5.3-pre.0
Commit 5905edebb1* (2020-09-24 05:09 UTC)
Platform Info:
  OS: Linux (x86_64-generic-linux)
  CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, cascadelake)
Environment:
  JULIA_NUM_THREADS = 18

125 vs 71 GFLOPS, about 75% higher FLOPS for the CPU with 2 fma units.

it makes a huge difference. Tigerlake won’t really be faster than Zen2 CPUs here, which can do 2x 256-bit fmas per clock cycle (note that Zen1 was capable of just 1x 256-bit fma/cycle), which matches the 1x 512-bit in theoretical peak throughput.
It’s easier to realize peak throughput with AVX512 (lower bandwidth between registers and L1 & L2 cache needed due to 4x larger register file to hold intermediate values), but regarding gemm itself, it’s easy enough to get close to peak performance with AVX2 that Zen2 should be very close clock-for-clock.
Supposedly the upcoming Zen3 will have increased floating point throughput relative to Zen2. I don’t know what that means (maybe 3x 256-bit fmas per cycle?), but that could mean it would actually beat tigerlake in single-threaded fma-dominated workloads.
Zen2+ should obviously win multithreaded.
Maybe @stillyslalom can post the above gemm benchmark for comparison on his 4800U.

3 Likes
julia> @belapsed AmulB!($C, $A, $B)
8.92e-5

julia> 2e-9*M*K*N / ans
66.95031390134531

julia> BLAS.set_num_threads(1)

julia> @belapsed mul!($C, $A, $B)
0.000112101

julia> 2e-9*M*K*N / ans
53.27310193486232
julia> versioninfo()
Julia Version 1.5.2
Commit 539f3ce943 (2020-09-23 23:17 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: AMD Ryzen 9 4900HS with Radeon Graphics
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, znver2)
Environment:
  JULIA_NUM_THREADS = 8
1 Like

More or less equally fast at gemm as tigerlake, thanks for confirming.

Also OpenBLAS’s performance is really disappointing there. Hopefully that’ll be fixed in Julia 1.6.

1 Like

Thank you so much for that incredibly detailed explanation!

This was my first kind of introduction down into the “nitty gritty” of performant CPU code and was a very interesting read. Of course I did not grasp everything but your explanation of how memory layout can play an impact was really great and I really liked it - makes me eager to test some more things. Generally I had just been using StaticArrays since I was always told they would be faster, but I always found them much more difficult to work with. I tested the three functions you provided:

 @benchmark distances_fm!($d, $pg[1], $pg)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.800 μs (0.00% GC)
  median time:      3.900 μs (0.00% GC)
  mean time:        3.938 μs (0.00% GC)
  maximum time:     9.425 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     8
@benchmark $d .= Base.FastMath.sqrt_fast.($r)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.589 μs (0.00% GC)
  median time:      2.600 μs (0.00% GC)
  mean time:        2.672 μs (0.00% GC)
  maximum time:     5.556 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9
 @benchmark distances_fm!($d, view($pgmat, 1, :), $pgmat)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.589 μs (0.00% GC)
  median time:      2.645 μs (0.00% GC)
  mean time:        2.674 μs (0.00% GC)
  maximum time:     5.589 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

Which seems to confirm that what is happening under the hood is as you already have described. I also took your last function and used it on the same case in my initial post (i.e. pg with 100 elements):

@benchmark distances_fm!($d, view($pgmat, 1, :), $pgmat)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     68.544 ns (0.00% GC)
  median time:      70.698 ns (0.00% GC)
  mean time:        71.367 ns (0.00% GC)
  maximum time:     138.730 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     976

So it has become extremely fast! Again thank you so much.

Kind regards

3 Likes

Why do you use a function that calls another functions instead of just calling directly the inner function distances_avx!?

If I want to write both an in-place and allocating version, I make the allocating version a wrapper of the in-place one to reduce code duplication.

Preferably, people can use the in-place version in their own code, but maybe that isn’t always practical.

4 Likes