Understanding why one function is faster than the other

If you’d like a faster function:

using LoopVectorization

# Allocating wrapper: build an output vector of the right eltype, then delegate
# to the in-place kernel. Promoting with Float32 guarantees a floating-point
# element type even when X and Y hold integers, keeping `sqrt` type-stable.
function distances_avx(X::AbstractMatrix{XT}, Y::AbstractMatrix{YT}) where {XT,YT}
    T = promote_type(XT, YT, Float32)
    return distances_avx!(Vector{T}(undef, size(X, 1)), X, Y)
end

# In-place kernel: write into `d[i]` the Euclidean distance between row i of X
# and row i of Y. `@avx` (LoopVectorization) vectorizes across rows; NOTE: it
# assumes d, X, and Y have compatible sizes — it does not bounds-check.
function distances_avx!(d, X, Y)
    @avx for i ∈ axes(X, 1)
        s = zero(eltype(d))          # accumulator in the output's eltype
        for j ∈ axes(X, 2)
            s += (X[i, j] - Y[i, j])^2
        end
        d[i] = sqrt(s)
    end
    return d
end

This yields:

julia> distances_avx(X1L', Y1L') ≈ colwise(Euclidean(), X1L, Y1L)
true

julia> X1Lt = copy(X1L'); Y1Lt = copy(Y1L');

julia> @benchmark distances_avx($X1L', $Y1L')
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     234.650 ns (0.00% GC)
  median time:      244.064 ns (0.00% GC)
  mean time:        263.833 ns (5.22% GC)
  maximum time:     2.707 μs (86.20% GC)
  --------------
  samples:          10000
  evals/sample:     409

julia> @benchmark distances_avx($X1Lt, $Y1Lt)
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     99.239 ns (0.00% GC)
  median time:      105.655 ns (0.00% GC)
  mean time:        121.963 ns (10.68% GC)
  maximum time:     1.207 μs (83.88% GC)
  --------------
  samples:          10000
  evals/sample:     944

julia> @benchmark colwise(Euclidean(), $X1L, $Y1L)
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     310.889 ns (0.00% GC)
  median time:      356.742 ns (0.00% GC)
  mean time:        376.197 ns (3.68% GC)
  maximum time:     4.532 μs (87.48% GC)
  --------------
  samples:          10000
  evals/sample:     244

julia> @benchmark calc_euclidean($X1L,$Y1L)
BenchmarkTools.Trial:
  memory estimate:  896 bytes
  allocs estimate:  1
  --------------
  minimum time:     796.737 ns (0.00% GC)
  median time:      803.463 ns (0.00% GC)
  mean time:        834.667 ns (1.82% GC)
  maximum time:     11.567 μs (88.25% GC)
  --------------
  samples:          10000
  evals/sample:     95

You can get a bit more performance by evaluating it in-place:

julia> d = similar(X1L, size(X1L,2));

julia> @benchmark distances_avx!($d, $X1L', $Y1L')
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     212.117 ns (0.00% GC)
  median time:      212.466 ns (0.00% GC)
  mean time:        212.701 ns (0.00% GC)
  maximum time:     258.545 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     506

julia> @benchmark distances_avx!($d, $X1Lt, $Y1Lt)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     91.683 ns (0.00% GC)
  median time:      91.764 ns (0.00% GC)
  mean time:        91.873 ns (0.00% GC)
  maximum time:     114.600 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     957

Also:

@code_llvm @Meta.lower colwise(Euclidean(), X1L, Y1L)

Just do `@code_llvm colwise(Euclidean(), X1L, Y1L)`. You’re not seeing the correct output because the `@Meta.lower` turns the call into an expression, so `@code_llvm` inspects the lowering machinery instead of `colwise`.

6 Likes