If you’d like a faster function:
using LoopVectorization
"""
    distances_avx(X, Y)

Allocate an output vector with one entry per row of `X` and fill it by calling
`distances_avx!`, so entry `n` holds the Euclidean distance between row `n` of `X`
and row `n` of `Y`.

The output element type is `promote_type(XT, YT, Float32)`, which is always a type
that can hold the result of `sqrt`, even for integer inputs.
"""
function distances_avx(X::AbstractMatrix{XT}, Y::AbstractMatrix{YT}) where {XT,YT}
    T = promote_type(XT, YT, Float32)
    out = Vector{T}(undef, size(X, 1))
    return distances_avx!(out, X, Y)
end
"""
    distances_avx!(d, X, Y)

Overwrite `d[n]` with the Euclidean distance between row `n` of `X` and row `n` of
`Y` for every `n ∈ axes(X, 1)`, then return `d`.

# Arguments
- `d`: output array; must be indexable at every `n ∈ axes(X, 1)`. Its element type
  is used for the running sum of squares.
- `X`, `Y`: inputs indexed as `[n, p]`. The loop ranges are taken from `X`, so `Y`
  must be indexable everywhere `X` is.

# Throws
- `BoundsError` if `d` or `Y` cannot be indexed over the ranges taken from `X`.
"""
function distances_avx!(d, X, Y)
    # LoopVectorization documents that `@avx` performs no bounds checking, so a `d`
    # or `Y` smaller than `X` would read/write out of bounds silently. Validate the
    # full index ranges once, outside the hot loop.
    checkbounds(d, axes(X, 1))
    checkbounds(Y, axes(X, 1), axes(X, 2))
    @avx for n ∈ axes(X,1)
        # Accumulate in the output's element type to keep the reduction type-stable.
        dₙ = zero(eltype(d))
        for p ∈ axes(X,2)
            dₙ += (X[n,p] - Y[n,p])^2
        end
        d[n] = sqrt(dₙ)
    end
    return d
end
This yields:
julia> distances_avx(X1L', Y1L') ≈ colwise(Euclidean(), X1L, Y1L)
true
julia> X1Lt = copy(X1L'); Y1Lt = copy(Y1L');
julia> @benchmark distances_avx($X1L', $Y1L')
BenchmarkTools.Trial:
memory estimate: 896 bytes
allocs estimate: 1
--------------
minimum time: 234.650 ns (0.00% GC)
median time: 244.064 ns (0.00% GC)
mean time: 263.833 ns (5.22% GC)
maximum time: 2.707 μs (86.20% GC)
--------------
samples: 10000
evals/sample: 409
julia> @benchmark distances_avx($X1Lt, $Y1Lt)
BenchmarkTools.Trial:
memory estimate: 896 bytes
allocs estimate: 1
--------------
minimum time: 99.239 ns (0.00% GC)
median time: 105.655 ns (0.00% GC)
mean time: 121.963 ns (10.68% GC)
maximum time: 1.207 μs (83.88% GC)
--------------
samples: 10000
evals/sample: 944
julia> @benchmark colwise(Euclidean(), $X1L, $Y1L)
BenchmarkTools.Trial:
memory estimate: 896 bytes
allocs estimate: 1
--------------
minimum time: 310.889 ns (0.00% GC)
median time: 356.742 ns (0.00% GC)
mean time: 376.197 ns (3.68% GC)
maximum time: 4.532 μs (87.48% GC)
--------------
samples: 10000
evals/sample: 244
julia> @benchmark calc_euclidean($X1L,$Y1L)
BenchmarkTools.Trial:
memory estimate: 896 bytes
allocs estimate: 1
--------------
minimum time: 796.737 ns (0.00% GC)
median time: 803.463 ns (0.00% GC)
mean time: 834.667 ns (1.82% GC)
maximum time: 11.567 μs (88.25% GC)
--------------
samples: 10000
evals/sample: 95
You can get a bit more performance by evaluating it in place:
julia> d = similar(X1L, size(X1L,2));
julia> @benchmark distances_avx!($d, $X1L', $Y1L')
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 212.117 ns (0.00% GC)
median time: 212.466 ns (0.00% GC)
mean time: 212.701 ns (0.00% GC)
maximum time: 258.545 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 506
julia> @benchmark distances_avx!($d, $X1Lt, $Y1Lt)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 91.683 ns (0.00% GC)
median time: 91.764 ns (0.00% GC)
mean time: 91.873 ns (0.00% GC)
maximum time: 114.600 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 957
Also:
@code_llvm @Meta.lower colwise(Euclidean(), X1L, Y1L)
Just do `@code_llvm`. You’re not seeing the correct output because of the `@Meta.lower`.