As an exercise for myself, I also did this with VectorizationBase.jl on another computer with a Xeon. Admittedly, the documentation there could be improved, but perhaps it is still not meant to be used directly?
julia> using VectorizationBase
julia> dist(a::NTuple{N,T}, b::NTuple{N,T}) where {N,T} = ntuple(sqrt(Vec(a...)^2 + Vec(b...)^2), Val(N))
dist (generic function with 2 methods)
julia> a = b = (1.f0,2.f0,3.f0,4.f0,5.f0,6.f0,7.f0,8.f0)
(1.0f0, 2.0f0, 3.0f0, 4.0f0, 5.0f0, 6.0f0, 7.0f0, 8.0f0)
julia> @code_llvm debuginfo=:none dist(a,b)
; Function Attrs: uwtable
define void @julia_dist_1430([8 x float]* noalias nocapture sret([8 x float]) %0, [8 x float]* nocapture nonnull readonly align 4 dereferenceable(32) %1, [8 x float]* nocapture nonnull readonly align 4 dereferenceable(32) %2) #0 {
top:
%3 = bitcast [8 x float]* %1 to <8 x float>*
%4 = load <8 x float>, <8 x float>* %3, align 4
%res.i26 = fmul reassoc nsz arcp contract afn <8 x float> %4, %4
%5 = bitcast [8 x float]* %2 to <8 x float>*
%6 = load <8 x float>, <8 x float>* %5, align 4
%res.i17 = fmul reassoc nsz arcp contract afn <8 x float> %6, %6
%res.i16 = fadd nsz contract <8 x float> %res.i26, %res.i17
%res.i15 = call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %res.i16)
%7 = bitcast [8 x float]* %0 to <8 x float>*
store <8 x float> %res.i15, <8 x float>* %7, align 4
ret void
}
julia> @code_native debuginfo=:none dist(a,b)
.text
.file "dist"
.globl julia_dist_1446 # -- Begin function julia_dist_1446
.p2align 4, 0x90
.type julia_dist_1446,@function
julia_dist_1446: # @julia_dist_1446
.cfi_startproc
# %bb.0: # %top
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
vmovups (%rdx), %ymm0
vmovups (%r8), %ymm1
movq %rcx, %rax
vmulps %ymm1, %ymm1, %ymm1
vfmadd231ps %ymm0, %ymm0, %ymm1 # ymm1 = (ymm0 * ymm0) + ymm1
vsqrtps %ymm1, %ymm0
vmovups %ymm0, (%rcx)
popq %rbp
vzeroupper
retq
.Lfunc_end0:
.size julia_dist_1446, .Lfunc_end0-julia_dist_1446
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
What is impressive here is that the compiler chose an advanced instruction, vfmadd231ps, which is a fused multiply-add:
https://www.felixcloutier.com/x86/vfmadd132ps:vfmadd213ps:vfmadd231ps
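For comparison, here is a minimal sketch in plain Base Julia (no VectorizationBase; dist_base is my own name, and it is an assumption that the optimizer packs it into the same wide operations). With @fastmath the compiler is permitted to contract the multiply and add, so on FMA-capable hardware it should emit a vfmadd* here as well:
julia> dist_base(a::NTuple{N,T}, b::NTuple{N,T}) where {N,T} = ntuple(i -> @fastmath(sqrt(a[i]*a[i] + b[i]*b[i])), Val(N))
dist_base (generic function with 1 method)
Inspecting it with @code_native debuginfo=:none dist_base(a, b) should show the same vfmadd/vsqrt pattern, though whether LLVM's SLP vectorizer produces a single 8-wide ymm operation rather than unrolled scalar FMAs can vary by Julia version and hardware.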
The great thing about Julia is how easily it adapts to the architecture it is being used on. The same function can also use the full 512-bit AVX-512 registers (note the zmm* registers in the output below) simply by changing the element type: passing Int64 inputs, which the compiler converts to Float64 (vcvtqq2pd), fills an entire zmm register with eight 64-bit lanes:
julia> a = b = (1, 2, 3, 4, 5, 6, 7, 8)
(1, 2, 3, 4, 5, 6, 7, 8)
julia> @code_native debuginfo=:none dist(a,b)
.text
.file "dist"
.globl julia_dist_1521 # -- Begin function julia_dist_1521
.p2align 4, 0x90
.type julia_dist_1521,@function
julia_dist_1521: # @julia_dist_1521
.cfi_startproc
# %bb.0: # %top
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
vcvtqq2pd (%rdx), %zmm0
vcvtqq2pd (%r8), %zmm1
vmulpd %zmm1, %zmm1, %zmm1
vfmadd231pd %zmm0, %zmm0, %zmm1 # zmm1 = (zmm0 * zmm0) + zmm1
vsqrtpd %zmm1, %zmm0
movq %rcx, %rax
vmovupd %ymm0, (%rcx)
vextractf64x4 $1, %zmm0, 32(%rcx)
popq %rbp
vzeroupper
retq
.Lfunc_end0:
.size julia_dist_1521, .Lfunc_end0-julia_dist_1521
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
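As a quick check of what the hardware offers, VectorizationBase can report the natural SIMD width for a given element type; my understanding is that pick_vector_width is the relevant function (hedged: check the package for the current API, and I have not verified the value on this exact machine):
julia> VectorizationBase.pick_vector_width(Float64)  # lanes per native SIMD register; 8 Float64 lanes would correspond to one 512-bit zmm
A result of 8 for Float64 would match the eight 64-bit lanes per zmm register seen in the assembly above.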