Hi!

Often, it takes no extra effort at all.

```
function dot_product(x, y)
out = zero(promote_type(eltype(x), eltype(y)))
@inbounds for i in eachindex(x,y)
out += x[i] * y[i]
end
out
end
x, y = randn(256), randn(256);
```

On a computer with avx512:

```
julia> @code_native dot_product(x, y)
.text
; Function dot_product {
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: REPL[4]:2
subq $40, %rsp
movq 24(%rdi), %rax
;}
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r8
;}}}}}}}
; Location: abstractarray.jl:218
; Function _all_match_first; {
; Location: abstractarray.jl:224
; Function #89; {
; Location: abstractarray.jl:218
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: array.jl:155
movq 24(%rsi), %rcx
;}
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rcx, %rdx
sarq $63, %rdx
andnq %rcx, %rdx, %rcx
;}}}}}}}}}
; Function _all_match_first; {
; Location: promotion.jl:403
cmpq %rcx, %r8
;}
jne L300
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
testq %rax, %rax
;}}}}}}}}}
jle L76
movq (%rdi), %rcx
movq (%rsi), %rdx
; Location: REPL[4]:3
cmpq $32, %r8
jae L88
vxorpd %xmm0, %xmm0, %xmm0
movl $1, %esi
jmp L259
L76:
vxorps %xmm0, %xmm0, %xmm0
; Location: REPL[4]:6
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
L88:
movabsq $9223372036854775776, %rdi # imm = 0x7FFFFFFFFFFFFFE0
andq %r8, %rdi
leaq 1(%rdi), %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %eax, %eax
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nopl (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
;}
; Function +; {
; Location: float.jl:395
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
vaddpd %zmm0, %zmm1, %zmm0
vaddpd %zmm0, %zmm2, %zmm0
vaddpd %zmm0, %zmm3, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddpd %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %zmm1, %zmm0, %zmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddpd %zmm1, %zmm0, %zmm0
cmpq %rdi, %r8
;}
; Location: REPL[4]:3
je L292
L259:
addq $-1, %rsi
nopw (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L272:
vmovsd (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
;}
; Function +; {
; Location: float.jl:395
vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0
;}
; Function iterate; {
; Location: range.jl:591
; Function ==; {
; Location: promotion.jl:403
addq $1, %rsi
cmpq %rsi, %r8
;}}
jne L272
; Location: REPL[4]:6
L292:
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:218
L300:
movabsq $jl_system_image_data, %rax
movq %rax, 8(%rsp)
movabsq $jl_system_image_data, %rax
movq %rax, 16(%rsp)
movq %rdi, 24(%rsp)
movq %rsi, 32(%rsp)
movabsq $jl_invoke, %rax
movabsq $140450899053840, %rdi # imm = 0x7FBD45F24D10
leaq 8(%rsp), %rsi
movl $4, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
;}}}
```

All the `zmm`

registers you are 512 bit registers. `ymm`

is 256, and `xmm`

is 128.

The main loop body is:

```
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
;}
; Function +; {
; Location: float.jl:395
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
```

L128 4 registers from memory (a total of 32 doubles) of one array, and then it uses fused multiply-add instructions (vfmadd) to multiply numbers from another region of memory (the other array) with the just loaded values, and add them to accumulated dot products.

Besides this making the dot product dramatically faster, you can also compare this with pairwise summation – it should also be more accurate than the naive algorithm.

It is fast:

```
julia> using BenchmarkTools
julia> @btime dot_product($x, $y)
12.979 ns (0 allocations: 0 bytes)
4.738430453861962
julia> @btime $x' * $y
25.089 ns (0 allocations: 0 bytes)
4.738430453861962
```

For comparison, on my similarly-clocked Ryzen, the minimum time is about 34ns.

If you’d like a little more involved example, I recently been working on a problem that involves a mixture of zero-mean tri-variate T distributions. A Gibbs sampler is straightforward to implement for this problem; here is the function calculating the unnormalized conditional probabilities of group membership:

```
using Random, BenchmarkTools, SpecialFunctions
function update_individual_probs_v1!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ,
Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
@inbounds for g ∈ 1:NG
Li11 = Li[1,g]
Li21 = Li[2,g]
Li31 = Li[3,g]
Li22 = Li[4,g]
Li32 = Li[5,g]
Li33 = Li[6,g]
νfactor = (ν[g] - 2) / ν[g]
exponent = T(-1.5) - T(0.5) * ν[g]
base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) +
lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
for i ∈ 1:size(probabilities,1)
lx₁ = Li11 * x[i,1]
lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
end
end
end
function update_individual_probs_v1!(probabilities::AbstractMatrix{T}, baseπ,
Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
update_individual_probs_v1!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
```

`Li`

is the inverse of the Cholesky factor of the covariance matrix. So we’re calculationg (Li * x)’(Li * x), for each of `NG`

groups. Creating a simple fake data set:

```
T = Float32
NG = 6;
N = 1024;
X = randn(T, N,3);
probabilities = Matrix{T}(undef, N, NG);
Li = rand(T, 6,NG);
ν = T(N / NG) + 4 .+ rand(T, NG);
baseπ = rand(T, NG);
baseπ ./= sum(baseπ);
```

This yields:

```
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 118.813 μs (0.00% GC)
median time: 121.939 μs (0.00% GC)
mean time: 127.839 μs (0.00% GC)
maximum time: 195.095 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
```

The `@simd`

macro does not help.

However, I wrote a macro in SLEEFwrap that (with a few assumptions) can vectorize loops, including those that contain special functions (via SLEEF):

```
using SIMDPirates, SLEEFwrap
using SLEEFwrap: @restrict_simd
@generated function update_individual_probs!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ::AbstractVector{T}, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
quote
@inbounds for g ∈ 1:NG
Li11 = Li[1,g]
Li21 = Li[2,g]
Li31 = Li[3,g]
Li22 = Li[4,g]
Li32 = Li[5,g]
Li33 = Li[6,g]
νfactor = (ν[g] - 2) / ν[g]
exponent = T(-1.5) - T(0.5) * ν[g]
base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) +
lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
@restrict_simd $T for i ∈ 1:size(probabilities,1)
lx₁ = Li11 * x[i,1]
lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
end
end
end
end
function update_individual_probs!(probabilities::AbstractMatrix{T}, baseπ,
Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
update_individual_probs!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
```

This yields:

```
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 7.363 μs (0.00% GC)
median time: 7.867 μs (0.00% GC)
mean time: 7.990 μs (0.00% GC)
maximum time: 16.006 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 4
```

For some reason the times are about 20 microseconds slowed from the REPL than from IJulia. I showed IJulia times above.

EDIT: Nevermind on that – the runtime is data-dependent. In the REPL, I was using the fake data I generated above. In IJulia, I was using more realistic fake data and processing it as I normally would before getting to that step.