# Trigonometric functions do not use SIMD - Solved: use LoopVectorization.jl

I noticed that computations with trigonometric functions cannot use SIMD. Is this a nature of trigonometric functions? I would like to learn more details.

My test code is as follows:

function two_mul(out, a1, a2)
@inbounds @simd for i in eachindex(a1)
out[i] = (a1[i]) * a2[i]  # no trig function here
end
end

a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);

@code_native two_mul(out, a1, a2)
@btime two_mul(out, a1, a2)

Outputs and following, and I can find the usage of vmulpd

.text
; β @ In[168]:1 within `two_mul'
movq	%rsi, -8(%rsp)
movq	8(%rsi), %rax
; β @ In[168]:2 within `two_mul'
; ββ @ simdloop.jl:69 within `macro expansion'
; βββ @ abstractarray.jl:212 within `eachindex'
; ββββ @ abstractarray.jl:95 within `axes1'
; βββββ @ abstractarray.jl:75 within `axes'
; ββββββ @ array.jl:155 within `size'
movq	24(%rax), %rcx
; ββββββ
; ββββββ @ tuple.jl:157 within `map'
; βββββββ @ range.jl:326 within `OneTo' @ range.jl:317
; ββββββββ @ promotion.jl:409 within `max'
testq	%rcx, %rcx
; ββββββββ
; ββ @ simdloop.jl:72 within `macro expansion'
jle	L279
movq	%rcx, %rdx
sarq	\$63, %rdx
andnq	%rcx, %rdx, %r8
movq	(%rsi), %rdi
movq	16(%rsi), %rdx
movq	(%rax), %rcx
movq	(%rdx), %rdx
movq	(%rdi), %rsi
; ββ @ simdloop.jl:75 within `macro expansion'
cmpq	\$32, %r8
jae	L63
xorl	%edi, %edi
jmp	L256
; ββ @ simdloop.jl:75 within `macro expansion'
L63:
leaq	(%rsi,%r8,8), %rdi
leaq	(%rcx,%r8,8), %rax
leaq	(%rdx,%r8,8), %r9
cmpq	%rax, %rsi
setb	%r10b
cmpq	%rdi, %rcx
setb	%r11b
andb	%r10b, %r11b
cmpq	%r9, %rsi
setb	%r9b
cmpq	%rdi, %rdx
setb	%al
andb	%r9b, %al
orb	%r11b, %al
cmpb	\$1, %al
jne	L128
movb	\$1, %al
; ββ @ simdloop.jl:75 within `macro expansion'
testb	%al, %al
je	L128
xorl	%edi, %edi
jmp	L256
; ββ @ simdloop.jl:75 within `macro expansion'
L128:
movl	%r8d, %eax
andl	\$31, %eax
movq	%r8, %rdi
subq	%rax, %rdi
xorl	%eax, %eax
nop
; ββ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; βββ @ array.jl:809 within `getindex'
L144:
vmovupd	(%rcx,%rax,8), %zmm0
vmovupd	64(%rcx,%rax,8), %zmm1
vmovupd	128(%rcx,%rax,8), %zmm2
vmovupd	192(%rcx,%rax,8), %zmm3
; βββ
; βββ @ float.jl:405 within `*'
vmulpd	(%rdx,%rax,8), %zmm0, %zmm0
vmulpd	64(%rdx,%rax,8), %zmm1, %zmm1
vmulpd	128(%rdx,%rax,8), %zmm2, %zmm2
vmulpd	192(%rdx,%rax,8), %zmm3, %zmm3
; βββ
; βββ @ array.jl:847 within `setindex!'
vmovupd	%zmm0, (%rsi,%rax,8)
vmovupd	%zmm1, 64(%rsi,%rax,8)
vmovupd	%zmm2, 128(%rsi,%rax,8)
vmovupd	%zmm3, 192(%rsi,%rax,8)
; βββ
; ββ @ simdloop.jl:78 within `macro expansion'
; βββ @ int.jl:86 within `+'
cmpq	%rax, %rdi
jne	L144
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
cmpq	%rdi, %r8
je	L279
nopl	(%rax,%rax)
; ββ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; βββ @ array.jl:809 within `getindex'
L256:
vmovsd	(%rcx,%rdi,8), %xmm0    # xmm0 = mem[0],zero
; βββ
; βββ @ float.jl:405 within `*'
vmulsd	(%rdx,%rdi,8), %xmm0, %xmm0
; βββ
; βββ @ array.jl:847 within `setindex!'
vmovsd	%xmm0, (%rsi,%rdi,8)
; βββ
; ββ @ simdloop.jl:78 within `macro expansion'
; βββ @ int.jl:86 within `+'
incq	%rdi
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
; βββ @ int.jl:82 within `<'
cmpq	%r8, %rdi
; βββ
jb	L256
; ββ
; β @ simdloop.jl within `two_mul'
L279:
movabsq	\$jl_system_image_data, %rax
; β
; β @ In[168]:2 within `two_mul'
vzeroupper
retq
; β
68.392 ΞΌs (0 allocations: 0 bytes)

function two_mul_sin(out, a1, a2)
@inbounds @simd for i in eachindex(a1)
out[i] = sin(a1[i]) * a2[i]
end
end

a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);

@code_native two_mul_sin(out, a1, a2)
@btime two_mul_sin(out, a1, a2)

Outputs:

.text
; β @ In[169]:1 within `two_mul_sin'
pushq	%rbp
pushq	%r15
pushq	%r14
pushq	%r13
pushq	%r12
pushq	%rbx
pushq	%rax
movq	%rsi, (%rsp)
movq	8(%rsi), %r14
; β @ In[169]:2 within `two_mul_sin'
; ββ @ simdloop.jl:69 within `macro expansion'
; βββ @ abstractarray.jl:212 within `eachindex'
; ββββ @ abstractarray.jl:95 within `axes1'
; βββββ @ abstractarray.jl:75 within `axes'
; ββββββ @ array.jl:155 within `size'
movq	24(%r14), %rax
; ββββββ
; ββββββ @ tuple.jl:157 within `map'
; βββββββ @ range.jl:326 within `OneTo' @ range.jl:317
; ββββββββ @ promotion.jl:409 within `max'
testq	%rax, %rax
; ββββββββ
; ββ @ simdloop.jl:72 within `macro expansion'
jle	L100
movq	%rax, %rcx
sarq	\$63, %rcx
andnq	%rax, %rcx, %r15
movq	(%rsi), %r12
movq	16(%rsi), %r13
xorl	%ebx, %ebx
movabsq	\$sin, %rbp
nopl	(%rax,%rax)
; ββ @ simdloop.jl:77 within `macro expansion' @ In[169]:3
; βββ @ array.jl:809 within `getindex'
L64:
movq	(%r14), %rax
vmovsd	(%rax,%rbx,8), %xmm0    # xmm0 = mem[0],zero
; βββ
callq	*%rbp
; βββ @ array.jl:809 within `getindex'
movq	(%r13), %rax
; βββ
; βββ @ float.jl:405 within `*'
vmulsd	(%rax,%rbx,8), %xmm0, %xmm0
; βββ
; βββ @ array.jl:847 within `setindex!'
movq	(%r12), %rax
vmovsd	%xmm0, (%rax,%rbx,8)
; βββ
; ββ @ simdloop.jl:78 within `macro expansion'
; βββ @ int.jl:86 within `+'
incq	%rbx
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
; βββ @ int.jl:82 within `<'
cmpq	%r15, %rbx
; βββ
jb	L64
; ββ
; β @ simdloop.jl within `two_mul_sin'
L100:
movabsq	\$jl_system_image_data, %rax
; β
; β @ In[169]:2 within `two_mul_sin'
popq	%rbx
popq	%r12
popq	%r13
popq	%r14
popq	%r15
popq	%rbp
retq
; β
526.181 ΞΌs (0 allocations: 0 bytes)
versioninfo()

Julia Version 1.5.2
Commit 539f3ce943 (2020-09-23 23:17 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Xeon(R) W-2133 CPU @ 3.60GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, skylake-avx512)
Environment:

Currently juliaβs special functions are unable to vectorize automatically by themself. That said, check out LoopVectorization If you write @inbounds @avx for i in eachindex(a1) you should get everything to vectorize.

6 Likes

Yes, that works:

julia> using LoopVectorization, BenchmarkTools

julia> a1 = rand(80_000);

julia> a2 = rand(80_000);

julia> out = similar(a1);

julia> function two_mul_sin!(out, a1, a2)
@inbounds @simd for i in eachindex(a1)
out[i] = sin(a1[i]) * a2[i]
end
end
two_mul_sin! (generic function with 1 method)

julia> function two_mul_sin_avx!(out, a1, a2)
@avx for i in eachindex(a1)
out[i] = sin(a1[i]) * a2[i]
end
end
two_mul_sin_avx! (generic function with 1 method)

julia> @benchmark two_mul_sin_avx!(\$out, \$a1, \$a2)
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     81.675 ΞΌs (0.00% GC)
median time:      84.828 ΞΌs (0.00% GC)
mean time:        87.356 ΞΌs (0.00% GC)
maximum time:     173.757 ΞΌs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

julia> @benchmark two_mul_sin!(\$out, \$a1, \$a2)
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     354.765 ΞΌs (0.00% GC)
median time:      372.535 ΞΌs (0.00% GC)
mean time:        375.081 ΞΌs (0.00% GC)
maximum time:     1.349 ms (0.00% GC)
--------------
samples:          10000
evals/sample:     1

julia> a1 = rand(800);

julia> a2 = rand(800);

julia> out = similar(a1);

julia> @benchmark two_mul_sin_avx!(\$out, \$a1, \$a2)
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     417.286 ns (0.00% GC)
median time:      482.095 ns (0.00% GC)
mean time:        485.422 ns (0.00% GC)
maximum time:     820.759 ns (0.00% GC)
--------------
samples:          10000
evals/sample:     199

julia> @benchmark two_mul_sin!(\$out, \$a1, \$a2)
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     2.354 ΞΌs (0.00% GC)
median time:      2.448 ΞΌs (0.00% GC)
mean time:        2.491 ΞΌs (0.00% GC)
maximum time:     3.942 ΞΌs (0.00% GC)
--------------
samples:          10000
evals/sample:     9

julia> (2.354e-6, 354.765e-6) ./ (417.286e-9, 81.675e-6) # relative performance
(5.641214898175352, 4.343617998163452)

julia> versioninfo()
Julia Version 1.5.3-pre.0
Commit 5905edebb1 (2020-09-24 05:09 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, tigerlake)

Itβs about 5.6x faster with 800 elements, and 4.3 times faster with 80_000 on a tiger lake laptop.

With the latest release of VectorizationBase/LoopVectorization on most Linux distros, you wont see much interesting from @code_native because it will ccall libmvec.so for the SIMD sin implementation. Itβll probably switch to a Julia version eventually for all architectures (Linux with recent GLIBC is the exception).

8 Likes

Thank you @Elrod and @Oscar_Smith for the reply!

Iβm trying to use @avx in my application that uses structs to wrap vectors. The MWE is as follows.

abstract type VComponent{T} <: AbstractVector{T} end

Base.size(VC::VComponent) = Base.size(VC.v)

Base.iterate(VC::VComponent) = Base.iterate(VC.v)

Base.iterate(VC::VComponent, iter) = Base.iterate(VC.v, iter)

Base.iterate(VC::VComponent, iter, state) = Base.Iterate(VC.v, iter, state)

Base.@propagate_inbounds function Base.getindex(VC::VComponent, i::Int)
@boundscheck checkbounds(VC, i)
VC.v[i]
end

Base.@propagate_inbounds function Base.setindex!(
VC::VComponent,
v::T,
i::Int,
) where {T<:AbstractFloat}
@boundscheck checkbounds(VC, i)
VC.v[i] = v
end

# define the struct `IntParam` that implements the array interfaces
Base.@kwdef struct IntParam{T} <: VComponent{T}
v::Vector{T}
end

Benchmark code is here:

v1 = IntParam{Float64}(rand(80_000));

v2 = IntParam{Float64}(rand(80_000));

# does not vectorize
@benchmark two_mul_sin_avx!(\$out, \$v1, \$v2)

BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     557.309 ΞΌs (0.00% GC)
median time:      557.955 ΞΌs (0.00% GC)
mean time:        566.163 ΞΌs (0.00% GC)
maximum time:     623.568 ΞΌs (0.00% GC)
--------------
samples:          8826
evals/sample:     1

With the code from @Elrod, the result is as follows

@benchmark two_mul_sin_avx!(\$out, \$a1, \$a2)

BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     76.891 ΞΌs (0.00% GC)
median time:      78.154 ΞΌs (0.00% GC)
mean time:        78.733 ΞΌs (0.00% GC)
maximum time:     150.660 ΞΌs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

I suppose itβs because LoopVectorization cannot assume the data is stored in contiguous memory and thus have to call getindex each time to retrieve the next value.

The immediate fix is to write my code to access the v attribute directly (like the following). But that is very verbose in real applications.

function two_mul_sin_avx!(out, a1, a2)
@avx for i in eachindex(a1)
out[i] = sin(a1.v[i]) * a2.v[i]
end
end

I looked up in the documentation of LoopVectorization but find my use case does not fit within any of the examples (not exactly an array-of-struct type of problem). Any suggestions?

Thatβs a problem with the documentation. It uses check_args to check if the arguments are supported:

julia> v1 = IntParam{Float64}(rand(80_000));

julia> v2 = IntParam{Float64}(rand(80_000));

julia> out = similar(v1);

julia> LoopVectorization.check_args(v1,v2,out)
false

The next major release of LoopVectorization (which I hope to have out by the end of the year) will use ArrayInterface.jl for this, which will change the recommended way of adding support.

But for now, defining these two methods should work:

LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)
Base.pointer(v::VComponent) = pointer(v.v)

For example:

julia> v1 = IntParam{Float64}(rand(80_000));

julia> v2 = IntParam{Float64}(rand(80_000));

julia> out = similar(v1);

julia> @benchmark two_mul_sin_avx!(\$out, \$v1, \$v2)
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     477.467 ΞΌs (0.00% GC)
median time:      480.595 ΞΌs (0.00% GC)
mean time:        480.921 ΞΌs (0.00% GC)
maximum time:     519.928 ΞΌs (0.00% GC)
--------------
samples:          10000
evals/sample:     1

julia> LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)

julia> Base.pointer(v::VComponent) = pointer(v.v)

julia> @benchmark two_mul_sin_avx!(\$out, \$v1, \$v2)
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     81.493 ΞΌs (0.00% GC)
median time:      82.477 ΞΌs (0.00% GC)
mean time:        82.725 ΞΌs (0.00% GC)
maximum time:     152.587 ΞΌs (0.00% GC)
--------------
samples:          10000
evals/sample:     1
5 Likes

This is exactly what Iβm looking for! Thank you very much for your help. Iβm going to mark it as the solution and will watch for the new release of LoopVectorization.

3 Likes

With the latest version of LoopVectorization, check_args seems to have been deprecated.

I was able to get it working with

using VectorizationBase

@inline VectorizationBase.memory_reference(::VComponent{T}) where {T} = VectorizationBase.memory_reference(T)

Is this the correct new way of adding vectorization support for vector wrappers?

Iβll add an inherits_from_parent trait to ArrayInterface thatβll let you define 3 simple methods:

1. ArrayInterface.inherits_from_parent
2. Base.parent
3. ArrayInterface.parent_type

Currently, a few more methods are needed:

using ArrayInterface
struct ArrayWrapper{T,N,A <: AbstractArray{T,N}} <: AbstractArray{T,N}
data::A
end
ArrayInterface.parent_type(::Type{ArrayWrapper{T,N,A}}) where {T,N,A} = A
Base.parent(x::ArrayWrapper) = x.data
Base.unsafe_convert(::Type{Ptr{T}}, x::ArrayWrapper{T}) where {T} = Base.unsafe_convert(Ptr{T}, parent(x))
Base.size(x::ArrayWrapper) = size(parent(x))
Base.strides(x::ArrayWrapper) = strides(parent(x))

ArrayInterface.contiguous_axis(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.contiguous_axis(ArrayInterface.parent_type(A))
ArrayInterface.contiguous_batch_size(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.contiguous_batch_size(ArrayInterface.parent_type(A))
ArrayInterface.stride_rank(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.stride_rank(ArrayInterface.parent_type(A))

Note of course that you still need to implement the rest of the usual array interface, such as getindex. Although now:

julia> A = ArrayWrapper(rand(8,10));

julia> using LoopVectorization

julia> stridedpointer(A) # works
VectorizationBase.StridedPointer{Float64, 2, 1, 0, (1, 2), Tuple{ArrayInterface.StaticInt{8}, Int64}, Tuple{ArrayInterface.StaticInt{1}, ArrayInterface.StaticInt{1}}}(Ptr{Float64} @0x00007f0854fd8610, (Static(8), 64), (Static(1), Static(1)))

julia> LoopVectorization.check_args(A) # true
true
3 Likes

Hi @Elrod ,

Iβm trying to adapt my code for LoopVectorization 0.21. In a nutshell, my custom type wraps a Vector in the field named v.

The code below works with 0.20 but not with 0.21.

@inline VectorizationBase.memory_reference(::VComponent{T}) where {T} = VectorizationBase.memory_reference(T)

The error is

β `LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.

I tried to overload check_args that worked in an earlier version below, but the error is new

LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)
Base.pointer(v::VComponent) = pointer(v.v)

Error message:

nlsolve without Jac: Error During Test at /home/hacui/repos/Dianli.jl/test/test_pfsol.jl:19
Got exception outside of a @test
"Memory access for Dianli.BasicTypes.IntParam{Float64} not implemented yet."

MethodError: no method matching memory_reference(::Type{Float64})
Closest candidates are:
memory_reference(::LayoutPointers.AbstractStridedPointer) at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:50
memory_reference(::AbstractRange{T}) where T at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:180
memory_reference(::BitArray) at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:15

Any hints to fix it?