Trigonometric functions do not use SIMD - Solved: use LoopVectorization.jl

I noticed that computations involving trigonometric functions do not use SIMD. Is this inherent to trigonometric functions? I would like to learn more about the details.

My test code is as follows:

function two_mul(out, a1, a2)
    @inbounds @simd for i in eachindex(a1)
        out[i] = (a1[i]) * a2[i]  # no trig function here
    end
end

using BenchmarkTools  # provides @btime

a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);

@code_native two_mul(out, a1, a2)
@btime two_mul(out, a1, a2)

The output is as follows, and I can find the packed multiply instruction vmulpd in it:

	.text
; ┌ @ In[168]:1 within `two_mul'
	movq	%rsi, -8(%rsp)
	movq	8(%rsi), %rax
; │ @ In[168]:2 within `two_mul'
; │┌ @ simdloop.jl:69 within `macro expansion'
; ││┌ @ abstractarray.jl:212 within `eachindex'
; │││┌ @ abstractarray.jl:95 within `axes1'
; ││││┌ @ abstractarray.jl:75 within `axes'
; │││││┌ @ array.jl:155 within `size'
	movq	24(%rax), %rcx
; │││││└
; │││││┌ @ tuple.jl:157 within `map'
; ││││││┌ @ range.jl:326 within `OneTo' @ range.jl:317
; │││││││┌ @ promotion.jl:409 within `max'
	testq	%rcx, %rcx
; ││└└└└└└
; ││ @ simdloop.jl:72 within `macro expansion'
	jle	L279
	movq	%rcx, %rdx
	sarq	$63, %rdx
	andnq	%rcx, %rdx, %r8
	movq	(%rsi), %rdi
	movq	16(%rsi), %rdx
	movq	(%rax), %rcx
	movq	(%rdx), %rdx
	movq	(%rdi), %rsi
; ││ @ simdloop.jl:75 within `macro expansion'
	cmpq	$32, %r8
	jae	L63
	xorl	%edi, %edi
	jmp	L256
; ││ @ simdloop.jl:75 within `macro expansion'
L63:
	leaq	(%rsi,%r8,8), %rdi
	leaq	(%rcx,%r8,8), %rax
	leaq	(%rdx,%r8,8), %r9
	cmpq	%rax, %rsi
	setb	%r10b
	cmpq	%rdi, %rcx
	setb	%r11b
	andb	%r10b, %r11b
	cmpq	%r9, %rsi
	setb	%r9b
	cmpq	%rdi, %rdx
	setb	%al
	andb	%r9b, %al
	orb	%r11b, %al
	cmpb	$1, %al
	jne	L128
	movb	$1, %al
; ││ @ simdloop.jl:75 within `macro expansion'
	testb	%al, %al
	je	L128
	xorl	%edi, %edi
	jmp	L256
; ││ @ simdloop.jl:75 within `macro expansion'
L128:
	movl	%r8d, %eax
	andl	$31, %eax
	movq	%r8, %rdi
	subq	%rax, %rdi
	xorl	%eax, %eax
	nop
; ││ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; ││┌ @ array.jl:809 within `getindex'
L144:
	vmovupd	(%rcx,%rax,8), %zmm0
	vmovupd	64(%rcx,%rax,8), %zmm1
	vmovupd	128(%rcx,%rax,8), %zmm2
	vmovupd	192(%rcx,%rax,8), %zmm3
; ││└
; ││┌ @ float.jl:405 within `*'
	vmulpd	(%rdx,%rax,8), %zmm0, %zmm0
	vmulpd	64(%rdx,%rax,8), %zmm1, %zmm1
	vmulpd	128(%rdx,%rax,8), %zmm2, %zmm2
	vmulpd	192(%rdx,%rax,8), %zmm3, %zmm3
; ││└
; ││┌ @ array.jl:847 within `setindex!'
	vmovupd	%zmm0, (%rsi,%rax,8)
	vmovupd	%zmm1, 64(%rsi,%rax,8)
	vmovupd	%zmm2, 128(%rsi,%rax,8)
	vmovupd	%zmm3, 192(%rsi,%rax,8)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:86 within `+'
	addq	$32, %rax
	cmpq	%rax, %rdi
	jne	L144
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
	cmpq	%rdi, %r8
	je	L279
	nopl	(%rax,%rax)
; ││ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; ││┌ @ array.jl:809 within `getindex'
L256:
	vmovsd	(%rcx,%rdi,8), %xmm0    # xmm0 = mem[0],zero
; ││└
; ││┌ @ float.jl:405 within `*'
	vmulsd	(%rdx,%rdi,8), %xmm0, %xmm0
; ││└
; ││┌ @ array.jl:847 within `setindex!'
	vmovsd	%xmm0, (%rsi,%rdi,8)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:86 within `+'
	incq	%rdi
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:82 within `<'
	cmpq	%r8, %rdi
; ││└
	jb	L256
; └└
; ┌ @ simdloop.jl within `two_mul'
L279:
	movabsq	$jl_system_image_data, %rax
; └
; ┌ @ In[168]:2 within `two_mul'
	vzeroupper
	retq
; └
  68.392 ΞΌs (0 allocations: 0 bytes)

If I add a sin call, it uses the scalar vmulsd instead:

function two_mul_sin(out, a1, a2)
    @inbounds @simd for i in eachindex(a1)
        out[i] = sin(a1[i]) * a2[i]
    end
end

a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);

@code_native two_mul_sin(out, a1, a2)
@btime two_mul_sin(out, a1, a2)

Output:

	.text
; ┌ @ In[169]:1 within `two_mul_sin'
	pushq	%rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	pushq	%rax
	movq	%rsi, (%rsp)
	movq	8(%rsi), %r14
; │ @ In[169]:2 within `two_mul_sin'
; │┌ @ simdloop.jl:69 within `macro expansion'
; ││┌ @ abstractarray.jl:212 within `eachindex'
; │││┌ @ abstractarray.jl:95 within `axes1'
; ││││┌ @ abstractarray.jl:75 within `axes'
; │││││┌ @ array.jl:155 within `size'
	movq	24(%r14), %rax
; │││││└
; │││││┌ @ tuple.jl:157 within `map'
; ││││││┌ @ range.jl:326 within `OneTo' @ range.jl:317
; │││││││┌ @ promotion.jl:409 within `max'
	testq	%rax, %rax
; ││└└└└└└
; ││ @ simdloop.jl:72 within `macro expansion'
	jle	L100
	movq	%rax, %rcx
	sarq	$63, %rcx
	andnq	%rax, %rcx, %r15
	movq	(%rsi), %r12
	movq	16(%rsi), %r13
	xorl	%ebx, %ebx
	movabsq	$sin, %rbp
	nopl	(%rax,%rax)
; ││ @ simdloop.jl:77 within `macro expansion' @ In[169]:3
; ││┌ @ array.jl:809 within `getindex'
L64:
	movq	(%r14), %rax
	vmovsd	(%rax,%rbx,8), %xmm0    # xmm0 = mem[0],zero
; ││└
	callq	*%rbp
; ││┌ @ array.jl:809 within `getindex'
	movq	(%r13), %rax
; ││└
; ││┌ @ float.jl:405 within `*'
	vmulsd	(%rax,%rbx,8), %xmm0, %xmm0
; ││└
; ││┌ @ array.jl:847 within `setindex!'
	movq	(%r12), %rax
	vmovsd	%xmm0, (%rax,%rbx,8)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:86 within `+'
	incq	%rbx
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:82 within `<'
	cmpq	%r15, %rbx
; ││└
	jb	L64
; └└
; ┌ @ simdloop.jl within `two_mul_sin'
L100:
	movabsq	$jl_system_image_data, %rax
; └
; ┌ @ In[169]:2 within `two_mul_sin'
	addq	$8, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	retq
; └
  526.181 ΞΌs (0 allocations: 0 bytes)
versioninfo()

Julia Version 1.5.2
Commit 539f3ce943 (2020-09-23 23:17 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) W-2133 CPU @ 3.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, skylake-avx512)
Environment:
  JULIA_NUM_THREADS = 6

Currently, Julia's special functions are unable to vectorize automatically by themselves: sin is an ordinary scalar Julia function, so inside the loop body it is an opaque call, and LLVM will only vectorize a loop containing a call if it knows a SIMD variant of the callee (this is why the second @code_native listing falls back to vmulsd plus a callq to sin). That said, check out LoopVectorization: if you write @inbounds @avx for i in eachindex(a1), you should get everything to vectorize.
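By the way, an easier check than reading the full assembly is to look for LLVM vector types in the IR; vectorized loops show types like <8 x double> (the width depends on your CPU). A quick sketch (@code_llvm lives in InteractiveUtils, which the REPL loads for you):

@code_llvm debuginfo=:none two_mul(out, a1, a2)      # vector body: loads/stores of <8 x double>
@code_llvm debuginfo=:none two_mul_sin(out, a1, a2)  # scalar body: double, plus a call to sin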


Yes, that works:

julia> using LoopVectorization, BenchmarkTools

julia> a1 = rand(80_000);

julia> a2 = rand(80_000);

julia> out = similar(a1);

julia> function two_mul_sin!(out, a1, a2)
           @inbounds @simd for i in eachindex(a1)
               out[i] = sin(a1[i]) * a2[i]
           end
       end
two_mul_sin! (generic function with 1 method)

julia> function two_mul_sin_avx!(out, a1, a2)
           @avx for i in eachindex(a1)
               out[i] = sin(a1[i]) * a2[i]
           end
       end
two_mul_sin_avx! (generic function with 1 method)

julia> @benchmark two_mul_sin_avx!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     81.675 ΞΌs (0.00% GC)
  median time:      84.828 ΞΌs (0.00% GC)
  mean time:        87.356 ΞΌs (0.00% GC)
  maximum time:     173.757 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark two_mul_sin!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     354.765 ΞΌs (0.00% GC)
  median time:      372.535 ΞΌs (0.00% GC)
  mean time:        375.081 ΞΌs (0.00% GC)
  maximum time:     1.349 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> a1 = rand(800);

julia> a2 = rand(800);

julia> out = similar(a1);

julia> @benchmark two_mul_sin_avx!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     417.286 ns (0.00% GC)
  median time:      482.095 ns (0.00% GC)
  mean time:        485.422 ns (0.00% GC)
  maximum time:     820.759 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     199

julia> @benchmark two_mul_sin!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.354 ΞΌs (0.00% GC)
  median time:      2.448 ΞΌs (0.00% GC)
  mean time:        2.491 ΞΌs (0.00% GC)
  maximum time:     3.942 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> (2.354e-6, 354.765e-6) ./ (417.286e-9, 81.675e-6) # relative performance
(5.641214898175352, 4.343617998163452)


julia> versioninfo()
Julia Version 1.5.3-pre.0
Commit 5905edebb1 (2020-09-24 05:09 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, tigerlake)

It's about 5.6x faster with 800 elements and 4.3x faster with 80_000 on a Tiger Lake laptop.

With the latest release of VectorizationBase/LoopVectorization on most Linux distros, you won't see anything very interesting from @code_native, because it will ccall into libmvec.so for the SIMD sin implementation. It'll probably switch to a Julia implementation eventually for all architectures (Linux with a recent glibc is the exception).
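For the curious, you can call one of those libmvec kernels directly from Julia. A minimal sketch, assuming Linux with a glibc that ships libmvec; _ZGVdN4v_sin is glibc's vector-ABI name for the 4-wide (AVX2) double-precision sine:

const Vec4d = NTuple{4,Core.VecElement{Float64}}

# Call the 4-wide AVX2 sine kernel from libmvec (Linux/glibc only, requires AVX2).
vsin4(x::Vec4d) = ccall(("_ZGVdN4v_sin", "libmvec.so.1"), Vec4d, (Vec4d,), x)

vsin4(ntuple(i -> Core.VecElement(i * 0.1), 4))  # approximately (sin(0.1), ..., sin(0.4))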


Thank you @Elrod and @Oscar_Smith for the replies!

I'm trying to use @avx in my application, which uses structs to wrap vectors. The MWE is as follows.


abstract type VComponent{T} <: AbstractVector{T} end

Base.size(VC::VComponent) = Base.size(VC.v)

Base.iterate(VC::VComponent) = Base.iterate(VC.v)

Base.iterate(VC::VComponent, iter) = Base.iterate(VC.v, iter)

Base.iterate(VC::VComponent, iter, state) = Base.iterate(VC.v, iter, state)

Base.@propagate_inbounds function Base.getindex(VC::VComponent, i::Int)
    @boundscheck checkbounds(VC, i)
    VC.v[i]
end

Base.@propagate_inbounds function Base.setindex!(
    VC::VComponent,
    v::T,
    i::Int,
) where {T<:AbstractFloat}
    @boundscheck checkbounds(VC, i)
    VC.v[i] = v
end

# define the struct `IntParam` that implements the array interfaces
Base.@kwdef struct IntParam{T} <: VComponent{T}
    v::Vector{T}
end

The benchmark code is as follows:

v1 = IntParam{Float64}(rand(80_000));

v2 = IntParam{Float64}(rand(80_000));

# does not vectorize
@benchmark two_mul_sin_avx!($out, $v1, $v2)


BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     557.309 ΞΌs (0.00% GC)
  median time:      557.955 ΞΌs (0.00% GC)
  mean time:        566.163 ΞΌs (0.00% GC)
  maximum time:     623.568 ΞΌs (0.00% GC)
  --------------
  samples:          8826
  evals/sample:     1

For comparison, with the plain vectors from @Elrod's code, the result is as follows:


@benchmark two_mul_sin_avx!($out, $a1, $a2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     76.891 ΞΌs (0.00% GC)
  median time:      78.154 ΞΌs (0.00% GC)
  mean time:        78.733 ΞΌs (0.00% GC)
  maximum time:     150.660 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

I suppose it's because LoopVectorization cannot assume the data is stored in contiguous memory and thus has to call getindex each time to retrieve the next value.

The immediate fix is to rewrite my code to access the v field directly (like the following), but that is very verbose in real applications.

function two_mul_sin_avx!(out, a1, a2)
    @avx for i in eachindex(a1)
        out[i] = sin(a1.v[i]) * a2.v[i]
    end
end

I looked through the documentation of LoopVectorization but found that my use case does not fit any of the examples (it is not exactly an array-of-structs kind of problem). Any suggestions?

That's a problem with the documentation. LoopVectorization uses check_args to check whether the arguments are supported:

julia> v1 = IntParam{Float64}(rand(80_000));

julia> v2 = IntParam{Float64}(rand(80_000));

julia> out = similar(v1);

julia> LoopVectorization.check_args(v1,v2,out)
false

The next major release of LoopVectorization (which I hope to have out by the end of the year) will use ArrayInterface.jl for this, which will change the recommended way of adding support.

But for now, defining these two methods should work:

LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)
Base.pointer(v::VComponent) = pointer(v.v)

For example:

julia> v1 = IntParam{Float64}(rand(80_000));

julia> v2 = IntParam{Float64}(rand(80_000));

julia> out = similar(v1);

julia> @benchmark two_mul_sin_avx!($out, $v1, $v2)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     477.467 ΞΌs (0.00% GC)
  median time:      480.595 ΞΌs (0.00% GC)
  mean time:        480.921 ΞΌs (0.00% GC)
  maximum time:     519.928 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)

julia> Base.pointer(v::VComponent) = pointer(v.v)

julia> @benchmark two_mul_sin_avx!($out, $v1, $v2)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     81.493 ΞΌs (0.00% GC)
  median time:      82.477 ΞΌs (0.00% GC)
  mean time:        82.725 ΞΌs (0.00% GC)
  maximum time:     152.587 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

This is exactly what I'm looking for! Thank you very much for your help. I'm going to mark it as the solution and will watch for the new release of LoopVectorization.


With the latest version of LoopVectorization, check_args seems to have been deprecated.

I was able to get it working with

using VectorizationBase

@inline VectorizationBase.memory_reference(::VComponent{T}) where {T} = VectorizationBase.memory_reference(T)

Is this the correct new way of adding vectorization support for vector wrappers?

I'll add an inherits_from_parent trait to ArrayInterface that'll let you define 3 simple methods:

  1. ArrayInterface.inherits_from_parent
  2. Base.parent
  3. ArrayInterface.parent_type

which should add LoopVectorization support.
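In sketch form, applied to the IntParam example above (hypothetical: inherits_from_parent does not exist yet, so this only illustrates what the opt-in might look like):

# Hypothetical future opt-in; `inherits_from_parent` is not a real trait yet.
ArrayInterface.inherits_from_parent(::Type{<:VComponent}) = true
Base.parent(VC::VComponent) = VC.v
ArrayInterface.parent_type(::Type{IntParam{T}}) where {T} = Vector{T}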
Currently, a few more methods are needed:

using ArrayInterface
struct ArrayWrapper{T,N,A <: AbstractArray{T,N}} <: AbstractArray{T,N}
    data::A
end
# Tell ArrayInterface how to reach the parent array and its type:
ArrayInterface.parent_type(::Type{ArrayWrapper{T,N,A}}) where {T,N,A} = A
Base.parent(x::ArrayWrapper) = x.data
# Forward pointer, size, and strides to the parent:
Base.unsafe_convert(::Type{Ptr{T}}, x::ArrayWrapper{T}) where {T} = Base.unsafe_convert(Ptr{T}, parent(x))
Base.size(x::ArrayWrapper) = size(parent(x))
Base.strides(x::ArrayWrapper) = strides(parent(x))

# Forward the memory-layout traits (contiguity, batch size, stride rank) to the parent type:
ArrayInterface.contiguous_axis(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.contiguous_axis(ArrayInterface.parent_type(A))
ArrayInterface.contiguous_batch_size(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.contiguous_batch_size(ArrayInterface.parent_type(A))
ArrayInterface.stride_rank(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.stride_rank(ArrayInterface.parent_type(A))

Note of course that you still need to implement the rest of the usual array interface, such as getindex.
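For instance, a minimal sketch that forwards linear indexing to the parent (these are not used by @avx itself, which works through pointers, but the wrapper needs them to behave like a normal array):

Base.IndexStyle(::Type{<:ArrayWrapper}) = IndexLinear()
Base.@propagate_inbounds Base.getindex(x::ArrayWrapper, i::Int) = parent(x)[i]
Base.@propagate_inbounds Base.setindex!(x::ArrayWrapper, v, i::Int) = (parent(x)[i] = v)

Although now: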

julia> A = ArrayWrapper(rand(8,10));

julia> using LoopVectorization

julia> stridedpointer(A) # works
VectorizationBase.StridedPointer{Float64, 2, 1, 0, (1, 2), Tuple{ArrayInterface.StaticInt{8}, Int64}, Tuple{ArrayInterface.StaticInt{1}, ArrayInterface.StaticInt{1}}}(Ptr{Float64} @0x00007f0854fd8610, (Static(8), 64), (Static(1), Static(1)))

julia> LoopVectorization.check_args(A) # true
true

Hi @Elrod,

I'm trying to adapt my code for LoopVectorization 0.21. In a nutshell, my custom type wraps a Vector in a field named v.

The code below works with 0.20 but not with 0.21.

@inline VectorizationBase.memory_reference(::VComponent{T}) where {T} = VectorizationBase.memory_reference(T)

The error is:

│ `LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.

I tried the check_args overload below, which worked in an earlier version, but I get a new error.


LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)
Base.pointer(v::VComponent) = pointer(v.v)

Error message:

nlsolve without Jac: Error During Test at /home/hacui/repos/Dianli.jl/test/test_pfsol.jl:19
  Got exception outside of a @test
  "Memory access for Dianli.BasicTypes.IntParam{Float64} not implemented yet."

If the memory_reference overload is added back, the error is the following:

  MethodError: no method matching memory_reference(::Type{Float64})
  Closest candidates are:
    memory_reference(::LayoutPointers.AbstractStridedPointer) at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:50
    memory_reference(::AbstractRange{T}) where T at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:180
    memory_reference(::BitArray) at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:15

Any hints to fix it?

Thanks in advance!

Assuming parent is defined:

memory_reference(v::VComponent) = memory_reference(parent(v))
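Putting the pieces together for the VComponent example, a minimal sketch (assuming the wrapper stores its data in the field v, as above; memory_reference is defined in LayoutPointers, which is what the VectorizationBase name resolved to in the method error):

# Give LayoutPointers/VectorizationBase a way to reach the wrapped Vector's memory.
Base.parent(VC::VComponent) = VC.v
@inline VectorizationBase.memory_reference(v::VComponent) = VectorizationBase.memory_reference(parent(v))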