Trigonometric functions do not use SIMD - Solved: use LoopVectorization.jl

I noticed that computations involving trigonometric functions do not use SIMD. Is this inherent to trigonometric functions? I would like to learn more about the details.

My test code is as follows:

function two_mul(out, a1, a2)
    @inbounds @simd for i in eachindex(a1)
        out[i] = (a1[i]) * a2[i]  # no trig function here
    end
end

using BenchmarkTools  # provides @btime

a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);

@code_native two_mul(out, a1, a2)
@btime two_mul(out, a1, a2)

The output is as follows, and I can find the packed multiply instruction vmulpd in it:

	.text
; ┌ @ In[168]:1 within `two_mul'
	movq	%rsi, -8(%rsp)
	movq	8(%rsi), %rax
; │ @ In[168]:2 within `two_mul'
; │┌ @ simdloop.jl:69 within `macro expansion'
; ││┌ @ abstractarray.jl:212 within `eachindex'
; │││┌ @ abstractarray.jl:95 within `axes1'
; ││││┌ @ abstractarray.jl:75 within `axes'
; │││││┌ @ array.jl:155 within `size'
	movq	24(%rax), %rcx
; │││││└
; │││││┌ @ tuple.jl:157 within `map'
; ││││││┌ @ range.jl:326 within `OneTo' @ range.jl:317
; │││││││┌ @ promotion.jl:409 within `max'
	testq	%rcx, %rcx
; ││└└└└└└
; ││ @ simdloop.jl:72 within `macro expansion'
	jle	L279
	movq	%rcx, %rdx
	sarq	$63, %rdx
	andnq	%rcx, %rdx, %r8
	movq	(%rsi), %rdi
	movq	16(%rsi), %rdx
	movq	(%rax), %rcx
	movq	(%rdx), %rdx
	movq	(%rdi), %rsi
; ││ @ simdloop.jl:75 within `macro expansion'
	cmpq	$32, %r8
	jae	L63
	xorl	%edi, %edi
	jmp	L256
; ││ @ simdloop.jl:75 within `macro expansion'
L63:
	leaq	(%rsi,%r8,8), %rdi
	leaq	(%rcx,%r8,8), %rax
	leaq	(%rdx,%r8,8), %r9
	cmpq	%rax, %rsi
	setb	%r10b
	cmpq	%rdi, %rcx
	setb	%r11b
	andb	%r10b, %r11b
	cmpq	%r9, %rsi
	setb	%r9b
	cmpq	%rdi, %rdx
	setb	%al
	andb	%r9b, %al
	orb	%r11b, %al
	cmpb	$1, %al
	jne	L128
	movb	$1, %al
; ││ @ simdloop.jl:75 within `macro expansion'
	testb	%al, %al
	je	L128
	xorl	%edi, %edi
	jmp	L256
; ││ @ simdloop.jl:75 within `macro expansion'
L128:
	movl	%r8d, %eax
	andl	$31, %eax
	movq	%r8, %rdi
	subq	%rax, %rdi
	xorl	%eax, %eax
	nop
; ││ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; ││┌ @ array.jl:809 within `getindex'
L144:
	vmovupd	(%rcx,%rax,8), %zmm0
	vmovupd	64(%rcx,%rax,8), %zmm1
	vmovupd	128(%rcx,%rax,8), %zmm2
	vmovupd	192(%rcx,%rax,8), %zmm3
; ││└
; ││┌ @ float.jl:405 within `*'
	vmulpd	(%rdx,%rax,8), %zmm0, %zmm0
	vmulpd	64(%rdx,%rax,8), %zmm1, %zmm1
	vmulpd	128(%rdx,%rax,8), %zmm2, %zmm2
	vmulpd	192(%rdx,%rax,8), %zmm3, %zmm3
; ││└
; ││┌ @ array.jl:847 within `setindex!'
	vmovupd	%zmm0, (%rsi,%rax,8)
	vmovupd	%zmm1, 64(%rsi,%rax,8)
	vmovupd	%zmm2, 128(%rsi,%rax,8)
	vmovupd	%zmm3, 192(%rsi,%rax,8)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:86 within `+'
	addq	$32, %rax
	cmpq	%rax, %rdi
	jne	L144
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
	cmpq	%rdi, %r8
	je	L279
	nopl	(%rax,%rax)
; ││ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; ││┌ @ array.jl:809 within `getindex'
L256:
	vmovsd	(%rcx,%rdi,8), %xmm0    # xmm0 = mem[0],zero
; ││└
; ││┌ @ float.jl:405 within `*'
	vmulsd	(%rdx,%rdi,8), %xmm0, %xmm0
; ││└
; ││┌ @ array.jl:847 within `setindex!'
	vmovsd	%xmm0, (%rsi,%rdi,8)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:86 within `+'
	incq	%rdi
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:82 within `<'
	cmpq	%r8, %rdi
; ││└
	jb	L256
; └└
; ┌ @ simdloop.jl within `two_mul'
L279:
	movabsq	$jl_system_image_data, %rax
; └
; ┌ @ In[168]:2 within `two_mul'
	vzeroupper
	retq
; └
  68.392 ΞΌs (0 allocations: 0 bytes)

If I add a sin call, it uses the scalar vmulsd instead:

function two_mul_sin(out, a1, a2)
    @inbounds @simd for i in eachindex(a1)
        out[i] = sin(a1[i]) * a2[i]
    end
end

a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);

@code_native two_mul_sin(out, a1, a2)
@btime two_mul_sin(out, a1, a2)

Output:

	.text
; ┌ @ In[169]:1 within `two_mul_sin'
	pushq	%rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	pushq	%rax
	movq	%rsi, (%rsp)
	movq	8(%rsi), %r14
; │ @ In[169]:2 within `two_mul_sin'
; │┌ @ simdloop.jl:69 within `macro expansion'
; ││┌ @ abstractarray.jl:212 within `eachindex'
; │││┌ @ abstractarray.jl:95 within `axes1'
; ││││┌ @ abstractarray.jl:75 within `axes'
; │││││┌ @ array.jl:155 within `size'
	movq	24(%r14), %rax
; │││││└
; │││││┌ @ tuple.jl:157 within `map'
; ││││││┌ @ range.jl:326 within `OneTo' @ range.jl:317
; │││││││┌ @ promotion.jl:409 within `max'
	testq	%rax, %rax
; ││└└└└└└
; ││ @ simdloop.jl:72 within `macro expansion'
	jle	L100
	movq	%rax, %rcx
	sarq	$63, %rcx
	andnq	%rax, %rcx, %r15
	movq	(%rsi), %r12
	movq	16(%rsi), %r13
	xorl	%ebx, %ebx
	movabsq	$sin, %rbp
	nopl	(%rax,%rax)
; ││ @ simdloop.jl:77 within `macro expansion' @ In[169]:3
; ││┌ @ array.jl:809 within `getindex'
L64:
	movq	(%r14), %rax
	vmovsd	(%rax,%rbx,8), %xmm0    # xmm0 = mem[0],zero
; ││└
	callq	*%rbp
; ││┌ @ array.jl:809 within `getindex'
	movq	(%r13), %rax
; ││└
; ││┌ @ float.jl:405 within `*'
	vmulsd	(%rax,%rbx,8), %xmm0, %xmm0
; ││└
; ││┌ @ array.jl:847 within `setindex!'
	movq	(%r12), %rax
	vmovsd	%xmm0, (%rax,%rbx,8)
; ││└
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:86 within `+'
	incq	%rbx
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:82 within `<'
	cmpq	%r15, %rbx
; ││└
	jb	L64
; └└
; ┌ @ simdloop.jl within `two_mul_sin'
L100:
	movabsq	$jl_system_image_data, %rax
; └
; ┌ @ In[169]:2 within `two_mul_sin'
	addq	$8, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	retq
; └
  526.181 ΞΌs (0 allocations: 0 bytes)
versioninfo()

Julia Version 1.5.2
Commit 539f3ce943 (2020-09-23 23:17 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) W-2133 CPU @ 3.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, skylake-avx512)
Environment:
  JULIA_NUM_THREADS = 6

Currently, Julia's special functions are unable to vectorize automatically by themselves: sin is an ordinary scalar Julia function, so inside the loop body it is an opaque call, and LLVM will only vectorize a loop containing a call if it knows a SIMD variant of the callee (this is why the second @code_native listing falls back to vmulsd plus a callq to sin). That said, check out LoopVectorization: if you write @inbounds @avx for i in eachindex(a1), you should get everything to vectorize.
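By the way, an easier check than reading the full assembly is to look for LLVM vector types in the IR; vectorized loops show types like <8 x double> (the width depends on your CPU). A quick sketch (@code_llvm lives in InteractiveUtils, which the REPL loads for you):

@code_llvm debuginfo=:none two_mul(out, a1, a2)      # vector body: loads/stores of <8 x double>
@code_llvm debuginfo=:none two_mul_sin(out, a1, a2)  # scalar body: double, plus a call to sin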


Yes, that works:

julia> using LoopVectorization, BenchmarkTools

julia> a1 = rand(80_000);

julia> a2 = rand(80_000);

julia> out = similar(a1);

julia> function two_mul_sin!(out, a1, a2)
           @inbounds @simd for i in eachindex(a1)
               out[i] = sin(a1[i]) * a2[i]
           end
       end
two_mul_sin! (generic function with 1 method)

julia> function two_mul_sin_avx!(out, a1, a2)
           @avx for i in eachindex(a1)
               out[i] = sin(a1[i]) * a2[i]
           end
       end
two_mul_sin_avx! (generic function with 1 method)

julia> @benchmark two_mul_sin_avx!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     81.675 ΞΌs (0.00% GC)
  median time:      84.828 ΞΌs (0.00% GC)
  mean time:        87.356 ΞΌs (0.00% GC)
  maximum time:     173.757 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark two_mul_sin!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     354.765 ΞΌs (0.00% GC)
  median time:      372.535 ΞΌs (0.00% GC)
  mean time:        375.081 ΞΌs (0.00% GC)
  maximum time:     1.349 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> a1 = rand(800);

julia> a2 = rand(800);

julia> out = similar(a1);

julia> @benchmark two_mul_sin_avx!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     417.286 ns (0.00% GC)
  median time:      482.095 ns (0.00% GC)
  mean time:        485.422 ns (0.00% GC)
  maximum time:     820.759 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     199

julia> @benchmark two_mul_sin!($out, $a1, $a2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.354 ΞΌs (0.00% GC)
  median time:      2.448 ΞΌs (0.00% GC)
  mean time:        2.491 ΞΌs (0.00% GC)
  maximum time:     3.942 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> (2.354e-6, 354.765e-6) ./ (417.286e-9, 81.675e-6) # relative performance
(5.641214898175352, 4.343617998163452)


julia> versioninfo()
Julia Version 1.5.3-pre.0
Commit 5905edebb1 (2020-09-24 05:09 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, tigerlake)

It's about 5.6x faster with 800 elements and 4.3x faster with 80_000 on a Tiger Lake laptop.

With the latest release of VectorizationBase/LoopVectorization on most Linux distros, you won't see anything very interesting from @code_native, because it will ccall into libmvec.so for the SIMD sin implementation. It'll probably switch to a Julia implementation eventually for all architectures (Linux with a recent glibc is the exception).
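For the curious, you can call one of those libmvec kernels directly from Julia. A minimal sketch, assuming Linux with a glibc that ships libmvec; _ZGVdN4v_sin is glibc's vector-ABI name for the 4-wide (AVX2) double-precision sine:

const Vec4d = NTuple{4,Core.VecElement{Float64}}

# Call the 4-wide AVX2 sine kernel from libmvec (Linux/glibc only, requires AVX2).
vsin4(x::Vec4d) = ccall(("_ZGVdN4v_sin", "libmvec.so.1"), Vec4d, (Vec4d,), x)

vsin4(ntuple(i -> Core.VecElement(i * 0.1), 4))  # approximately (sin(0.1), ..., sin(0.4))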


Thank you @Elrod and @Oscar_Smith for the replies!

I'm trying to use @avx in my application, which uses structs to wrap vectors. The MWE is as follows.


abstract type VComponent{T} <: AbstractVector{T} end

Base.size(VC::VComponent) = Base.size(VC.v)

Base.iterate(VC::VComponent) = Base.iterate(VC.v)

Base.iterate(VC::VComponent, iter) = Base.iterate(VC.v, iter)

Base.iterate(VC::VComponent, iter, state) = Base.iterate(VC.v, iter, state)

Base.@propagate_inbounds function Base.getindex(VC::VComponent, i::Int)
    @boundscheck checkbounds(VC, i)
    VC.v[i]
end

Base.@propagate_inbounds function Base.setindex!(
    VC::VComponent,
    v::T,
    i::Int,
) where {T<:AbstractFloat}
    @boundscheck checkbounds(VC, i)
    VC.v[i] = v
end

# define the struct `IntParam` that implements the array interfaces
Base.@kwdef struct IntParam{T} <: VComponent{T}
    v::Vector{T}
end

The benchmark code is as follows:

v1 = IntParam{Float64}(rand(80_000));

v2 = IntParam{Float64}(rand(80_000));

# does not vectorize
@benchmark two_mul_sin_avx!($out, $v1, $v2)


BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     557.309 ΞΌs (0.00% GC)
  median time:      557.955 ΞΌs (0.00% GC)
  mean time:        566.163 ΞΌs (0.00% GC)
  maximum time:     623.568 ΞΌs (0.00% GC)
  --------------
  samples:          8826
  evals/sample:     1

For comparison, with the plain vectors from @Elrod's code, the result is as follows:


@benchmark two_mul_sin_avx!($out, $a1, $a2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     76.891 ΞΌs (0.00% GC)
  median time:      78.154 ΞΌs (0.00% GC)
  mean time:        78.733 ΞΌs (0.00% GC)
  maximum time:     150.660 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

I suppose it's because LoopVectorization cannot assume the data is stored in contiguous memory and thus has to call getindex each time to retrieve the next value.

The immediate fix is to rewrite my code to access the v field directly (like the following), but that is very verbose in real applications.

function two_mul_sin_avx!(out, a1, a2)
    @avx for i in eachindex(a1)
        out[i] = sin(a1.v[i]) * a2.v[i]
    end
end

I looked through the documentation of LoopVectorization but found that my use case does not fit any of the examples (it is not exactly an array-of-structs kind of problem). Any suggestions?

That's a problem with the documentation. LoopVectorization uses check_args to check whether the arguments are supported:

julia> v1 = IntParam{Float64}(rand(80_000));

julia> v2 = IntParam{Float64}(rand(80_000));

julia> out = similar(v1);

julia> LoopVectorization.check_args(v1,v2,out)
false

The next major release of LoopVectorization (which I hope to have out by the end of the year) will use ArrayInterface.jl for this, which will change the recommended way of adding support.

But for now, defining these two methods should work:

LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)
Base.pointer(v::VComponent) = pointer(v.v)

For example:

julia> v1 = IntParam{Float64}(rand(80_000));

julia> v2 = IntParam{Float64}(rand(80_000));

julia> out = similar(v1);

julia> @benchmark two_mul_sin_avx!($out, $v1, $v2)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     477.467 ΞΌs (0.00% GC)
  median time:      480.595 ΞΌs (0.00% GC)
  mean time:        480.921 ΞΌs (0.00% GC)
  maximum time:     519.928 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)

julia> Base.pointer(v::VComponent) = pointer(v.v)

julia> @benchmark two_mul_sin_avx!($out, $v1, $v2)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     81.493 ΞΌs (0.00% GC)
  median time:      82.477 ΞΌs (0.00% GC)
  mean time:        82.725 ΞΌs (0.00% GC)
  maximum time:     152.587 ΞΌs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

This is exactly what I'm looking for! Thank you very much for your help. I'm going to mark it as the solution and will watch for the new release of LoopVectorization.


With the latest version of LoopVectorization, check_args seems to have been deprecated.

I was able to get it working with

using VectorizationBase

@inline VectorizationBase.memory_reference(::VComponent{T}) where {T} = VectorizationBase.memory_reference(T)

Is this the correct new way of adding vectorization support for vector wrappers?

I'll add an inherits_from_parent trait to ArrayInterface that'll let you define 3 simple methods:

  1. ArrayInterface.inherits_from_parent
  2. Base.parent
  3. ArrayInterface.parent_type

which should add LoopVectorization support.
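In sketch form, applied to the IntParam example above (hypothetical: inherits_from_parent does not exist yet, so this only illustrates what the opt-in might look like):

# Hypothetical future opt-in; `inherits_from_parent` is not a real trait yet.
ArrayInterface.inherits_from_parent(::Type{<:VComponent}) = true
Base.parent(VC::VComponent) = VC.v
ArrayInterface.parent_type(::Type{IntParam{T}}) where {T} = Vector{T}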
Currently, a few more methods are needed:

using ArrayInterface
struct ArrayWrapper{T,N,A <: AbstractArray{T,N}} <: AbstractArray{T,N}
    data::A
end
# Tell ArrayInterface how to reach the parent array and its type:
ArrayInterface.parent_type(::Type{ArrayWrapper{T,N,A}}) where {T,N,A} = A
Base.parent(x::ArrayWrapper) = x.data
# Forward pointer, size, and strides to the parent:
Base.unsafe_convert(::Type{Ptr{T}}, x::ArrayWrapper{T}) where {T} = Base.unsafe_convert(Ptr{T}, parent(x))
Base.size(x::ArrayWrapper) = size(parent(x))
Base.strides(x::ArrayWrapper) = strides(parent(x))

# Forward the memory-layout traits (contiguity, batch size, stride rank) to the parent type:
ArrayInterface.contiguous_axis(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.contiguous_axis(ArrayInterface.parent_type(A))
ArrayInterface.contiguous_batch_size(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.contiguous_batch_size(ArrayInterface.parent_type(A))
ArrayInterface.stride_rank(::Type{A}) where {A <: ArrayWrapper} = ArrayInterface.stride_rank(ArrayInterface.parent_type(A))

Note of course that you still need to implement the rest of the usual array interface, such as getindex.
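For instance, a minimal sketch that forwards linear indexing to the parent (these are not used by @avx itself, which works through pointers, but the wrapper needs them to behave like a normal array):

Base.IndexStyle(::Type{<:ArrayWrapper}) = IndexLinear()
Base.@propagate_inbounds Base.getindex(x::ArrayWrapper, i::Int) = parent(x)[i]
Base.@propagate_inbounds Base.setindex!(x::ArrayWrapper, v, i::Int) = (parent(x)[i] = v)

Although now: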

julia> A = ArrayWrapper(rand(8,10));

julia> using LoopVectorization

julia> stridedpointer(A) # works
VectorizationBase.StridedPointer{Float64, 2, 1, 0, (1, 2), Tuple{ArrayInterface.StaticInt{8}, Int64}, Tuple{ArrayInterface.StaticInt{1}, ArrayInterface.StaticInt{1}}}(Ptr{Float64} @0x00007f0854fd8610, (Static(8), 64), (Static(1), Static(1)))

julia> LoopVectorization.check_args(A) # true
true

Hi @Elrod,

I'm trying to adapt my code for LoopVectorization 0.21. In a nutshell, my custom type wraps a Vector in a field named v.

The code below works with 0.20 but not with 0.21.

@inline VectorizationBase.memory_reference(::VComponent{T}) where {T} = VectorizationBase.memory_reference(T)

The error is:

│ `LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.

I tried the check_args overload below, which worked in an earlier version, but I get a new error.


LoopVectorization.check_args(::VComponent{T}) where {T} = LoopVectorization.check_type(T)
Base.pointer(v::VComponent) = pointer(v.v)

Error message:

nlsolve without Jac: Error During Test at /home/hacui/repos/Dianli.jl/test/test_pfsol.jl:19
  Got exception outside of a @test
  "Memory access for Dianli.BasicTypes.IntParam{Float64} not implemented yet."

If the memory_reference overload is added back, the error is the following:

  MethodError: no method matching memory_reference(::Type{Float64})
  Closest candidates are:
    memory_reference(::LayoutPointers.AbstractStridedPointer) at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:50
    memory_reference(::AbstractRange{T}) where T at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:180
    memory_reference(::BitArray) at ~/.julia/packages/LayoutPointers/fHXhH/src/stridedpointers.jl:15

Any hints to fix it?

Thanks in advance!

Assuming parent is defined:

memory_reference(v::VComponent) = memory_reference(parent(v))
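Putting the pieces together for the VComponent example, a minimal sketch (assuming the wrapper stores its data in the field v, as above; memory_reference is defined in LayoutPointers, which is what the VectorizationBase name resolved to in the method error):

# Give LayoutPointers/VectorizationBase a way to reach the wrapped Vector's memory.
Base.parent(VC::VComponent) = VC.v
@inline VectorizationBase.memory_reference(v::VComponent) = VectorizationBase.memory_reference(parent(v))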