I noticed that loops calling trigonometric functions are not vectorized with SIMD instructions. Is this inherent to trigonometric functions? I would like to understand the details.
My test code is as follows:
"""
    two_mul(out, a1, a2)

Store the element-wise product `a1[i] * a2[i]` in `out[i]` for every index and
return `nothing`.

`out`, `a1` and `a2` must share the same indices; otherwise `eachindex` throws
a `DimensionMismatch` before the loop starts.
"""
function two_mul(out, a1, a2)
    # Iterate over the indices common to all three arrays. `@inbounds` disables
    # bounds checking, so looping over `eachindex(a1)` alone would read or write
    # out of bounds whenever `out` or `a2` is shorter than `a1`.
    @inbounds @simd for i in eachindex(out, a1, a2)
        out[i] = a1[i] * a2[i]  # no trig function here
    end
    return nothing
end
# Benchmark inputs: two random vectors of 80_000 elements and an output buffer
# with the same element type and shape as `a1`.
a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);
# Print the native code generated for `two_mul` with these argument types.
@code_native two_mul(out, a1, a2)
# `@btime` comes from BenchmarkTools (assumed to be loaded earlier in the session).
# Interpolate the non-constant globals with `$` so the measurement covers only
# the call itself and not the overhead of accessing untyped global variables.
@btime two_mul($out, $a1, $a2)
This outputs the following, in which I can see that the packed (SIMD) instruction `vmulpd` is used:
.text
; β @ In[168]:1 within `two_mul'
movq %rsi, -8(%rsp)
movq 8(%rsi), %rax
; β @ In[168]:2 within `two_mul'
; ββ @ simdloop.jl:69 within `macro expansion'
; βββ @ abstractarray.jl:212 within `eachindex'
; ββββ @ abstractarray.jl:95 within `axes1'
; βββββ @ abstractarray.jl:75 within `axes'
; ββββββ @ array.jl:155 within `size'
movq 24(%rax), %rcx
; ββββββ
; ββββββ @ tuple.jl:157 within `map'
; βββββββ @ range.jl:326 within `OneTo' @ range.jl:317
; ββββββββ @ promotion.jl:409 within `max'
testq %rcx, %rcx
; ββββββββ
; ββ @ simdloop.jl:72 within `macro expansion'
jle L279
movq %rcx, %rdx
sarq $63, %rdx
andnq %rcx, %rdx, %r8
movq (%rsi), %rdi
movq 16(%rsi), %rdx
movq (%rax), %rcx
movq (%rdx), %rdx
movq (%rdi), %rsi
; ββ @ simdloop.jl:75 within `macro expansion'
cmpq $32, %r8
jae L63
xorl %edi, %edi
jmp L256
; ββ @ simdloop.jl:75 within `macro expansion'
L63:
leaq (%rsi,%r8,8), %rdi
leaq (%rcx,%r8,8), %rax
leaq (%rdx,%r8,8), %r9
cmpq %rax, %rsi
setb %r10b
cmpq %rdi, %rcx
setb %r11b
andb %r10b, %r11b
cmpq %r9, %rsi
setb %r9b
cmpq %rdi, %rdx
setb %al
andb %r9b, %al
orb %r11b, %al
cmpb $1, %al
jne L128
movb $1, %al
; ββ @ simdloop.jl:75 within `macro expansion'
testb %al, %al
je L128
xorl %edi, %edi
jmp L256
; ββ @ simdloop.jl:75 within `macro expansion'
L128:
movl %r8d, %eax
andl $31, %eax
movq %r8, %rdi
subq %rax, %rdi
xorl %eax, %eax
nop
; ββ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; βββ @ array.jl:809 within `getindex'
L144:
vmovupd (%rcx,%rax,8), %zmm0
vmovupd 64(%rcx,%rax,8), %zmm1
vmovupd 128(%rcx,%rax,8), %zmm2
vmovupd 192(%rcx,%rax,8), %zmm3
; βββ
; βββ @ float.jl:405 within `*'
vmulpd (%rdx,%rax,8), %zmm0, %zmm0
vmulpd 64(%rdx,%rax,8), %zmm1, %zmm1
vmulpd 128(%rdx,%rax,8), %zmm2, %zmm2
vmulpd 192(%rdx,%rax,8), %zmm3, %zmm3
; βββ
; βββ @ array.jl:847 within `setindex!'
vmovupd %zmm0, (%rsi,%rax,8)
vmovupd %zmm1, 64(%rsi,%rax,8)
vmovupd %zmm2, 128(%rsi,%rax,8)
vmovupd %zmm3, 192(%rsi,%rax,8)
; βββ
; ββ @ simdloop.jl:78 within `macro expansion'
; βββ @ int.jl:86 within `+'
addq $32, %rax
cmpq %rax, %rdi
jne L144
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
cmpq %rdi, %r8
je L279
nopl (%rax,%rax)
; ββ @ simdloop.jl:77 within `macro expansion' @ In[168]:3
; βββ @ array.jl:809 within `getindex'
L256:
vmovsd (%rcx,%rdi,8), %xmm0 # xmm0 = mem[0],zero
; βββ
; βββ @ float.jl:405 within `*'
vmulsd (%rdx,%rdi,8), %xmm0, %xmm0
; βββ
; βββ @ array.jl:847 within `setindex!'
vmovsd %xmm0, (%rsi,%rdi,8)
; βββ
; ββ @ simdloop.jl:78 within `macro expansion'
; βββ @ int.jl:86 within `+'
incq %rdi
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
; βββ @ int.jl:82 within `<'
cmpq %r8, %rdi
; βββ
jb L256
; ββ
; β @ simdloop.jl within `two_mul'
L279:
movabsq $jl_system_image_data, %rax
; β
; β @ In[168]:2 within `two_mul'
vzeroupper
retq
; β
68.392 μs (0 allocations: 0 bytes)
If I add a call to `sin`, the loop uses the scalar instruction `vmulsd` instead:
"""
    two_mul_sin(out, a1, a2)

Store `sin(a1[i]) * a2[i]` in `out[i]` for every index and return `nothing`.

`out`, `a1` and `a2` must share the same indices; otherwise `eachindex` throws
a `DimensionMismatch` before the loop starts.
"""
function two_mul_sin(out, a1, a2)
    # Iterate over the indices common to all three arrays. `@inbounds` disables
    # bounds checking, so looping over `eachindex(a1)` alone would read or write
    # out of bounds whenever `out` or `a2` is shorter than `a1`.
    @inbounds @simd for i in eachindex(out, a1, a2)
        out[i] = sin(a1[i]) * a2[i]
    end
    return nothing
end
# Benchmark inputs: two random vectors of 80_000 elements and an output buffer
# with the same element type and shape as `a1`.
a1 = rand(80_000)
a2 = rand(80_000)
out = similar(a1);
# Print the native code generated for `two_mul_sin` with these argument types.
@code_native two_mul_sin(out, a1, a2)
# `@btime` comes from BenchmarkTools (assumed to be loaded earlier in the session).
# Interpolate the non-constant globals with `$` so the measurement covers only
# the call itself and not the overhead of accessing untyped global variables.
@btime two_mul_sin($out, $a1, $a2)
Outputs:
.text
; β @ In[169]:1 within `two_mul_sin'
pushq %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
pushq %rax
movq %rsi, (%rsp)
movq 8(%rsi), %r14
; β @ In[169]:2 within `two_mul_sin'
; ββ @ simdloop.jl:69 within `macro expansion'
; βββ @ abstractarray.jl:212 within `eachindex'
; ββββ @ abstractarray.jl:95 within `axes1'
; βββββ @ abstractarray.jl:75 within `axes'
; ββββββ @ array.jl:155 within `size'
movq 24(%r14), %rax
; ββββββ
; ββββββ @ tuple.jl:157 within `map'
; βββββββ @ range.jl:326 within `OneTo' @ range.jl:317
; ββββββββ @ promotion.jl:409 within `max'
testq %rax, %rax
; ββββββββ
; ββ @ simdloop.jl:72 within `macro expansion'
jle L100
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r15
movq (%rsi), %r12
movq 16(%rsi), %r13
xorl %ebx, %ebx
movabsq $sin, %rbp
nopl (%rax,%rax)
; ββ @ simdloop.jl:77 within `macro expansion' @ In[169]:3
; βββ @ array.jl:809 within `getindex'
L64:
movq (%r14), %rax
vmovsd (%rax,%rbx,8), %xmm0 # xmm0 = mem[0],zero
; βββ
callq *%rbp
; βββ @ array.jl:809 within `getindex'
movq (%r13), %rax
; βββ
; βββ @ float.jl:405 within `*'
vmulsd (%rax,%rbx,8), %xmm0, %xmm0
; βββ
; βββ @ array.jl:847 within `setindex!'
movq (%r12), %rax
vmovsd %xmm0, (%rax,%rbx,8)
; βββ
; ββ @ simdloop.jl:78 within `macro expansion'
; βββ @ int.jl:86 within `+'
incq %rbx
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
; βββ @ int.jl:82 within `<'
cmpq %r15, %rbx
; βββ
jb L64
; ββ
; β @ simdloop.jl within `two_mul_sin'
L100:
movabsq $jl_system_image_data, %rax
; β
; β @ In[169]:2 within `two_mul_sin'
addq $8, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
; β
526.181 μs (0 allocations: 0 bytes)
versioninfo()
Julia Version 1.5.2
Commit 539f3ce943 (2020-09-23 23:17 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Xeon(R) W-2133 CPU @ 3.60GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, skylake-avx512)
Environment:
JULIA_NUM_THREADS = 6