I’m trying to follow the “Demystifying Auto-Vectorization in Julia” blog post. However, I couldn’t see the nice vaddpd
instructions in my output. Is there something that I need to do to enable SIMD? The E5-2699 v4 processor supports AVX2.
$ julia -O3
julia> versioninfo()
Julia Version 0.6.2
Commit d386e40c17 (2017-12-13 18:08 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
WORD_SIZE: 64
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
LAPACK: libopenblas64_
LIBM: libopenlibm
LLVM: libLLVM-3.9.1 (ORCJIT, broadwell)
julia> function mysum(a::Vector)  # sum all elements of `a`, returning a value of its element type
total = zero(eltype(a))  # accumulator starts at the additive zero of `a`'s element type
@simd for x in a  # @simd asks the compiler to vectorize this reduction loop
total += x
end
return total
end
mysum (generic function with 1 method)
julia> @code_native mysum(rand(Float64 , 100000)) ;
.text
Filename: REPL[8]
Source line: 67
movq 8(%rdi), %rax
Source line: 64
movq 24(%rdi), %rdx
xorl %ecx, %ecx
Source line: 79
testq %rdx, %rdx
cmovnsq %rdx, %rcx
Source line: 68
testq %rax, %rax
jle L206
Source line: 79
leaq -1(%rcx), %rsi
cmpq %rdx, %rsi
jae L210
Source line: 50
movq (%rdi), %r9
Source line: 66
leaq 16(%r9), %r8
movq %rax, %r10
andq $-4, %r10
pxor %xmm0, %xmm0
xorl %edi, %edi
jmp L192
Source line: 71
L64:
testq %rax, %rax
jle L192
Source line: 50
cmpq $4, %rax
jae L79
xorl %edx, %edx
jmp L165
L79:
movq %rax, %rdx
andq $-4, %rdx
je L163
movq %xmm0, %xmm0 # xmm0 = xmm0[0],zero
xorpd %xmm1, %xmm1
Source line: 74
movq %r10, %rsi
movq %r8, %rcx
nopw %cs:(%rax,%rax)
Source line: 50
L112:
movupd -16(%rcx), %xmm2
movupd (%rcx), %xmm3
Source line: 4
addpd %xmm2, %xmm0
addpd %xmm3, %xmm1
Source line: 50
addq $32, %rcx
addq $-4, %rsi
jne L112
Source line: 4
addpd %xmm0, %xmm1
movapd %xmm1, %xmm0
shufpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
addpd %xmm1, %xmm0
cmpq %rdx, %rax
je L192
jmp L165
L163:
xorl %edx, %edx
Source line: 50
L165:
movq %rax, %rcx
subq %rdx, %rcx
leaq (%r9,%rdx,8), %rdx
nop
Source line: 4
L176:
addsd (%rdx), %xmm0
Source line: 71
addq $8, %rdx
decq %rcx
jne L176
nopl (%rax)
Source line: 66
L192:
incq %rdi
cmpq $2, %rdi
jne L64
Source line: 6
retq
L206:
xorps %xmm0, %xmm0
retq
L210:
pushq %rbp
movq %rsp, %rbp
Source line: 79
movq %rsp, %rax
leaq -16(%rax), %rsi
movq %rsi, %rsp
movq %rcx, -16(%rax)
movabsq $jl_bounds_error_ints, %rax
movl $1, %edx
callq *%rax
nopw %cs:(%rax,%rax)