Hi kolia,
I used the same input as you. Here is the output of my versioninfo():
Julia Version 1.2.0
Commit c6da87ff4b (2019-08-20 00:03 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i5-4300U CPU @ 1.90GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, haswell)
Maybe the reason you don’t see a speed-up is related to your CPU or OS. Here’s my @code_native
output to compare with yours:
julia> @code_native summed(a)
; Native code for `summed' (x86-64, AT&T operand order: source first, destination last).
; Overall shape, as visible in the instructions below:
;   1. load the element count; if it is <= 0, return 0.0
;   2. if count >= 16: run a 4x-unrolled AVX loop that adds 16 doubles per iteration
;      into four independent ymm accumulators, then reduce them to one scalar
;   3. add any remaining (count mod 16) elements one at a time with vaddsd
; The running sum / result lives in %xmm0.
; NOTE(review): %rdi presumably points at the array object passed as `a`
; (data pointer at offset 0, length at offset 8) -- confirm against the Julia array layout.
.text
; ┌ @ REPL[2]:3 within `summed'
; │┌ @ simdloop.jl:71 within `macro expansion'
; ││┌ @ simdloop.jl:51 within `simd_inner_length'
; │││┌ @ REPL[2]:2 within `length'
; rax = 8-byte value at offset 8 from rdi (attributed to `length' by the debug info above)
movq 8(%rdi), %rax
; │└└└
; │┌ @ int.jl:49 within `macro expansion'
testq %rax, %rax
; └└
; ┌ @ simdloop.jl:72 within `summed'
; count <= 0 (signed): nothing to add, take the early exit at L26
jle L26
; rcx = 8-byte value at offset 0 from rdi; used below as the base address of the elements
movq (%rdi), %rcx
; │ @ simdloop.jl:75 within `summed'
; count >= 16: enough elements for at least one pass of the vector loop
cmpq $16, %rax
jae L34
; short input (1..15 elements): sum = 0.0, elements already consumed (rdx) = 0,
; then go straight to the scalar loop
vxorpd %xmm0, %xmm0, %xmm0
xorl %edx, %edx
jmp L124
; early exit: return 0.0 in xmm0
L26:
vxorps %xmm0, %xmm0, %xmm0
; └
; ┌ @ REPL[2]:6 within `summed'
vzeroupper
retq
; │ @ REPL[2]:3 within `summed'
; │┌ @ simdloop.jl:75 within `macro expansion'
; vector path setup
L34:
; rdx = count rounded down to a multiple of 16 = number of elements the vector loop handles
movq %rax, %rdx
andq $-16, %rdx
; rsi = base + 96, so the four loads in the loop use displacements -96, -64, -32 and 0
leaq 96(%rcx), %rsi
; zero accumulator 0 (VEX-encoded 128-bit ops also clear the upper ymm bits)
vxorpd %xmm0, %xmm0, %xmm0
; ││ @ simdloop.jl:78 within `macro expansion'
; ││┌ @ int.jl:53 within `+'
; rdi = loop counter, counts down from rdx in steps of 16 (the original rdi is no longer needed)
movq %rdx, %rdi
; zero accumulators 1..3
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
; ││└
; ││ @ simdloop.jl:77 within `macro expansion' @ REPL[2]:4
; ││┌ @ float.jl:395 within `+'
; main vector loop: each vaddpd adds 4 packed doubles (32 bytes) from memory into its
; own accumulator, so one iteration consumes 16 doubles = 128 bytes
L64:
vaddpd -96(%rsi), %ymm0, %ymm0
vaddpd -64(%rsi), %ymm1, %ymm1
vaddpd -32(%rsi), %ymm2, %ymm2
vaddpd (%rsi), %ymm3, %ymm3
; │└└
; │┌ @ int.jl:53 within `macro expansion'
; advance the pointer by 128 bytes (subtracting -128) and the counter by -16;
; loop until the counter reaches zero
subq $-128, %rsi
addq $-16, %rdi
jne L64
; │└
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[2]:4
; ││┌ @ float.jl:395 within `+'
; reduction: fold the four accumulators into ymm0 ...
vaddpd %ymm0, %ymm1, %ymm0
vaddpd %ymm0, %ymm2, %ymm0
vaddpd %ymm0, %ymm3, %ymm0
; ... add the upper 128-bit half onto the lower half ...
vextractf128 $1, %ymm0, %xmm1
vaddpd %ymm1, %ymm0, %ymm0
; ... and add the two remaining doubles horizontally; the low element of xmm0 is now the sum
vhaddpd %ymm0, %ymm0, %ymm0
; if the vector loop covered every element (count was a multiple of 16), skip the scalar loop
cmpq %rdx, %rax
; └└└
; ┌ @ simdloop.jl:75 within `summed'
je L158
; scalar remainder setup (also the entry point for inputs shorter than 16, with rdx = 0)
L124:
; rax = number of elements still to add
subq %rdx, %rax
; rcx = address of the first remaining element (base + rdx * 8 bytes)
leaq (%rcx,%rdx,8), %rcx
; multi-byte no-op; has no effect on registers or memory (padding before the loop label)
nopw %cs:(%rax,%rax)
; └
; ┌ @ REPL[2]:3 within `summed'
; │┌ @ simdloop.jl:77 within `macro expansion' @ REPL[2]:4
; ││┌ @ float.jl:395 within `+'
; scalar loop: add one double per iteration into the low element of xmm0
L144:
vaddsd (%rcx), %xmm0, %xmm0
; ││└
; ││ @ simdloop.jl:75 within `macro expansion'
; ││┌ @ int.jl:49 within `<'
; next element (8 bytes), one fewer remaining; loop until the remaining count is zero
addq $8, %rcx
addq $-1, %rax
; ││└
jne L144
; │└
; │ @ REPL[2]:6 within `summed'
; common exit: clear the upper ymm halves before returning; the result is in xmm0
L158:
vzeroupper
retq
; trailing multi-byte no-op after the return (never executed on any path above)
nopw %cs:(%rax,%rax)
; └
Glen