Amusingly, it is not as fast for me on 0.7:
versioninfo()
Julia Version 0.7.0-DEV.5021
Commit bfb1c1b91d* (2018-05-06 00:40 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i5-5###U CPU @ 2.00GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.0 (ORCJIT, broadwell)
So let’s see the reproducer:
using BenchmarkTools
d = rand(10000);
dm = rand(10000) .+ 1;
omega_3(d, dmax) = (x = d/dmax; y = 1 - x^2; y^4);
omega_3fm(d, dmax) = (x = d/dmax; y = 1 - x^2; @fastmath y^4);
omega_4(d, dmax) = (x = d/dmax; y = 1 - x^2; y*y*y*y);
begin
@btime omega_3.($d,$dm);
@btime omega_3fm.($d,$dm);
@btime omega_4.($d,$dm);
end;
# 799.657 μs (2 allocations: 78.20 KiB)
# 25.416 μs (2 allocations: 78.20 KiB)
# 18.386 μs (2 allocations: 78.20 KiB)
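(A guess at where the slow path goes, not verified here:) plain y^4 presumably ends up in the generic Float64^Int method, i.e. a libm pow call per element, which would explain the roughly 80 ns per element of omega_3. One way to check where the literal exponent goes:
# Should show a call through Base.literal_pow / ^ rather than
# plain multiplies if the literal 4 is not special-cased:
@code_lowered omega_3(2.3, 4.5)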
So what’s happening? Let’s look at the @code_native output, first the good one:
@code_native omega_4(2.3,4.5)
.text
; Function omega_4 {
vdivsd %xmm1, %xmm0, %xmm0
vmulsd %xmm0, %xmm0, %xmm0
movabsq $140486950354952, %rax # imm = 0x7FC5AAC58008
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
vsubsd %xmm0, %xmm1, %xmm0
vmulsd %xmm0, %xmm0, %xmm1
vmulsd %xmm1, %xmm0, %xmm1
vmulsd %xmm1, %xmm0, %xmm0
retq
nopw (%rax,%rax)
;}
And now the bad:
@code_native omega_3fm(2.3,4.5)
.text
; Function omega_3fm {
vdivsd %xmm1, %xmm0, %xmm0
vmulsd %xmm0, %xmm0, %xmm0
movabsq $140486950351792, %rax # imm = 0x7FC5AAC573B0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
vsubsd %xmm0, %xmm1, %xmm0
vmovsd %xmm0, -8(%rsp)
vmulsd %xmm0, %xmm0, %xmm0
vmulsd %xmm0, %xmm0, %xmm0
retq
nopl (%rax)
;}
Who ordered the stupid vmovsd %xmm0, -8(%rsp)? We save an entire multiply but waste the gain on a spurious store? WTF? I’d say that is an LLVM bug / missed optimization opportunity.
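One way to narrow this down (a suggestion; I have not chased it further here) is to check whether the store is already present in the optimized LLVM IR; if it only shows up in the native code, it was introduced during instruction selection:
# If no store to a stack slot appears in the IR, the spill was
# added later, during native code generation:
@code_llvm omega_3fm(2.3, 4.5)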
But this does not matter in isolation:
begin
@btime omega_3fm($1.23, $1.456);
@btime omega_4($1.23, $1.456);
end;
# 2.650 ns (0 allocations: 0 bytes)
# 2.649 ns (0 allocations: 0 bytes)
So let’s look at the code in context:
foo_3fm(dst,d,dm) = (dst .= omega_3fm.(d,dm); nothing);
foo_4(dst,d,dm) = (dst .= omega_4.(d,dm); nothing);
dst = similar(d);
@btime foo_3fm($dst,$d,$dm)
@btime foo_4($dst,$d,$dm)
@btime broadcast!(/, dst, d,dm);
# 24.188 μs (0 allocations: 0 bytes)
# 17.738 μs (0 allocations: 0 bytes)
# 17.547 μs (0 allocations: 0 bytes)
So we see that the extra multiplication in omega_4 does not matter at all: foo_4 runs at the speed of the bare division, so we are probably constrained by the division alone. It is the fastmath version, despite doing one multiply less, that is slower.
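As a sanity check of the division-bound hypothesis (my own extra experiment, not part of the timings above), swap the division for a multiplication; if the divider is the bottleneck, this should be substantially faster:
# Same broadcast shape, multiply instead of divide; a large speedup
# here would confirm that vdivsd throughput dominates.
@btime broadcast!(*, $dst, $d, $dm);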
We now have a simpler reproducer and can look at the code (oh my god, the new @code_native is annoyingly verbose; about 2/3 of it is endless unhelpful comments about the provenance of every instruction). We continue to see all the bad writes to the stack (vmovsd %xmm1, 8(%rsp)), and it is clear that they don’t matter unless either the division or a write faults. I have no idea what LLVM is thinking; is there a way to inspect the stack unwinding code for a div-by-zero? Maybe that’s why LLVM does not simply kill the instruction during generation of native code (the optimization would have to be pretty global, changing both the exception handling and the local code, so it is more difficult to spot).
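One experiment worth trying (a variant I have not timed): put @fastmath over the whole body, so that LLVM sees the division and subtraction as fast-math as well, and check whether the spill survives:
# Hypothetical variant with @fastmath over the whole body instead
# of just the power; does the stack store disappear?
omega_3fm2(d, dmax) = @fastmath (x = d/dmax; y = 1 - x^2; y^4)
@code_native omega_3fm2(2.3, 4.5)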
I don’t really understand why these moves are so expensive; they should never hit main memory (thanks to write combining and register renaming). Maybe someone could fire up IACA.jl? Or someone who has a better understanding of the uarch than I do?
@code_native foo_3fm(dst,d,dm)
.text
; Function foo_3fm {
; Location: REPL[35]:1
pushq %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $56, %rsp
vxorpd %xmm0, %xmm0, %xmm0
vmovapd %xmm0, 16(%rsp)
movq $0, 32(%rsp)
movq %rsi, 48(%rsp)
movq %fs:0, %rax
movq $2, 16(%rsp)
movq -10920(%rax), %rcx
movq %rcx, 24(%rsp)
leaq 16(%rsp), %rcx
movq %rcx, -10920(%rax)
leaq -10920(%rax), %rdi
movq (%rsi), %r12
movq 8(%rsi), %rbx
movq 16(%rsi), %r15
movq 24(%r12), %rbp
movq %rbp, %rax
sarq $63, %rax
andnq %rbp, %rax, %r13
movq 24(%rbx), %r14
movq %r14, %rax
sarq $63, %rax
andnq %r14, %rax, %rax
cmpq %rax, %r13
je L146
cmpq $1, %rax
jne L665
L146:
movq 24(%r15), %rax
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %rax
cmpq %rax, %r13
je L177
cmpq $1, %rax
jne L734
L177:
cmpq %rbx, %r12
je L219
movq (%r12), %rax
cmpq (%rbx), %rax
jne L219
movabsq $jl_array_copy, %rax
movq %rdi, %r14
movq %rbx, %rdi
callq *%rax
movq %r14, %rdi
movq %rax, %rbx
movq 24(%rbx), %r14
L219:
cmpq %r15, %r12
je L266
movq (%r12), %rax
cmpq (%r15), %rax
jne L266
movq %rbx, 32(%rsp)
movabsq $jl_array_copy, %rax
movq %rdi, 40(%rsp)
movq %r15, %rdi
callq *%rax
movq 40(%rsp), %rdi
movq %rax, %r15
L266:
testq %r13, %r13
jle L632
testq %rbp, %rbp
jle L632
movq 24(%r15), %rax
cmpq $1, %r14
jne L379
cmpq $1, %rax
jne L476
xorl %eax, %eax
movabsq $140486950383608, %rcx # imm = 0x7FC5AAC5EFF8
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
L320:
movq (%rbx), %rcx
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
movq (%r15), %rcx
vdivsd (%rcx), %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm1
vsubsd %xmm1, %xmm0, %xmm1
vmovsd %xmm1, 8(%rsp)
vmulsd %xmm1, %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm1
movq (%r12), %rcx
vmovsd %xmm1, (%rcx,%rax,8)
addq $1, %rax
cmpq %rax, %r13
jne L320
jmp L632
L379:
cmpq $1, %rax
jne L553
xorl %eax, %eax
movabsq $140486950383608, %rcx # imm = 0x7FC5AAC5EFF8
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
nopw %cs:(%rax,%rax)
L416:
movq (%rbx), %rcx
vmovsd (%rcx,%rax,8), %xmm1 # xmm1 = mem[0],zero
movq (%r15), %rcx
vdivsd (%rcx), %xmm1, %xmm1
vsubsd %xmm1, %xmm0, %xmm1
vmovsd %xmm1, 8(%rsp)
vmulsd %xmm1, %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm1
movq (%r12), %rcx
vmovsd %xmm1, (%rcx,%rax,8)
addq $1, %rax
cmpq %rax, %r13
jne L416
jmp L632
L476:
xorl %eax, %eax
movabsq $140486950383608, %rcx # imm = 0x7FC5AAC5EFF8
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
nopl (%rax)
L496:
movq (%rbx), %rcx
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
movq (%r15), %rcx
vdivsd (%rcx,%rax,8), %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm1
vsubsd %xmm1, %xmm0, %xmm1
vmovsd %xmm1, 8(%rsp)
vmulsd %xmm1, %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm1
movq (%r12), %rcx
vmovsd %xmm1, (%rcx,%rax,8)
addq $1, %rax
cmpq %rax, %r13
jne L496
jmp L632
L553:
xorl %eax, %eax
movabsq $140486950383608, %rcx # imm = 0x7FC5AAC5EFF8
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
nopl (%rax)
L576:
movq (%rbx), %rcx
vmovsd (%rcx,%rax,8), %xmm1 # xmm1 = mem[0],zero
vdivsd (%rcx,%rax,8), %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm1
vsubsd %xmm1, %xmm0, %xmm1
vmovsd %xmm1, 8(%rsp)
vmulsd %xmm1, %xmm1, %xmm1
vmulsd %xmm1, %xmm1, %xmm1
movq (%r12), %rcx
vmovsd %xmm1, (%rcx,%rax,8)
addq $1, %rax
jne L576
L632:
movq 24(%rsp), %rax
movq %rax, (%rdi)
movabsq $140486995693576, %rax # imm = 0x7FC5AD795008
addq $56, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
L665:
movabsq $jl_gc_pool_alloc, %rax
movl $1424, %esi # imm = 0x590
movl $16, %edx
callq *%rax
movabsq $jl_system_image_data, %rcx
movq %rcx, -8(%rax)
movabsq $jl_system_image_data, %rcx
movq %rcx, (%rax)
movq %rax, 32(%rsp)
movabsq $jl_throw, %rcx
movq %rax, %rdi
callq *%rcx
L734:
movabsq $jl_gc_pool_alloc, %rax
movl $1424, %esi # imm = 0x590
movl $16, %edx
callq *%rax
movabsq $jl_system_image_data, %rcx
movq %rcx, -8(%rax)
movabsq $jl_system_image_data, %rcx
movq %rcx, (%rax)
movq %rax, 32(%rsp)
movabsq $jl_throw, %rcx
movq %rax, %rdi
callq *%rcx
nopw %cs:(%rax,%rax)
;}
Postscript:
The better way is of course
omega_5(d, dmax) = (x = d/dmax; y = 1.0 - x*x; (y*y)*(y*y));
which saves us the extra multiply. It doesn’t matter here, since we appear to be constrained by the divisions, unless you are hyperthreading and someone else can use the sweet execution ports we waste.
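For completeness, omega_5 drops into the same harness (no timings included; given the division bottleneck I’d expect it to land right next to foo_4):
foo_5(dst, d, dm) = (dst .= omega_5.(d, dm); nothing);
@btime foo_5($dst, $d, $dm)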