I have noticed that Julia produces much shorter assembly for the \
-operator in contrast to inv(A)
. Here are the two functions:
foo(A,b) = inv(A) * b
bar(A,b) = A\b
As expected, for A=[1.0 2.0 4.4; 4.0 3.3 53.5; 34.0 35.0 34.0]
and b=[1.3; 34.0; 46.0]
the backslash operator is much faster:
@btime foo(A,b) # -> 568.522 ns
@btime bar(A,b) #-> 339.229 ns
However, if we use the identitiy matrix B=diagonal(ones(3))
instead of vector b
, function foo
is now faster, although the assembly is much longer:
@btime foo(A,B) # -> 603.418 ns
@btime bar(A,B) #-> 1.377 mus
Here is the assembler output for b
:
foo:
.text
; β @ REPL[1]:1 within `foo'
push rbp
mov rbp, rsp
push r15
push r14
push rbx
and rsp, -32
sub rsp, 96
vxorps xmm0, xmm0, xmm0
vmovaps ymmword ptr [rsp + 32], ymm0
mov qword ptr [rsp + 80], rsi
mov r15, qword ptr fs:[0]
mov qword ptr [rsp + 32], 8
mov rax, qword ptr [r15 - 15712]
mov qword ptr [rsp + 40], rax
lea rax, [rsp + 32]
mov qword ptr [r15 - 15712], rax
mov rax, qword ptr [rsi]
mov r14, qword ptr [rsi + 8]
mov qword ptr [rsp + 24], rax
movabs rax, offset japi1_inv_17409
movabs rdi, offset jl_system_image_data
lea rsi, [rsp + 24]
mov edx, 1
vzeroupper
call rax
mov rbx, rax
; ββ @ matmul.jl:47 within `*'
; βββ @ array.jl:154 within `size'
mov rsi, qword ptr [rax + 24]
mov qword ptr [rsp + 48], rax
movabs rdi, offset jl_system_image_data
movabs rax, offset jl_alloc_array_1d
; βββ
; βββ @ abstractarray.jl:628 within `similar' @ array.jl:361
; ββββ @ boot.jl:414 within `Array' @ boot.jl:405
call rax
mov qword ptr [rsp + 56], rax
; ββββ
; βββ @ matmul.jl:208 within `mul!' @ matmul.jl:66
movabs r10, offset "julia_gemv!_17418"
mov rdi, rax
mov esi, 1308622848
mov rdx, rbx
mov rcx, r14
mov r8d, 1
xor r9d, r9d
call r10
mov rcx, qword ptr [rsp + 40]
mov qword ptr [r15 - 15712], rcx
; βββ
lea rsp, [rbp - 24]
pop rbx
pop r14
pop r15
pop rbp
ret
nop dword ptr [rax + rax]
; β
bar:
.text
; β @ REPL[2]:1 within `bar'
sub rsp, 24
mov qword ptr [rsp], rsi
vmovups xmm0, xmmword ptr [rsi]
vmovups xmmword ptr [rsp + 8], xmm0
movabs rax, offset "japi1_\_17421"
movabs rdi, offset jl_system_image_data
lea rsi, [rsp + 8]
mov edx, 2
call rax
add rsp, 24
ret
nop word ptr [rax + rax]
; β
And here for B
:
foo:
.text
; β @ REPL[1]:1 within `foo'
push rbp
mov rbp, rsp
push r15
push r14
push r12
push rbx
and rsp, -32
sub rsp, 96
vxorps xmm0, xmm0, xmm0
vmovaps ymmword ptr [rsp + 32], ymm0
mov qword ptr [rsp + 72], rsi
mov rax, qword ptr fs:[0]
mov qword ptr [rsp + 32], 8
mov rcx, qword ptr [rax - 15712]
mov qword ptr [rsp + 40], rcx
lea rcx, [rsp + 32]
mov qword ptr [rax - 15712], rcx
lea r14, [rax - 15712]
mov rax, qword ptr [rsi]
mov r12, qword ptr [rsi + 8]
mov qword ptr [rsp + 16], rax
movabs rax, offset inv
movabs rdi, offset jl_system_image_data
lea rsi, [rsp + 16]
mov edx, 1
vzeroupper
call rax
mov rbx, rax
; ββ @ diagonal.jl:170 within `*'
; βββ @ array.jl:156 within `size'
mov rsi, qword ptr [rax + 24]
mov rdx, qword ptr [rax + 32]
mov qword ptr [rsp + 56], rax
movabs rdi, offset jl_system_image_data
movabs rax, offset jl_alloc_array_2d
; βββ
; βββ @ array.jl:361 within `similar'
; ββββ @ boot.jl:415 within `Array' @ boot.jl:407
call rax
mov r15, rax
; ββββ
; βββ @ array.jl:330 within `copyto!'
; ββββ @ array.jl:221 within `length'
mov r8, qword ptr [rbx + 8]
; ββββ
; βββ @ array.jl:330 within `copyto!' @ array.jl:313
; ββββ @ promotion.jl:398 within `=='
test r8, r8
; ββββ
je L221
; βββ @ array.jl:330 within `copyto!' @ array.jl:314
jle L289
; βββ @ array.jl:330 within `copyto!' @ array.jl:315
; ββββ @ operators.jl:294 within `>'
; βββββ @ int.jl:49 within `<'
cmp qword ptr [r15 + 8], r8
; βββββ
jl L303
; βββ @ array.jl:330 within `copyto!' @ array.jl:0
mov qword ptr [rsp + 48], r15
; βββ @ array.jl:330 within `copyto!' @ array.jl:318
movabs rax, offset "unsafe_copyto!"
mov esi, 1
mov ecx, 1
mov rdi, r15
mov rdx, rbx
call rax
; βββ
; ββ @ array.jl within `*'
L221:
mov qword ptr [rsp + 48], r15
; ββ
; ββ @ diagonal.jl:170 within `*'
mov qword ptr [rsp + 16], r15
mov qword ptr [rsp + 24], r12
movabs rax, offset "japi1_rmul!_17712"
movabs rdi, offset jl_system_image_data
lea rsi, [rsp + 16]
mov edx, 2
call rax
mov rcx, qword ptr [rsp + 40]
mov qword ptr [r14], rcx
; ββ
lea rsp, [rbp - 32]
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
; ββ @ diagonal.jl:170 within `*'
; βββ @ array.jl:330 within `copyto!' @ array.jl:314
L289:
movabs rax, offset _throw_argerror
call rax
ud2
; βββ @ array.jl:330 within `copyto!' @ array.jl:316
; ββββ @ boot.jl:242 within `BoundsError'
L303:
movabs rax, offset jl_gc_pool_alloc
mov rdi, r14
mov esi, 1424
mov edx, 32
call rax
movabs rcx, offset jl_system_image_data
mov qword ptr [rax - 8], rcx
vxorps xmm0, xmm0, xmm0
vmovups xmmword ptr [rax], xmm0
mov qword ptr [rsp + 48], rax
; ββββ
movabs rcx, offset jl_throw
mov rdi, rax
call rcx
nop word ptr cs:[rax + rax]
nop dword ptr [rax]
; βββ
bar:
.text
; β @ REPL[2]:1 within `bar'
sub rsp, 24
mov qword ptr [rsp], rsi
vmovups xmm0, xmmword ptr [rsi]
vmovups xmmword ptr [rsp + 8], xmm0
movabs rax, offset "japi1_\_17734"
movabs rdi, offset jl_system_image_data
lea rsi, [rsp + 8]
mov edx, 2
call rax
add rsp, 24
ret
nop word ptr [rax + rax]
; β
To me it seems that the bar
- function calculates the result at compile time. But why is it then slower than foo
in case of the identity matrix. Am I missing something?