Assembler length vs speed (backslash- and inverse-operator)

I have noticed that Julia produces much shorter assembly for the \-operator in contrast to inv(A). Here are the two functions:

foo(A,b) = inv(A) * b
bar(A,b) = A\b

As expected, for A=[1.0 2.0 4.4; 4.0 3.3 53.5; 34.0 35.0 34.0] and b=[1.3; 34.0; 46.0] the backslash operator is much faster:

@btime foo(A,b)   # -> 568.522 ns
@btime bar(A,b)   #-> 339.229 ns

However, if we use the identitiy matrix B=diagonal(ones(3)) instead of vector b, function foo is now faster, although the assembly is much longer:

@btime foo(A,B)   # -> 603.418 ns
@btime bar(A,B)   #-> 1.377 mus

Here is the assembler output for b:

foo:

.text
; β”Œ @ REPL[1]:1 within `foo'
push	rbp
mov	rbp, rsp
push	r15
push	r14
push	rbx
and	rsp, -32
sub	rsp, 96
vxorps	xmm0, xmm0, xmm0
vmovaps	ymmword ptr [rsp + 32], ymm0
mov	qword ptr [rsp + 80], rsi
mov	r15, qword ptr fs:[0]
mov	qword ptr [rsp + 32], 8
mov	rax, qword ptr [r15 - 15712]
mov	qword ptr [rsp + 40], rax
lea	rax, [rsp + 32]
mov	qword ptr [r15 - 15712], rax
mov	rax, qword ptr [rsi]
mov	r14, qword ptr [rsi + 8]
mov	qword ptr [rsp + 24], rax
movabs	rax, offset japi1_inv_17409
movabs	rdi, offset jl_system_image_data
lea	rsi, [rsp + 24]
mov	edx, 1
vzeroupper
call	rax
mov	rbx, rax
    ; β”‚β”Œ @ matmul.jl:47 within `*'
    ; β”‚β”‚β”Œ @ array.jl:154 within `size'
mov	rsi, qword ptr [rax + 24]
mov	qword ptr [rsp + 48], rax
movabs	rdi, offset jl_system_image_data
movabs	rax, offset jl_alloc_array_1d
    ; β”‚β”‚β””
    ; β”‚β”‚β”Œ @ abstractarray.jl:628 within `similar' @ array.jl:361
    ; β”‚β”‚β”‚β”Œ @ boot.jl:414 within `Array' @ boot.jl:405
call	rax
mov	qword ptr [rsp + 56], rax
    ; β”‚β”‚β””β””
    ; β”‚β”‚β”Œ @ matmul.jl:208 within `mul!' @ matmul.jl:66
movabs	r10, offset "julia_gemv!_17418"
mov	rdi, rax
mov	esi, 1308622848
mov	rdx, rbx
mov	rcx, r14
mov	r8d, 1
xor	r9d, r9d
call	r10
mov	rcx, qword ptr [rsp + 40]
mov	qword ptr [r15 - 15712], rcx
    ; β”‚β””β””
lea	rsp, [rbp - 24]
pop	rbx
pop	r14
pop	r15
pop	rbp
ret
nop	dword ptr [rax + rax]
    ; β””

bar:

.text
; β”Œ @ REPL[2]:1 within `bar'
sub	rsp, 24
mov	qword ptr [rsp], rsi
vmovups	xmm0, xmmword ptr [rsi]
vmovups	xmmword ptr [rsp + 8], xmm0
movabs	rax, offset "japi1_\_17421"
movabs	rdi, offset jl_system_image_data
lea	rsi, [rsp + 8]
mov	edx, 2
call	rax
add	rsp, 24
ret
nop	word ptr [rax + rax]
; β””

And here for B:

foo:

.text
; β”Œ @ REPL[1]:1 within `foo'
push	rbp
mov	rbp, rsp
push	r15
push	r14
push	r12
push	rbx
and	rsp, -32
sub	rsp, 96
vxorps	xmm0, xmm0, xmm0
vmovaps	ymmword ptr [rsp + 32], ymm0
mov	qword ptr [rsp + 72], rsi
mov	rax, qword ptr fs:[0]
mov	qword ptr [rsp + 32], 8
mov	rcx, qword ptr [rax - 15712]
mov	qword ptr [rsp + 40], rcx
lea	rcx, [rsp + 32]
mov	qword ptr [rax - 15712], rcx
lea	r14, [rax - 15712]
mov	rax, qword ptr [rsi]
mov	r12, qword ptr [rsi + 8]
mov	qword ptr [rsp + 16], rax
movabs	rax, offset inv
movabs	rdi, offset jl_system_image_data
lea	rsi, [rsp + 16]
mov	edx, 1
vzeroupper
call	rax
mov	rbx, rax
; β”‚β”Œ @ diagonal.jl:170 within `*'
; β”‚β”‚β”Œ @ array.jl:156 within `size'
mov	rsi, qword ptr [rax + 24]
mov	rdx, qword ptr [rax + 32]
mov	qword ptr [rsp + 56], rax
movabs	rdi, offset jl_system_image_data
movabs	rax, offset jl_alloc_array_2d
; β”‚β”‚β””
; β”‚β”‚β”Œ @ array.jl:361 within `similar'
; β”‚β”‚β”‚β”Œ @ boot.jl:415 within `Array' @ boot.jl:407
call	rax
mov	r15, rax
; β”‚β”‚β””β””
; β”‚β”‚β”Œ @ array.jl:330 within `copyto!'
; β”‚β”‚β”‚β”Œ @ array.jl:221 within `length'
mov	r8, qword ptr [rbx + 8]
; β”‚β”‚β”‚β””
; β”‚β”‚β”‚ @ array.jl:330 within `copyto!' @ array.jl:313
; β”‚β”‚β”‚β”Œ @ promotion.jl:398 within `=='
test	r8, r8
; β”‚β”‚β”‚β””
je	L221
; β”‚β”‚β”‚ @ array.jl:330 within `copyto!' @ array.jl:314
jle	L289
; β”‚β”‚β”‚ @ array.jl:330 within `copyto!' @ array.jl:315
; β”‚β”‚β”‚β”Œ @ operators.jl:294 within `>'
; β”‚β”‚β”‚β”‚β”Œ @ int.jl:49 within `<'
cmp	qword ptr [r15 + 8], r8
; β”‚β”‚β”‚β””β””
jl	L303
; β”‚β”‚β”‚ @ array.jl:330 within `copyto!' @ array.jl:0
mov	qword ptr [rsp + 48], r15
; β”‚β”‚β”‚ @ array.jl:330 within `copyto!' @ array.jl:318
movabs	rax, offset "unsafe_copyto!"
mov	esi, 1
mov	ecx, 1
mov	rdi, r15
mov	rdx, rbx
call	rax
; β”‚β””β””
; β”‚β”Œ @ array.jl within `*'
L221:
mov	qword ptr [rsp + 48], r15
; β”‚β””
; β”‚β”Œ @ diagonal.jl:170 within `*'
mov	qword ptr [rsp + 16], r15
mov	qword ptr [rsp + 24], r12
movabs	rax, offset "japi1_rmul!_17712"
movabs	rdi, offset jl_system_image_data
lea	rsi, [rsp + 16]
mov	edx, 2
call	rax
mov	rcx, qword ptr [rsp + 40]
mov	qword ptr [r14], rcx
; β”‚β””
lea	rsp, [rbp - 32]
pop	rbx
pop	r12
pop	r14
pop	r15
pop	rbp
ret
; β”‚β”Œ @ diagonal.jl:170 within `*'
; β”‚β”‚β”Œ @ array.jl:330 within `copyto!' @ array.jl:314
L289:
movabs	rax, offset _throw_argerror
call	rax
ud2
; β”‚β”‚β”‚ @ array.jl:330 within `copyto!' @ array.jl:316
; β”‚β”‚β”‚β”Œ @ boot.jl:242 within `BoundsError'
L303:
movabs	rax, offset jl_gc_pool_alloc
mov	rdi, r14
mov	esi, 1424
mov	edx, 32
call	rax
movabs	rcx, offset jl_system_image_data
mov	qword ptr [rax - 8], rcx
vxorps	xmm0, xmm0, xmm0
vmovups	xmmword ptr [rax], xmm0
mov	qword ptr [rsp + 48], rax
; β”‚β”‚β”‚β””
movabs	rcx, offset jl_throw
mov	rdi, rax
call	rcx
nop	word ptr cs:[rax + rax]
nop	dword ptr [rax]
  ; β””β””β””

bar:

	.text
; β”Œ @ REPL[2]:1 within `bar'
sub	rsp, 24
mov	qword ptr [rsp], rsi
vmovups	xmm0, xmmword ptr [rsi]
vmovups	xmmword ptr [rsp + 8], xmm0
movabs	rax, offset "japi1_\_17734"
movabs	rdi, offset jl_system_image_data
lea	rsi, [rsp + 8]
mov	edx, 2
call	rax
add	rsp, 24
ret
nop	word ptr [rax + rax]
; β””

To me it seems that the bar- function calculates the result at compile time. But why is it then slower than foo in case of the identity matrix. Am I missing something?

Please quote code in your messages.

Assembly length is a poor proxy for performance, especially in this case where the assembly is mostly just setting up calls to other functions. It’s better to use the profiler to understand where time is spend, before delving into low-level performance modelling. Also assembly is not the best thing to read if your brain isn’t an Intel CPU. Better to first look at code_typed and code_lowered.

1 Like

the problem here is that \ by a diagonal matrix (as of 1.4.1) uses a fallback method for abstract arrays which is unsurprisingly slow. I think I saw a PR recently that fixes this.

3 Likes