For information, the purpose of the disassembly exercise was to figure out what the fact that Julia integers are immutable means for the generated code (i.e. whether it has a performance impact, such as too many allocations).
In this particular example, I was interested in the loop counter.
However LLVM is smart and there is no loop counter in the generated code.
I get the same generated code for:
# Identity helper applied to each loop index.
g(i) = i

# Sum g(i) for i in 1:n, starting from zero(n); yields zero(n) when n < 1.
function foo(n)
    res = zero(n)
    for i ∈ 1:n
        res += g(i)
    end
    # Return the accumulator explicitly: a `for` loop evaluates to `nothing`,
    # so without this line the function would discard the sum.
    return res
end
i.e. the function g gets inlined.
Similarly, if I sum the squares, cubes or 4th powers (g(i)=i^4), LLVM builds the closed-form solution and again there is no loop counter.
So instead I tried this function:
function foobar(n, a)
    total = zero(n)               # accumulator starts as the zero of n's type
    for idx in 1:n                # empty range when n < 1, so the body is skipped
        total = total + a[idx]    # bounds-checked read of a[idx]
    end
    return total
end
julia>code_native(foobar,(Int64,Array{Int64,1}),:intel)
.text
Filename: REPL[101]
xor eax, eax
Source line: 3
test rdi, rdi
jle L49
Source line: 4
mov r8, qword ptr [rsi]
mov rdx, qword ptr [rsi + 24]
xor ecx, ecx
xor eax, eax
nop word ptr cs:[rax + rax]
L32:
cmp rcx, rdx
jae L50
add rax, qword ptr [r8 + 8*rcx]
Source line: 3
inc rcx
cmp rdi, rcx
jne L32
Source line: 6
L49:
ret
L50:
push rbp
mov rbp, rsp
Source line: 4
mov rdx, rsp
lea rax, [rdx - 16]
mov rsp, rax
inc rcx
mov qword ptr [rdx - 16], rcx
movabs rcx, jl_bounds_error_ints
mov edx, 1
mov rdi, rsi
mov rsi, rax
call rcx
nop
RAX stores the result res.
It is initialized to zero with XOR EAX, EAX (though I don’t know why this instruction is done twice)
RDI (1st argument) is the input n.
The array a is passed via RSI (2nd argument).
From the workflow, we can infer:
The memory at address RSI must contain the address of the raw Int64 array data, which is loaded into R8
The memory at address RSI+24 must contain the array size, which is loaded into RDX.
RCX is the counter i-1.
initialized to zero with XOR ECX, ECX
incremented with INC RCX
the computation is done by the line
add rax, qword ptr [r8 + 8*rcx]
The counter is compared to both n for the loop (label L32) and RDX for the array out of bounds exception (label L50)
Label L49 is used for the fast exit in case n<=0.
I guess it should not be surprising that the loop counter is a register that gets mutated, since LLVM has full freedom to optimize local variables, but I wanted to understand the simple things first. Next time I will try to see what happens in functions that try to modify integers stored in structs or arrays passed as arguments (there could be multiple external references to those integers).
Interestingly if I use the @inbounds macro:
function foobar(n, a)
    total = zero(n)                         # accumulator starts as the zero of n's type
    for idx in 1:n                          # empty range when n < 1, so the body is skipped
        @inbounds total = total + a[idx]    # read a[idx] with bounds checking elided
    end
    return total
end
I get:
julia>code_native(foobar,(Int64,Array{Int64,1}),:intel)
.text
Filename: REPL[107]
pushq %rbp
movq %rsp, %rbp
xorl %eax, %eax
Source line: 3
testq %rdi, %rdi
jle L156
Source line: 4
movq (%rsi), %r8
xorl %eax, %eax
movl $1, %edx
Source line: 3
cmpq $4, %rdi
jb L130
movq %rdi, %rsi
andq $-4, %rsi
xorl %eax, %eax
movq %rdi, %rcx
andq $-4, %rcx
movl $1, %edx
je L130
movq %rsi, %rdx
orq $1, %rdx
leaq 16(%r8), %rax
pxor %xmm0, %xmm0
movq %rcx, %rsi
pxor %xmm1, %xmm1
nopl (%rax)
Source line: 4
L80:
movdqu -16(%rax), %xmm2
movdqu (%rax), %xmm3
paddq %xmm2, %xmm0
paddq %xmm3, %xmm1
Source line: 3
addq $32, %rax
addq $-4, %rsi
jne L80
Source line: 4
paddq %xmm0, %xmm1
pshufd $78, %xmm1, %xmm0 # xmm0 = xmm1[2,3,0,1]
paddq %xmm1, %xmm0
movd %xmm0, %rax
cmpq %rdi, %rcx
Source line: 3
je L156
L130:
incq %rdi
subq %rdx, %rdi
leaq -8(%r8,%rdx,8), %rcx
nopl (%rax)
Source line: 4
L144:
addq (%rcx), %rax
Source line: 3
addq $8, %rcx
decq %rdi
jne L144
Source line: 6
L156:
popq %rbp
retq
nop
No more calls to jl_bounds_error_ints
I was expecting the same code as before, minus the 2 lines involving the RDX register (array size) and the code following label L50, but the code is vastly different. There is much more code in total: it looks like it is using the XMM registers and SIMD instructions to generate faster code, so you get a bigger performance increase than from just removing the bounds checks.