Consider the following example in Swift, taken from a Google Groups thread:
func f() -> [Int] {
return [0, 1]
}
func testMe() -> Int {
var r = 0
for x in f() {
r += x
}
return r
}
According to Dave in that thread,
“When I build that with swiftc -O -S, I see this definition of testMe, which shows me that at least in some circumstances, Swift can specialize on not just the length but the contents of an Array:”
With the following output:
_$s1x6testMeSiyF:
pushq %rbp
movq %rsp, %rbp
movl $1, %eax
popq %rbp
retq
However, the comparable Julia code:
f()= [0,1]
function testMe(F)
r=0
for x in F()
r+=x
end
return r
end
@code_native testMe(f)
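Gives the output below. (Note: the first listing is labelled `sum_arg` and uses floating-point adds, so it appears to come from a different function pasted in by mistake; the LLVM IR and native code for `testMe` follow it.)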
.text
; β @ untitled-10d92c9353faa31dff3822d67fccff61:4 within `sum_arg'
pushq %rbp
movq %rsp, %rbp
; β @ untitled-10d92c9353faa31dff3822d67fccff61:5 within `sum_arg'
; ββ @ array.jl:720 within `iterate' @ array.jl:720
; βββ @ array.jl:216 within `length'
movq 8(%rcx), %rax
; βββ
testq %rax, %rax
jle L64
; βββ @ array.jl:744 within `getindex'
movq (%rcx), %rcx
; βββ
; β @ untitled-10d92c9353faa31dff3822d67fccff61:6 within `sum_arg'
; ββ @ float.jl:401 within `+'
vxorpd %xmm0, %xmm0, %xmm0
vaddsd (%rcx), %xmm0, %xmm0
; ββ
; ββ @ array.jl:720 within `iterate'
; βββ @ int.jl:430 within `<' @ int.jl:423
cmpq $1, %rax
; βββ
je L62
; ββ
; β @ array.jl within `sum_arg'
movl $1, %edx
nopw %cs:(%rax,%rax)
; β
; β @ untitled-10d92c9353faa31dff3822d67fccff61:6 within `sum_arg'
; ββ @ float.jl:401 within `+'
L48:
vaddsd (%rcx,%rdx,8), %xmm0, %xmm0
; ββ
; ββ @ array.jl:720 within `iterate'
; βββ @ int.jl:430 within `<' @ int.jl:423
addq $1, %rdx
cmpq %rax, %rdx
; βββ
jb L48
; ββ
; │ @ untitled-10d92c9353faa31dff3822d67fccff61:8 within `sum_arg'
L62:
popq %rbp
retq
L64:
vxorps %xmm0, %xmm0, %xmm0
; β @ untitled-10d92c9353faa31dff3822d67fccff61:8 within `sum_arg'
popq %rbp
retq
nopw %cs:(%rax,%rax)
; β
; @ untitled-a318115c5de073a6de1a9c13afc76741:4 within `testMe'
; Function Attrs: uwtable
define i64 @julia_testMe_18378() #0 {
L24:
; @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β @ untitled-a318115c5de073a6de1a9c13afc76741:1 within `f'
; ββ @ array.jl:130 within `vect'
; βββ @ array.jl:614 within `_array_for'
; ββββ @ abstractarray.jl:671 within `similar' @ abstractarray.jl:672
; βββββ @ boot.jl:413 within `Array' @ boot.jl:404
%0 = call %jl_value_t addrspace(10)* inttoptr (i64 1720217328 to %jl_value_t addrspace(10)* (%jl_value_t addrspace(10)*, i64)*)(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 121410064 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 2)
%1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%2 = bitcast %jl_value_t addrspace(11)* %1 to i64 addrspace(13)* addrspace(11)*
%3 = load i64 addrspace(13)*, i64 addrspace(13)* addrspace(11)* %2, align 8
; βββββ
; βββ @ array.jl:782 within `setindex!'
%4 = bitcast i64 addrspace(13)* %3 to <2 x i64> addrspace(13)*
store <2 x i64> <i64 0, i64 1>, <2 x i64> addrspace(13)* %4, align 8
; βββ
; β @ array.jl:720 within `iterate' @ array.jl:720
; ββ @ array.jl:216 within `length'
%5 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
%6 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %5, i64 0, i32 1
%7 = load i64, i64 addrspace(11)* %6, align 8
; ββ
%8 = icmp slt i64 %7, 2
br i1 %8, label %L66, label %L65.lr.ph.L65.lr.ph.split_crit_edge
L65.lr.ph.L65.lr.ph.split_crit_edge: ; preds = %L24
; β
; @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β @ array.jl:720 within `iterate'
%9 = add i64 %7, -1
%min.iters.check = icmp ult i64 %9, 16
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L65.lr.ph.L65.lr.ph.split_crit_edge
%n.vec = and i64 %9, -16
%ind.end = or i64 %n.vec, 1
%ind.end28 = or i64 %n.vec, 2
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %18, %vector.body ]
%vec.phi32 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
%vec.phi33 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %20, %vector.body ]
%vec.phi34 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %21, %vector.body ]
%offset.idx = or i64 %index, 1
; ββ @ array.jl:744 within `getindex'
%10 = getelementptr inbounds i64, i64 addrspace(13)* %3, i64 %offset.idx
; ββ
; @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β @ array.jl:720 within `iterate' @ array.jl:720
; ββ @ array.jl:744 within `getindex'
%11 = bitcast i64 addrspace(13)* %10 to <4 x i64> addrspace(13)*
%wide.load = load <4 x i64>, <4 x i64> addrspace(13)* %11, align 8
%12 = getelementptr i64, i64 addrspace(13)* %10, i64 4
%13 = bitcast i64 addrspace(13)* %12 to <4 x i64> addrspace(13)*
%wide.load42 = load <4 x i64>, <4 x i64> addrspace(13)* %13, align 8
%14 = getelementptr i64, i64 addrspace(13)* %10, i64 8
%15 = bitcast i64 addrspace(13)* %14 to <4 x i64> addrspace(13)*
%wide.load43 = load <4 x i64>, <4 x i64> addrspace(13)* %15, align 8
%16 = getelementptr i64, i64 addrspace(13)* %10, i64 12
%17 = bitcast i64 addrspace(13)* %16 to <4 x i64> addrspace(13)*
%wide.load44 = load <4 x i64>, <4 x i64> addrspace(13)* %17, align 8
; ββ
; @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β @ int.jl:53 within `+'
%18 = add <4 x i64> %wide.load, %vec.phi
%19 = add <4 x i64> %wide.load42, %vec.phi32
%20 = add <4 x i64> %wide.load43, %vec.phi33
%21 = add <4 x i64> %wide.load44, %vec.phi34
%index.next = add i64 %index, 16
%22 = icmp eq i64 %index.next, %n.vec
br i1 %22, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%bin.rdx = add <4 x i64> %19, %18
%bin.rdx45 = add <4 x i64> %20, %bin.rdx
%bin.rdx46 = add <4 x i64> %21, %bin.rdx45
%rdx.shuf = shufflevector <4 x i64> %bin.rdx46, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx47 = add <4 x i64> %bin.rdx46, %rdx.shuf
%rdx.shuf48 = shufflevector <4 x i64> %bin.rdx47, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx49 = add <4 x i64> %bin.rdx47, %rdx.shuf48
%23 = extractelement <4 x i64> %bin.rdx49, i32 0
%cmp.n = icmp eq i64 %9, %n.vec
; β
; β @ array.jl:720 within `iterate'
br i1 %cmp.n, label %L66, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L65.lr.ph.L65.lr.ph.split_crit_edge
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L65.lr.ph.L65.lr.ph.split_crit_edge ]
%bc.resume.val27 = phi i64 [ %ind.end28, %middle.block ], [ 2, %L65.lr.ph.L65.lr.ph.split_crit_edge ]
%bc.merge.rdx = phi i64 [ %23, %middle.block ], [ 0, %L65.lr.ph.L65.lr.ph.split_crit_edge ]
br label %L65
L65: ; preds = %L65, %scalar.ph
%24 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %value_phi1120, %L65 ]
%25 = phi i64 [ %bc.merge.rdx, %scalar.ph ], [ %28, %L65 ]
%value_phi1120 = phi i64 [ %bc.resume.val27, %scalar.ph ], [ %27, %L65 ]
; ββ @ array.jl:744 within `getindex'
%26 = getelementptr inbounds i64, i64 addrspace(13)* %3, i64 %24
; ββ
; ββ @ int.jl:53 within `+'
%27 = add i64 %value_phi1120, 1
; ββ
; @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β @ array.jl:720 within `iterate' @ array.jl:720
; ββ @ array.jl:744 within `getindex'
%value_phi10 = load i64, i64 addrspace(13)* %26, align 8
; ββ
; @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β @ int.jl:53 within `+'
%28 = add i64 %value_phi10, %25
; β
; β @ array.jl:720 within `iterate'
; ββ @ int.jl:430 within `<' @ int.jl:423
%29 = icmp ult i64 %value_phi1120, %7
; ββ
br i1 %29, label %L65, label %L66
L66: ; preds = %L65, %middle.block, %L24
%value_phi15 = phi i64 [ 0, %L24 ], [ %28, %L65 ], [ %23, %middle.block ]
; β
; @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
ret i64 %value_phi15
}
.text
; β @ untitled-a318115c5de073a6de1a9c13afc76741:4 within `testMe'
pushq %rbp
movq %rsp, %rbp
; β @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; ββ @ untitled-a318115c5de073a6de1a9c13afc76741:1 within `f'
; βββ @ array.jl:130 within `vect'
; ββββ @ array.jl:614 within `_array_for'
; βββββ @ abstractarray.jl:671 within `similar' @ abstractarray.jl:672
; ββββββ @ boot.jl:413 within `Array' @ boot.jl:404
pushq %rsi
subq $40, %rsp
movl $jl_alloc_array_1d, %eax
movl $jl_system_image_data, %ecx
movl $2, %edx
callq *%rax
movq (%rax), %r8
; ββββββ
; βββ @ array.jl:782 within `vect'
movl $1, %ecx
vmovq %rcx, %xmm0
vpslldq $8, %xmm0, %xmm0 # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
vmovdqu %xmm0, (%r8)
; βββ
; ββ @ array.jl:720 within `iterate' @ array.jl:720
; βββ @ array.jl:216 within `length'
movq 8(%rax), %r9
; βββ
cmpq $2, %r9
jge L68
; ββ
; β @ array.jl within `testMe'
xorl %eax, %eax
; β
; β @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
addq $40, %rsp
popq %rsi
popq %rbp
retq
; β @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; ββ @ array.jl:720 within `iterate'
L68:
leaq -1(%r9), %r10
cmpq $16, %r10
jae L92
xorl %eax, %eax
movl $2, %ecx
movl $1, %edx
jmp L214
; ββ @ array.jl:720 within `iterate'
L92:
movq %r10, %r11
andq $-16, %r11
leaq 1(%r11), %rdx
leaq 2(%r11), %rcx
leaq 104(%r8), %rax
vpxor %xmm0, %xmm0, %xmm0
movq %r11, %rsi
vpxor %xmm1, %xmm1, %xmm1
vpxor %xmm2, %xmm2, %xmm2
vpxor %xmm3, %xmm3, %xmm3
nopw %cs:(%rax,%rax)
; ββ
; ββ @ int.jl:53 within `+'
L144:
vpaddq -96(%rax), %ymm0, %ymm0
vpaddq -64(%rax), %ymm1, %ymm1
vpaddq -32(%rax), %ymm2, %ymm2
vpaddq (%rax), %ymm3, %ymm3
subq $-128, %rax
addq $-16, %rsi
jne L144
vpaddq %ymm0, %ymm1, %ymm0
vpaddq %ymm0, %ymm2, %ymm0
vpaddq %ymm0, %ymm3, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddq %ymm1, %ymm0, %ymm0
vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
vpaddq %ymm1, %ymm0, %ymm0
vmovq %xmm0, %rax
cmpq %r11, %r10
; ββ
; ββ @ array.jl:720 within `iterate'
je L244
L214:
leaq -1(%rcx), %rsi
nopw (%rax,%rax)
; ββ
; ββ @ int.jl:53 within `+'
L224:
addq (%r8,%rdx,8), %rax
; ββ
; ββ @ array.jl:720 within `iterate'
; βββ @ int.jl:430 within `<' @ int.jl:423
addq $1, %rsi
movq %rcx, %rdx
; βββ
; βββ @ int.jl:53 within `+'
leaq 1(%rcx), %rcx
; βββ
; ββ @ int.jl:423 within `iterate'
cmpq %r9, %rsi
; ββ
; β @ array.jl:720 within `testMe'
jb L224
; β
; β @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
L244:
addq $40, %rsp
popq %rsi
popq %rbp
vzeroupper
retq
nop
; β
julia> @code_native testMe(f)
.text
; β @ untitled-a318115c5de073a6de1a9c13afc76741:4 within `testMe'
pushq %rbp
movq %rsp, %rbp
; β @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; ββ @ untitled-a318115c5de073a6de1a9c13afc76741:1 within `f'
; βββ @ array.jl:130 within `vect'
; ββββ @ array.jl:614 within `_array_for'
; βββββ @ abstractarray.jl:671 within `similar' @ abstractarray.jl:672
; ββββββ @ boot.jl:413 within `Array' @ boot.jl:404
pushq %rsi
subq $40, %rsp
movl $jl_alloc_array_1d, %eax
movl $jl_system_image_data, %ecx
movl $2, %edx
callq *%rax
movq (%rax), %r8
; ββββββ
; βββ @ array.jl:782 within `vect'
movl $1, %ecx
vmovq %rcx, %xmm0
vpslldq $8, %xmm0, %xmm0 # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
vmovdqu %xmm0, (%r8)
; βββ
; ββ @ array.jl:720 within `iterate' @ array.jl:720
; βββ @ array.jl:216 within `length'
movq 8(%rax), %r9
; βββ
cmpq $2, %r9
jge L68
; ββ
; β @ array.jl within `testMe'
xorl %eax, %eax
; β
; β @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
addq $40, %rsp
popq %rsi
popq %rbp
retq
; β @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; ββ @ array.jl:720 within `iterate'
L68:
leaq -1(%r9), %r10
cmpq $16, %r10
jae L92
xorl %eax, %eax
movl $2, %ecx
movl $1, %edx
jmp L214
; ββ @ array.jl:720 within `iterate'
L92:
movq %r10, %r11
andq $-16, %r11
leaq 1(%r11), %rdx
leaq 2(%r11), %rcx
leaq 104(%r8), %rax
vpxor %xmm0, %xmm0, %xmm0
movq %r11, %rsi
vpxor %xmm1, %xmm1, %xmm1
vpxor %xmm2, %xmm2, %xmm2
vpxor %xmm3, %xmm3, %xmm3
nopw %cs:(%rax,%rax)
; ββ
; ββ @ int.jl:53 within `+'
L144:
vpaddq -96(%rax), %ymm0, %ymm0
vpaddq -64(%rax), %ymm1, %ymm1
vpaddq -32(%rax), %ymm2, %ymm2
vpaddq (%rax), %ymm3, %ymm3
subq $-128, %rax
addq $-16, %rsi
jne L144
vpaddq %ymm0, %ymm1, %ymm0
vpaddq %ymm0, %ymm2, %ymm0
vpaddq %ymm0, %ymm3, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddq %ymm1, %ymm0, %ymm0
vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
vpaddq %ymm1, %ymm0, %ymm0
vmovq %xmm0, %rax
cmpq %r11, %r10
; ββ
; ββ @ array.jl:720 within `iterate'
je L244
L214:
leaq -1(%rcx), %rsi
nopw (%rax,%rax)
; ββ
; ββ @ int.jl:53 within `+'
L224:
addq (%r8,%rdx,8), %rax
; ββ
; ββ @ array.jl:720 within `iterate'
; βββ @ int.jl:430 within `<' @ int.jl:423
addq $1, %rsi
movq %rcx, %rdx
; βββ
; βββ @ int.jl:53 within `+'
leaq 1(%rcx), %rcx
; βββ
; ββ @ int.jl:423 within `iterate'
cmpq %r9, %rsi
; ββ
; β @ array.jl:720 within `testMe'
jb L224
; β
; β @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
L244:
addq $40, %rsp
popq %rsi
popq %rbp
vzeroupper
retq
nop
; β
This is Julia 1.3 on Windows 10 with an Intel CPU.
Why is Julia’s ASM output so much larger?