Why is Julia's generated assembly more complicated than Swift's for a simple function?

Consider the following example in swift, from: https://groups.google.com/a/tensorflow.org/forum/m/?hl=bn#!topic/swift/qGgM5HKXVrE

func f() -> [Int] {
    return [0, 1]


}

func testMe() -> Int {
    var r = 0
    for x in f() {
        r += x
    }
    return r
}

According to Dave in that thread,

β€œWhen I build that with swiftc -O -S, I see this definition of testMe, which shows me that at least in some circumstances, Swift can specialize on not just the length but the contents of an Array:”

With the following output:

_$s1x6testMeSiyF:
pushq %rbp
movq %rsp, %rbp
movl $1, %eax
popq %rbp
retq

However the comparable Julia code:

f()= [0,1]

function testMe(F)
    r=0
    for x in F()
        r+=x
    end
    return r
end

@code_native testMe(f)

Gives:

        .text
; β”Œ @ untitled-10d92c9353faa31dff3822d67fccff61:4 within `sum_arg'
        pushq   %rbp
        movq    %rsp, %rbp
; β”‚ @ untitled-10d92c9353faa31dff3822d67fccff61:5 within `sum_arg'
; β”‚β”Œ @ array.jl:720 within `iterate' @ array.jl:720
; β”‚β”‚β”Œ @ array.jl:216 within `length'
        movq    8(%rcx), %rax
; β”‚β”‚β””
        testq   %rax, %rax
        jle     L64
; β”‚β”‚β”Œ @ array.jl:744 within `getindex'
        movq    (%rcx), %rcx
; β”‚β””β””
; β”‚ @ untitled-10d92c9353faa31dff3822d67fccff61:6 within `sum_arg'
; β”‚β”Œ @ float.jl:401 within `+'
        vxorpd  %xmm0, %xmm0, %xmm0
        vaddsd  (%rcx), %xmm0, %xmm0
; β”‚β””
; β”‚β”Œ @ array.jl:720 within `iterate'
; β”‚β”‚β”Œ @ int.jl:430 within `<' @ int.jl:423
        cmpq    $1, %rax
; β”‚β”‚β””
        je      L62
; β””β””
; β”Œ @ array.jl within `sum_arg'
        movl    $1, %edx
        nopw    %cs:(%rax,%rax)
; β””
; β”Œ @ untitled-10d92c9353faa31dff3822d67fccff61:6 within `sum_arg'
; β”‚β”Œ @ float.jl:401 within `+'
L48:
        vaddsd  (%rcx,%rdx,8), %xmm0, %xmm0
; β”‚β””
; β”‚β”Œ @ array.jl:720 within `iterate'
; β”‚β”‚β”Œ @ int.jl:430 within `<' @ int.jl:423
        addq    $1, %rdx
        cmpq    %rax, %rdx
; β”‚β”‚β””
        jb      L48
; β”‚β””
; β”‚ @ untitled-10d92c9353faa31dff3822d67fccff61:8 within `sum_arg'f3822d67fccff61:8 within `sum_arg'
L62:
        popq    %rbp
        retq
L64:
        vxorps  %xmm0, %xmm0, %xmm0
; β”‚ @ untitled-10d92c9353faa31dff3822d67fccff61:8 within `sum_arg'
        popq    %rbp
        retq
        nopw    %cs:(%rax,%rax)
; β””


;  @ untitled-a318115c5de073a6de1a9c13afc76741:4 within `testMe'
; Function Attrs: uwtable
define i64 @julia_testMe_18378() #0 {
L24:
;  @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:1 within `f'
; β”‚β”Œ @ array.jl:130 within `vect'
; β”‚β”‚β”Œ @ array.jl:614 within `_array_for'
; β”‚β”‚β”‚β”Œ @ abstractarray.jl:671 within `similar' @ abstractarray.jl:672
; β”‚β”‚β”‚β”‚β”Œ @ boot.jl:413 within `Array' @ boot.jl:404
       %0 = call %jl_value_t addrspace(10)* inttoptr (i64 1720217328 to %jl_value_t addrspace(10)* (%jl_value_t addrspace(10)*, i64)*)(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 121410064 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 2)
       %1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
       %2 = bitcast %jl_value_t addrspace(11)* %1 to i64 addrspace(13)* addrspace(11)*
       %3 = load i64 addrspace(13)*, i64 addrspace(13)* addrspace(11)* %2, align 8
; β”‚β”‚β””β””β””
; β”‚β”‚β”Œ @ array.jl:782 within `setindex!'
     %4 = bitcast i64 addrspace(13)* %3 to <2 x i64> addrspace(13)*
     store <2 x i64> <i64 0, i64 1>, <2 x i64> addrspace(13)* %4, align 8
; β””β””β””
; β”Œ @ array.jl:720 within `iterate' @ array.jl:720
; β”‚β”Œ @ array.jl:216 within `length'
    %5 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
    %6 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %5, i64 0, i32 1
    %7 = load i64, i64 addrspace(11)* %6, align 8
; β”‚β””
   %8 = icmp slt i64 %7, 2
   br i1 %8, label %L66, label %L65.lr.ph.L65.lr.ph.split_crit_edge

L65.lr.ph.L65.lr.ph.split_crit_edge:              ; preds = %L24
; β””
;  @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β”Œ @ array.jl:720 within `iterate'
   %9 = add i64 %7, -1
   %min.iters.check = icmp ult i64 %9, 16
   br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L65.lr.ph.L65.lr.ph.split_crit_edge
   %n.vec = and i64 %9, -16
   %ind.end = or i64 %n.vec, 1
   %ind.end28 = or i64 %n.vec, 2
   br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %18, %vector.body ]
   %vec.phi32 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
   %vec.phi33 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %20, %vector.body ]
   %vec.phi34 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %21, %vector.body ]
   %offset.idx = or i64 %index, 1
; β”‚β”Œ @ array.jl:744 within `getindex'
    %10 = getelementptr inbounds i64, i64 addrspace(13)* %3, i64 %offset.idx
; β””β””
;  @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β”Œ @ array.jl:720 within `iterate' @ array.jl:720
; β”‚β”Œ @ array.jl:744 within `getindex'
    %11 = bitcast i64 addrspace(13)* %10 to <4 x i64> addrspace(13)*
    %wide.load = load <4 x i64>, <4 x i64> addrspace(13)* %11, align 8
    %12 = getelementptr i64, i64 addrspace(13)* %10, i64 4
    %13 = bitcast i64 addrspace(13)* %12 to <4 x i64> addrspace(13)*
    %wide.load42 = load <4 x i64>, <4 x i64> addrspace(13)* %13, align 8
    %14 = getelementptr i64, i64 addrspace(13)* %10, i64 8
    %15 = bitcast i64 addrspace(13)* %14 to <4 x i64> addrspace(13)*
    %wide.load43 = load <4 x i64>, <4 x i64> addrspace(13)* %15, align 8
    %16 = getelementptr i64, i64 addrspace(13)* %10, i64 12
    %17 = bitcast i64 addrspace(13)* %16 to <4 x i64> addrspace(13)*
    %wide.load44 = load <4 x i64>, <4 x i64> addrspace(13)* %17, align 8
; β””β””
;  @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β”Œ @ int.jl:53 within `+'
   %18 = add <4 x i64> %wide.load, %vec.phi
   %19 = add <4 x i64> %wide.load42, %vec.phi32
   %20 = add <4 x i64> %wide.load43, %vec.phi33
   %21 = add <4 x i64> %wide.load44, %vec.phi34
   %index.next = add i64 %index, 16
   %22 = icmp eq i64 %index.next, %n.vec
   br i1 %22, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
   %bin.rdx = add <4 x i64> %19, %18
   %bin.rdx45 = add <4 x i64> %20, %bin.rdx
   %bin.rdx46 = add <4 x i64> %21, %bin.rdx45
   %rdx.shuf = shufflevector <4 x i64> %bin.rdx46, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %bin.rdx47 = add <4 x i64> %bin.rdx46, %rdx.shuf
   %rdx.shuf48 = shufflevector <4 x i64> %bin.rdx47, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %bin.rdx49 = add <4 x i64> %bin.rdx47, %rdx.shuf48
   %23 = extractelement <4 x i64> %bin.rdx49, i32 0
   %cmp.n = icmp eq i64 %9, %n.vec
; β””
; β”Œ @ array.jl:720 within `iterate'
   br i1 %cmp.n, label %L66, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L65.lr.ph.L65.lr.ph.split_crit_edge
   %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L65.lr.ph.L65.lr.ph.split_crit_edge ]
   %bc.resume.val27 = phi i64 [ %ind.end28, %middle.block ], [ 2, %L65.lr.ph.L65.lr.ph.split_crit_edge ]
   %bc.merge.rdx = phi i64 [ %23, %middle.block ], [ 0, %L65.lr.ph.L65.lr.ph.split_crit_edge ]
   br label %L65

L65:                                              ; preds = %L65, %scalar.ph
   %24 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %value_phi1120, %L65 ]
   %25 = phi i64 [ %bc.merge.rdx, %scalar.ph ], [ %28, %L65 ]
   %value_phi1120 = phi i64 [ %bc.resume.val27, %scalar.ph ], [ %27, %L65 ]
; β”‚β”Œ @ array.jl:744 within `getindex'
    %26 = getelementptr inbounds i64, i64 addrspace(13)* %3, i64 %24
; β”‚β””
; β”‚β”Œ @ int.jl:53 within `+'
    %27 = add i64 %value_phi1120, 1
; β””β””
;  @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β”Œ @ array.jl:720 within `iterate' @ array.jl:720
; β”‚β”Œ @ array.jl:744 within `getindex'
    %value_phi10 = load i64, i64 addrspace(13)* %26, align 8
; β””β””
;  @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β”Œ @ int.jl:53 within `+'
   %28 = add i64 %value_phi10, %25
; β””
; β”Œ @ array.jl:720 within `iterate'
; β”‚β”Œ @ int.jl:430 within `<' @ int.jl:423
    %29 = icmp ult i64 %value_phi1120, %7
; β”‚β””
   br i1 %29, label %L65, label %L66

L66:                                              ; preds = %L65, %middle.block, %L24
   %value_phi15 = phi i64 [ 0, %L24 ], [ %28, %L65 ], [ %23, %middle.block ]
; β””
;  @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
  ret i64 %value_phi15
}
        .text
; β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:4 within `testMe'
        pushq   %rbp
        movq    %rsp, %rbp
; β”‚ @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β”‚β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:1 within `f'
; β”‚β”‚β”Œ @ array.jl:130 within `vect'
; β”‚β”‚β”‚β”Œ @ array.jl:614 within `_array_for'
; β”‚β”‚β”‚β”‚β”Œ @ abstractarray.jl:671 within `similar' @ abstractarray.jl:672
; β”‚β”‚β”‚β”‚β”‚β”Œ @ boot.jl:413 within `Array' @ boot.jl:404
        pushq   %rsi
        subq    $40, %rsp
        movl    $jl_alloc_array_1d, %eax
        movl    $jl_system_image_data, %ecx
        movl    $2, %edx
        callq   *%rax
        movq    (%rax), %r8
; β”‚β”‚β”‚β””β””β””
; β”‚β”‚β”‚ @ array.jl:782 within `vect'
        movl    $1, %ecx
        vmovq   %rcx, %xmm0
        vpslldq $8, %xmm0, %xmm0        # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
        vmovdqu %xmm0, (%r8)
; β”‚β””β””
; β”‚β”Œ @ array.jl:720 within `iterate' @ array.jl:720
; β”‚β”‚β”Œ @ array.jl:216 within `length'
        movq    8(%rax), %r9
; β”‚β”‚β””
        cmpq    $2, %r9
        jge     L68
; β””β””
; β”Œ @ array.jl within `testMe'
        xorl    %eax, %eax
; β””
; β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
        addq    $40, %rsp
        popq    %rsi
        popq    %rbp
        retq
; β”‚ @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β”‚β”Œ @ array.jl:720 within `iterate'
L68:
        leaq    -1(%r9), %r10
        cmpq    $16, %r10
        jae     L92
        xorl    %eax, %eax
        movl    $2, %ecx
        movl    $1, %edx
        jmp     L214
; β”‚β”‚ @ array.jl:720 within `iterate'
L92:
        movq    %r10, %r11
        andq    $-16, %r11
        leaq    1(%r11), %rdx
        leaq    2(%r11), %rcx
        leaq    104(%r8), %rax
        vpxor   %xmm0, %xmm0, %xmm0
        movq    %r11, %rsi
        vpxor   %xmm1, %xmm1, %xmm1
        vpxor   %xmm2, %xmm2, %xmm2
        vpxor   %xmm3, %xmm3, %xmm3
        nopw    %cs:(%rax,%rax)
; β”‚β””
; β”‚β”Œ @ int.jl:53 within `+'
L144:
        vpaddq  -96(%rax), %ymm0, %ymm0
        vpaddq  -64(%rax), %ymm1, %ymm1
        vpaddq  -32(%rax), %ymm2, %ymm2
        vpaddq  (%rax), %ymm3, %ymm3
        subq    $-128, %rax
        addq    $-16, %rsi
        jne     L144
        vpaddq  %ymm0, %ymm1, %ymm0
        vpaddq  %ymm0, %ymm2, %ymm0
        vpaddq  %ymm0, %ymm3, %ymm0
        vextracti128    $1, %ymm0, %xmm1
        vpaddq  %ymm1, %ymm0, %ymm0
        vpshufd $78, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,1]
        vpaddq  %ymm1, %ymm0, %ymm0
        vmovq   %xmm0, %rax
        cmpq    %r11, %r10
; β”‚β””
; β”‚β”Œ @ array.jl:720 within `iterate'
        je      L244
L214:
        leaq    -1(%rcx), %rsi
        nopw    (%rax,%rax)
; β”‚β””
; β”‚β”Œ @ int.jl:53 within `+'
L224:
        addq    (%r8,%rdx,8), %rax
; β”‚β””
; β”‚β”Œ @ array.jl:720 within `iterate'
; β”‚β”‚β”Œ @ int.jl:430 within `<' @ int.jl:423
        addq    $1, %rsi
        movq    %rcx, %rdx
; β”‚β”‚β””
; β”‚β”‚β”Œ @ int.jl:53 within `+'
        leaq    1(%rcx), %rcx
; β”‚β””β””
; β”‚β”Œ @ int.jl:423 within `iterate'
        cmpq    %r9, %rsi
; β””β””
; β”Œ @ array.jl:720 within `testMe'
        jb      L224
; β””
; β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
L244:
        addq    $40, %rsp
        popq    %rsi
        popq    %rbp
        vzeroupper
        retq
        nop
; β””
julia> @code_native testMe(f)
        .text
; β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:4 within `testMe'
        pushq   %rbp
        movq    %rsp, %rbp
; β”‚ @ untitled-a318115c5de073a6de1a9c13afc76741:5 within `testMe'
; β”‚β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:1 within `f'
; β”‚β”‚β”Œ @ array.jl:130 within `vect'
; β”‚β”‚β”‚β”Œ @ array.jl:614 within `_array_for'
; β”‚β”‚β”‚β”‚β”Œ @ abstractarray.jl:671 within `similar' @ abstractarray.jl:672
; β”‚β”‚β”‚β”‚β”‚β”Œ @ boot.jl:413 within `Array' @ boot.jl:404
        pushq   %rsi
        subq    $40, %rsp
        movl    $jl_alloc_array_1d, %eax
        movl    $jl_system_image_data, %ecx
        movl    $2, %edx
        callq   *%rax
        movq    (%rax), %r8
; β”‚β”‚β”‚β””β””β””
; β”‚β”‚β”‚ @ array.jl:782 within `vect'
        movl    $1, %ecx
        vmovq   %rcx, %xmm0
        vpslldq $8, %xmm0, %xmm0        # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
        vmovdqu %xmm0, (%r8)
; β”‚β””β””
; β”‚β”Œ @ array.jl:720 within `iterate' @ array.jl:720
; β”‚β”‚β”Œ @ array.jl:216 within `length'
        movq    8(%rax), %r9
; β”‚β”‚β””
        cmpq    $2, %r9
        jge     L68
; β””β””
; β”Œ @ array.jl within `testMe'
        xorl    %eax, %eax
; β””
; β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
        addq    $40, %rsp
        popq    %rsi
        popq    %rbp
        retq
; β”‚ @ untitled-a318115c5de073a6de1a9c13afc76741:6 within `testMe'
; β”‚β”Œ @ array.jl:720 within `iterate'
L68:
        leaq    -1(%r9), %r10
        cmpq    $16, %r10
        jae     L92
        xorl    %eax, %eax
        movl    $2, %ecx
        movl    $1, %edx
        jmp     L214
; β”‚β”‚ @ array.jl:720 within `iterate'
L92:
        movq    %r10, %r11
        andq    $-16, %r11
        leaq    1(%r11), %rdx
        leaq    2(%r11), %rcx
        leaq    104(%r8), %rax
        vpxor   %xmm0, %xmm0, %xmm0
        movq    %r11, %rsi
        vpxor   %xmm1, %xmm1, %xmm1
        vpxor   %xmm2, %xmm2, %xmm2
        vpxor   %xmm3, %xmm3, %xmm3
        nopw    %cs:(%rax,%rax)
; β”‚β””
; β”‚β”Œ @ int.jl:53 within `+'
L144:
        vpaddq  -96(%rax), %ymm0, %ymm0
        vpaddq  -64(%rax), %ymm1, %ymm1
        vpaddq  -32(%rax), %ymm2, %ymm2
        vpaddq  (%rax), %ymm3, %ymm3
        subq    $-128, %rax
        addq    $-16, %rsi
        jne     L144
        vpaddq  %ymm0, %ymm1, %ymm0
        vpaddq  %ymm0, %ymm2, %ymm0
        vpaddq  %ymm0, %ymm3, %ymm0
        vextracti128    $1, %ymm0, %xmm1
        vpaddq  %ymm1, %ymm0, %ymm0
        vpshufd $78, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,1]
        vpaddq  %ymm1, %ymm0, %ymm0
        vmovq   %xmm0, %rax
        cmpq    %r11, %r10
; β”‚β””
; β”‚β”Œ @ array.jl:720 within `iterate'
        je      L244
L214:
        leaq    -1(%rcx), %rsi
        nopw    (%rax,%rax)
; β”‚β””
; β”‚β”Œ @ int.jl:53 within `+'
L224:
        addq    (%r8,%rdx,8), %rax
; β”‚β””
; β”‚β”Œ @ array.jl:720 within `iterate'
; β”‚β”‚β”Œ @ int.jl:430 within `<' @ int.jl:423
        addq    $1, %rsi
        movq    %rcx, %rdx
; β”‚β”‚β””
; β”‚β”‚β”Œ @ int.jl:53 within `+'
        leaq    1(%rcx), %rcx
; β”‚β””β””
; β”‚β”Œ @ int.jl:423 within `iterate'
        cmpq    %r9, %rsi
; β””β””
; β”Œ @ array.jl:720 within `testMe'
        jb      L224
; β””
; β”Œ @ untitled-a318115c5de073a6de1a9c13afc76741:8 within `testMe'
L244:
        addq    $40, %rsp
        popq    %rsi
        popq    %rbp
        vzeroupper
        retq
        nop
; β””

This is Julia 1.3, windows 10, Intel CPU

Why is Julia’s ASM output so much larger?

2 Likes

Fundamentally, large parts of julia don’t live in the JIT-llvm-world and are instead runtime library. This includes stuff like codegen, garbage collection, and array bookkeeping. The functions must be readily available from C, and must be available in a statically compiled version (for embedding julia).

Currently, these parts are implemented in C / C++, compiled to machine code, shipped to users who don’t grab sources and make, and then linked into the julia process.

It would be better to also compile to bitcode for parts of the runtime, and have a second version available that permits llvm to β€œsee through” parts of the runtime. LLVM is theoretically capable of cross-language IPO / LTO (interprocedural optimization / link-time optimization); but getting this to work properly is hard. (we suddenly have two versions of the library: One dynamically loaded ELF, and one bitcode that we can link into the JIT context; both may refer to global variables or external functions, and that needs to be properly merged. The super cool feature would be to teach ORCJIT to load ELF with embedded bitcode and then play pick-and-choose between implementations)

3 Likes

That part is not really relevant in this case. This is simply because the Julia compiler is not taught much about arrays. Such optimization should be possible without any change to the array implementation though the difficult and the coverage for such optimization is hard to tell without trying it out.

2 Likes

Making it a tuple instead…

f()= (0,1)

function testMe(F)
    r=0
    for x in F()
        r+=x
    end
    return r
end

@code_native debuginfo=:none testMe(f)

yields

#julia> @code_native debuginfo=:none testMe(f)
        .text
        movl    $1, %eax
        retq
        nopw    %cs:(%rax,%rax)
6 Likes

I’m not really sure what you want to show but it neither proof or disproof my point.

What I’m saying is, despite the complex internal structure and runtime handling of the array (which tuples lacks so the example given doesn’t proof my point), there are enough that the compiler could know about that they should be able to perform optimization on arrays even in the current implementation. We β€œjust” need better ways to pass on the information from the source code to the optimizer.

A closer example is optimization elission for other objects. The code that allocate such a structure on the heap is quite complex. However, as long as we can hide/abstract that complexity to the optimizer it is entirely possible to do such optimizations.

What are the optimizations that Swift is doing which Julia isn’t? Is it related to Swift’s ARC, return type declaration (don’t think so right?) or copy on write/ value array semantics?

None of above. Those are all implementation details that doesn’t matter at all. The optimization that swift has that julia doesn’t have is the optimization itself.

In fact, none of what you mentioned are optimizations anyway. They are properties of the language and none of them enables/disables such optimization.

That given a simpler structure Julia does generate similar to the Swift example (the call is inlined, and the loop and additions replaced with moving $1 into a register).
My comment wasn’t relevant to foobar’s, nor was it very relevant to your response, but I thought it may be relevant to the OP. I don’t know much about arrays in Swift. If they’re like llvm arrays (arrays in many languages are statically sized), then a homogenous Julia tuple is analogous, and then this (analogous inputs, analogous outputs) would be an answer to OP’s question on differences in asm generated.

That said,

This is all exciting, and would be a wonderful optimization to have. I am glad that β€œSuch optimization should be possible without any change to the array implementation” and β€œWe β€œjust” need better ways to pass on the information from the source code to the optimizer.”

Does there happen to be an issue on this? It seems like this would improve the performance of a lot of naively written code, and free us to create small arrays in functions without performance penalty.

4 Likes

OK, you quoted my reply which is why I was confused what you are trying to show. Yes, if you don’t need to use arrays then a tuple is fine. However, you shouldn’t need to change type in order to achieve that. An array is as fine as any other types. There shouldn’t be any need to introduce things like StaticArrays.

edit: and I mean no need for StaticArray for the purpose of allocation ellision. The Array itself still won’t be the fastest thing to allocate but that’s also an fairly orthogonal optimization, which now has everything to do with the runtime @foobar_lv2 mentioned but isn’t related to this thread.

1 Like

They are properties of the language and none of them enables/disables such optimization.

Right, I’m curious wondering if one of these semantic differences makes these optimizations easier/more predictable.

They affect what optimization can be done in which condition. Pretty much the only thing you need to worry about is escape analysis and it’s usually such that different semantics allows the same optimization in different cases.

For this specific case though, as long as F() is inlined there’s no challenge. (edit: i.e. it’s the easiest case. I’m not saying such optimization is easy to write in general…)

2 Likes

Shouldn’t it be f()::Vector{Int64}= [0,1]? It seems Swift type annotate the output

Adding constant prop for arrays like this would be quite easy, but would have significant compile time cost, so we don’t do that. If you want to manually opt into that kind of thing you can use StaticArrays (or tuples as was previously mentioned). We are planning a bit of an Array overhaul for 2.0, at which point Array may be closer to StaticArrays and this kind of optimization may happen, but for now, Array is just the wrong data structure for what you’re looking for here.

16 Likes

That’s interesting. Have any of the plans or discussions made their way to github, or somewhere else, for everyone to follow? (Just being curious.)

8 Likes

I’m going to guess it’s this https://github.com/JuliaLang/julia/pull/31630

4 Likes

Yes, something in that direction. It’s not currently an active work item though.

Curious, aside from time to first plot, what are the other compiler and also type system priorities?