Iterate with 1:N is faster than 1:length(A)

Hello, I am about to benchmark different ways of iterate over an array and I have realized a strange thing (I am on julia 1.5.1)

The following test:

julia> using BenchmarkTools

julia> function N_iterator(A, N) 
               for i ∈ 1:N
                       A[i] = i  
               end
       end
 N_iterator (generic function with 1 method)

julia> function length_iterator(A) 
               for i ∈ 1:length(A)
                       A[i] = i  
               end
       end
 length_iterator (generic function with 1 method)

julia> N = 100_000
 100000

julia> const A = fill(1, N);

julia> @btime N_iterator(A, N)
  45.661 ΞΌs (0 allocations: 0 bytes)

julia> @btime length_iterator(A)
  68.741 ΞΌs (0 allocations: 0 bytes)

shows that passing an integer as N is faster than calculate it from the length. Why is it so? I mean, the compiler could see that 1:length(A) is a valid index but cannot see N = length(A) by definition…

On my machine they take the same time (also v1.5.1)…

julia> @btime N_iterator(A, N)
  64.983 ΞΌs (0 allocations: 0 bytes)

julia> @btime length_iterator(A)

  64.983 ΞΌs (0 allocations: 0 bytes)

(I also checked that $ doesn’t make a difference in this case to be sure)

Ok, so it could be an artifact of time measurement?

However, the machine code looks different in both cases:

julia> @code_native N_iterator(A, N)
        .text
; β”Œ @ eachindex.jl:3 within `N_iterator'
        pushq   %rbp
        movq    %rsp, %rbp
; β”‚ @ eachindex.jl:4 within `N_iterator'
; β”‚β”Œ @ range.jl:5 within `Colon'
; β”‚β”‚β”Œ @ range.jl:280 within `UnitRange'
; β”‚β”‚β”‚β”Œ @ range.jl:285 within `unitrange_last'
; β”‚β”‚β”‚β”‚β”Œ @ operators.jl:350 within `>='
; β”‚β”‚β”‚β”‚β”‚β”Œ @ int.jl:441 within `<='
        testq   %rsi, %rsi
; β”‚β””β””β””β””β””
        jle     L80
        movq    %rsi, %rax
        sarq    $63, %rax
        andnq   %rsi, %rax, %rcx
        movq    (%rdi), %r9
        movq    8(%rdi), %rsi
; β”‚ @ eachindex.jl:5 within `N_iterator'
; β”‚β”Œ @ array.jl:847 within `setindex!'
        leaq    1(%rsi), %r8
        negq    %rsi
        negq    %rcx
        movl    $1, %eax
        nopl    (%rax,%rax)
L48:
        leaq    (%rsi,%rax), %rdx
        cmpq    $1, %rdx
        je      L85
        movq    %rax, -8(%r9,%rax,8)
; β”‚β””
; β”‚β”Œ @ range.jl:624 within `iterate'
; β”‚β”‚β”Œ @ promotion.jl:398 within `=='
        leaq    (%rcx,%rax), %rdx
        addq    $1, %rdx
; β”‚β”‚β””
        incq    %rax
; β”‚β”‚β”Œ @ promotion.jl:398 within `=='
        cmpq    $1, %rdx
; β”‚β””β””
        jne     L48
L80:
        movq    %rbp, %rsp
        popq    %rbp
        retq
; β”‚β”Œ @ array.jl:847 within `setindex!'
L85:
        movq    %rsp, %rax
        leaq    -16(%rax), %rsi
        movq    %rsi, %rsp
        movq    %r8, -16(%rax)
        movabsq $jl_bounds_error_ints, %rax
        movl    $1, %edx
        callq   *%rax
        nopw    %cs:(%rax,%rax)
        nop
; β””β””

julia> @code_native length_iterator(A)
        .text
; β”Œ @ eachindex.jl:9 within `length_iterator'
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $16, %rsp
        movq    %rsi, -8(%rbp)
        movq    (%rsi), %rdi
; β”‚ @ eachindex.jl:10 within `length_iterator'
; β”‚β”Œ @ array.jl:219 within `length'
        movq    8(%rdi), %rax
; β”‚β””
; β”‚β”Œ @ range.jl:5 within `Colon'
; β”‚β”‚β”Œ @ range.jl:280 within `UnitRange'
; β”‚β”‚β”‚β”Œ @ range.jl:285 within `unitrange_last'
; β”‚β”‚β”‚β”‚β”Œ @ operators.jl:350 within `>='
; β”‚β”‚β”‚β”‚β”‚β”Œ @ int.jl:441 within `<='
        testq   %rax, %rax
; β”‚β””β””β””β””β””
        jle     L96
        movq    %rax, %rcx
        sarq    $63, %rcx
        andnq   %rax, %rcx, %rdx
        movq    (%rdi), %r9
; β”‚ @ eachindex.jl:11 within `length_iterator'
; β”‚β”Œ @ array.jl:847 within `setindex!'
        leaq    1(%rax), %r8
        negq    %rax
        negq    %rdx
        movl    $1, %ecx
        nopw    %cs:(%rax,%rax)
L64:
        leaq    (%rax,%rcx), %rsi
        cmpq    $1, %rsi
        je      L111
        movq    %rcx, -8(%r9,%rcx,8)
; β”‚β””
; β”‚β”Œ @ range.jl:624 within `iterate'
; β”‚β”‚β”Œ @ promotion.jl:398 within `=='
        leaq    (%rdx,%rcx), %rsi
        addq    $1, %rsi
; β”‚β”‚β””
        incq    %rcx
; β”‚β”‚β”Œ @ promotion.jl:398 within `=='
        cmpq    $1, %rsi
; β”‚β””β””
        jne     L64
L96:
        movabsq $jl_system_image_data, %rax
; β”‚ @ eachindex.jl:11 within `length_iterator'
        movq    %rbp, %rsp
        popq    %rbp
        retq
; β”‚β”Œ @ array.jl:847 within `setindex!'
L111:
        movq    %rsp, %rax
        leaq    -16(%rax), %rsi
        movq    %rsi, %rsp
        movq    %r8, -16(%rax)
        movabsq $jl_bounds_error_ints, %rax
        movl    $1, %edx
        callq   *%rax
; β””β””

Here’s a fun experiment:

function N_iterator(A, N) 
    for i ∈ 1:N
            A[i] = i  
    end
end

function length_iterator(A) 
    for i ∈ 1:length(A)
            A[i] = i  
    end
end
function Nlength_iterator(A) 
    n = (sizeof(A)/8) |> Int
    for i ∈ 1:n
        A[i] = i  
    end
end

N = 100000

const A = fill(1, N);
using BenchmarkTools
@btime N_iterator(A, N)
# 25.793 ΞΌs (0 allocations: 0 bytes)
@btime length_iterator(A)
#  35.018 ΞΌs (0 allocations: 0 bytes)
@btime Nlength_iterator(A)
#  25.746 ΞΌs (0 allocations: 0 bytes)

Julia 1.4.2 I think

Seems like this should use div instead.

Also, why is everyone refusing to interpolate variables in the benchmarks? Sure, it often doesn’t make a difference, but it also often does. Isn’t it just good practice?

7 Likes

Actually, I have done so now, and the difference gone away…

3 Likes

I see your definition of often is smaller than or equal to 50% of the cases.

3 Likes

Was the issue variable interpolation or defining A as a constant IE: const A = ... in this instance?

I wasn’t able to reproduce a speed difference with any combination of interpolating, const, etc.
It’s a bit disconcerting seeing a ~30% speed hit with no explanation.

1 Like

Actually, my definition is β€˜greater than or equal to some number smaller than 50%’.

1 Like

In this case, it could mean a hardcoding of N in the inner loop (if it inlines). I’d suspect the bigger difference is just happenstance with funny/lucky/unlucky caching behaviors. The difference in the machine code the OP posted is essentially just the length call itself β€” but that’s not the code BenchmarkTools is actually running. What’s running is more akin to:

f() = length_iterator(A)
g() = N_iterator(A, N)

When you flag the arguments with a $, then you actually test what you want.

1 Like

This is a very simple mental model I wish I had stumbled before and I will shamelessly steal it.