Benchmarking with interpolated variable faster than hardcoded literal value

Consistently observing that using an interpolated variable to iterate over for i = 1:n is faster than using a literal for i = 1:10. It’s about 7.5 ns vs 6 ns, so not much absolute difference, but somewhat significant relative difference. Maybe this is just an artifact of such small time intervals.

Mostly curious why this is happening. I’d expect the compiler to do best with literal / hardcoded values.

# Test fixtures: two integer vectors holding 1..n and a zeroed output buffer.
n = 10
a1 = collect(1:n)
a2 = copy(a1)
b = zeros(Int, n)

# Elementwise product of `a1` and `a2` written into `b` over the hard-coded
# index range 1:10. Bounds checks are disabled, so all three vectors must
# have at least 10 elements.
function const_range(a1, a2, b)
    for idx in 1:10
        @inbounds b[idx] = a1[idx] * a2[idx]
    end
end

# Elementwise product of `a1` and `a2` written into `b` for indices 1:n.
# Bounds checks are disabled; the caller must guarantee that n does not
# exceed the length of any of the three vectors.
function var_range(a1, a2, b, n)
    for idx in 1:n
        @inbounds b[idx] = a1[idx] * a2[idx]
    end
end

julia> @benchmark const_range($a1, $a2, $b)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     7.381 ns (0.00% GC)
  median time:      7.661 ns (0.00% GC)
  mean time:        7.884 ns (0.00% GC)
  maximum time:     38.938 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> @benchmark var_range($a1, $a2, $b, $n)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     6.044 ns (0.00% GC)
  median time:      6.074 ns (0.00% GC)
  mean time:        6.367 ns (0.00% GC)
  maximum time:     38.083 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999
1 Like

Interesting. I can reproduce.

I wonder what is happening under the hood.
Also interesting that

# Same elementwise product, but the iteration count is taken from
# length(b); `a1` and `a2` must be at least as long as `b` since
# bounds checks are skipped.
function var_range2(a1, a2, b)
    for idx in 1:length(b)
        @inbounds b[idx] = a1[idx] * a2[idx]
    end
end

is slower. As is passing in a reference to n.

julia> rn = Ref(n);

julia> @benchmark var_range($a1, $a2, $b, ($rn)[])
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     8.060 ns (0.00% GC)
  median time:      8.202 ns (0.00% GC)
  mean time:        8.496 ns (0.00% GC)
  maximum time:     34.533 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> @benchmark var_range2($a1, $a2, $b)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     6.585 ns (0.00% GC)
  median time:      6.785 ns (0.00% GC)
  mean time:        6.899 ns (0.00% GC)
  maximum time:     23.647 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

Your two originals give me:

julia> @benchmark const_range($a1, $a2, $b)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     5.759 ns (0.00% GC)
  median time:      5.952 ns (0.00% GC)
  mean time:        6.196 ns (0.00% GC)
  maximum time:     23.878 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

julia> @benchmark var_range($a1, $a2, $b, $n)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.692 ns (0.00% GC)
  median time:      5.369 ns (0.00% GC)
  mean time:        5.417 ns (0.00% GC)
  maximum time:     22.348 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

I am not sure @code_native helps us here, because

const_range2(a1, a2, b) = var_range(a1, a2, b, 10)

gives us the exact same assembly (and benchmark performance) as const_range. Meaning I don’t know what the benchmark is really benchmarking; is it able to optimize based on the constant n?

For what it’s worth, the assembly shows that const_range is completely unrolled, while var_range is a for loop:

julia> @code_native debuginfo=:none const_range(a1, a2, b)
	.text
	movq	%rsi, -8(%rsp)
	movq	(%rsi), %rax
	movq	8(%rsi), %rcx
	movq	16(%rsi), %rsi
	movq	(%rax), %rax
	movq	(%rcx), %rdx
	movq	(%rsi), %rcx
	movq	(%rdx), %rsi
	imulq	(%rax), %rsi
	movq	%rsi, (%rcx)
	movq	8(%rdx), %rsi
	imulq	8(%rax), %rsi
	movq	%rsi, 8(%rcx)
	movq	16(%rdx), %rsi
	imulq	16(%rax), %rsi
	movq	%rsi, 16(%rcx)
	movq	24(%rdx), %rsi
	imulq	24(%rax), %rsi
	movq	%rsi, 24(%rcx)
	movq	32(%rdx), %rsi
	imulq	32(%rax), %rsi
	movq	%rsi, 32(%rcx)
	movq	40(%rdx), %rsi
	imulq	40(%rax), %rsi
	movq	%rsi, 40(%rcx)
	movq	48(%rdx), %rsi
	imulq	48(%rax), %rsi
	movq	%rsi, 48(%rcx)
	movq	56(%rdx), %rsi
	imulq	56(%rax), %rsi
	movq	%rsi, 56(%rcx)
	movq	64(%rdx), %rsi
	imulq	64(%rax), %rsi
	movq	%rsi, 64(%rcx)
	movq	72(%rdx), %rdx
	imulq	72(%rax), %rdx
	movq	%rdx, 72(%rcx)
	movabsq	$jl_system_image_data, %rax
	retq

julia> @code_native debuginfo=:none var_range(a1, a2, b, 10)
	.text
	pushq	%rbx
	testq	%rcx, %rcx
	jle	L246
	movq	(%rdi), %r9
	movq	(%rsi), %rsi
	movq	(%rdx), %rdx
	movl	$1, %edi
	cmpq	$32, %rcx
	jb	L219
	leaq	(%rdx,%rcx,8), %rax
	leaq	(%r9,%rcx,8), %r8
	leaq	(%rsi,%rcx,8), %r10
	cmpq	%r8, %rdx
	setb	%r11b
	cmpq	%rax, %r9
	setb	%bl
	cmpq	%r10, %rdx
	setb	%r10b
	cmpq	%rax, %rsi
	setb	%r8b
	testb	%bl, %r11b
	jne	L219
	andb	%r8b, %r10b
	jne	L219
	movq	%rcx, %r8
	andq	$-32, %r8
	leaq	1(%r8), %rdi
	xorl	%eax, %eax
	nopl	(%rax,%rax)
L112:
	vmovdqu64	(%rsi,%rax,8), %zmm0
	vmovdqu64	64(%rsi,%rax,8), %zmm1
	vmovdqu64	128(%rsi,%rax,8), %zmm2
	vmovdqu64	192(%rsi,%rax,8), %zmm3
	vpmullq	(%r9,%rax,8), %zmm0, %zmm0
	vpmullq	64(%r9,%rax,8), %zmm1, %zmm1
	vpmullq	128(%r9,%rax,8), %zmm2, %zmm2
	vpmullq	192(%r9,%rax,8), %zmm3, %zmm3
	vmovdqu64	%zmm0, (%rdx,%rax,8)
	vmovdqu64	%zmm1, 64(%rdx,%rax,8)
	vmovdqu64	%zmm2, 128(%rdx,%rax,8)
	vmovdqu64	%zmm3, 192(%rdx,%rax,8)
	addq	$32, %rax
	cmpq	%rax, %r8
	jne	L112
	cmpq	%rcx, %r8
	je	L246
L219:
	addq	$-1, %rdi
	nop
L224:
	movq	(%rsi,%rdi,8), %rax
	imulq	(%r9,%rdi,8), %rax
	movq	%rax, (%rdx,%rdi,8)
	addq	$1, %rdi
	cmpq	%rdi, %rcx
	jne	L224
L246:
	popq	%rbx
	vzeroupper
	retq
	nopl	(%rax,%rax)

One last set of benchmarks I tried: repeating these in a loop. We have var range, where the function is passed a reference, var range where it is passed an integer, and finally const range. The function is called 1000 times in a loop.

julia> function repeat_loop_var_range(a1, a2, b, rn)
           n = rn[]
           for i in 1:1000
               var_range(a1, a2, b, n)
           end
       end
repeat_loop_var_range (generic function with 1 method)

julia> function repeat_loop_var_range2(a1, a2, b, n)
           for i in 1:1000
               var_range(a1, a2, b, n)
           end
       end
repeat_loop_var_range2 (generic function with 1 method)

julia> function repeat_loop_const_range(a1, a2, b)
           for i in 1:1000
               const_range(a1, a2, b)
           end
       end
repeat_loop_const_range (generic function with 1 method)

Results:

julia> @benchmark repeat_loop_var_range($a1, $a2, $b, $rn)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.925 μs (0.00% GC)
  median time:      4.928 μs (0.00% GC)
  mean time:        4.994 μs (0.00% GC)
  maximum time:     9.918 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     7

julia> @benchmark repeat_loop_var_range2($a1, $a2, $b, $n)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.583 μs (0.00% GC)
  median time:      3.645 μs (0.00% GC)
  mean time:        3.714 μs (0.00% GC)
  maximum time:     75.997 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     8

julia> @benchmark repeat_loop_const_range($a1, $a2, $b)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.259 μs (0.00% GC)
  median time:      2.265 μs (0.00% GC)
  mean time:        2.313 μs (0.00% GC)
  maximum time:     4.581 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

It definitely looks like it is optimizing on the value of n. The assembly for both var_range versions was practically identical, while the runtime performance was not.
Here, inside the loop, const_range performed the best. Its assembly still showed const_range was unrolled (and inlined), and repeated 1000x in a loop.

3 Likes