Simple conditional loop

Hi folks, I have a question about ifs inside for loops. Suppose I have this function:

function c_cheating(high)
	total = 0
	high *= 2
	for i in 4:4:high
		total += i
	end
	total
end

Giving

@btime c_cheating(1_000_000)
# 1.811 ns (0 allocations: 0 bytes)
# 500001000000

And

function c_loop(high)
	total = 0
	for i in 1:high
        if i%2==0
            total += i*2
        end
	end
	total
end

Giving

@btime c_loop(1_000_000)
# 192.674 ΞΌs (0 allocations: 0 bytes)
# 500001000000

Also

@btime sum(4:4:(1_000_000*2))
# 0.022 ns (0 allocations: 0 bytes)
# 500001000000

How could I improve the last for loop, or something using generators as the later? I tried using Iterators.filter, but it was slower than c_loop.

I think the compiler has been too smart for the two fast benchmarks and have performed the computation at compile time/ replaced the loop with the answer. Timings on the order of 1ns or less should be taken with a large grain of salt.

7 Likes

You can check with @code_llvm or @code_native the generated assembly code to see if the compiler β€œcheated”.
In general 200 microseconds for 1M loop iterations is not too bad (if the compiler is not able to optimize the loop away), it is still more than 1 loop iteration per CPU cycle.

I don’t quite understand what’s happening here, but this is the output:

@code_native c_cheating(1_000_000)
	.text
; β”Œ @ In[1]:3 within `c_cheating'
; β”‚β”Œ @ In[1]:1 within `*'
	pushq	%rbx
; β”‚β””
; β”‚β”Œ @ int.jl:87 within `*'
	leaq	(%rdi,%rdi), %rdx
; β”‚β””
; β”‚ @ In[1]:4 within `c_cheating'
; β”‚β”Œ @ range.jl:22 within `Colon'
; β”‚β”‚β”Œ @ range.jl:24 within `_colon'
; β”‚β”‚β”‚β”Œ @ range.jl:256 within `StepRange' @ range.jl:205
	movabsq	$steprange_last, %rax
	movl	$4, %ebx
	movl	$4, %edi
	movl	$4, %esi
	callq	*%rax
; β”‚β””β””β””
; β”‚β”Œ @ range.jl:620 within `iterate'
; β”‚β”‚β”Œ @ range.jl:501 within `isempty'
; β”‚β”‚β”‚β”Œ @ bool.jl:40 within `&'
	cmpq	$4, %rax
; β”‚β””β””β””
	jl	L71
; β”‚ @ In[1]:5 within `c_cheating'
	negq	%rax
	xorl	%ecx, %ecx
	nopl	(%rax,%rax)
; β”‚β”Œ @ int.jl:86 within `+'
L48:
	addq	%rbx, %rcx
; β”‚β””
; β”‚β”Œ @ range.jl:624 within `iterate'
; β”‚β”‚β”Œ @ promotion.jl:398 within `=='
	leaq	(%rax,%rbx), %rdx
	addq	$4, %rdx
; β”‚β”‚β””
	addq	$4, %rbx
; β”‚β”‚β”Œ @ promotion.jl:398 within `=='
	cmpq	$4, %rdx
; β”‚β””β””
	jne	L48
	jmp	L73
L71:
	xorl	%ecx, %ecx
; β”‚ @ In[1]:7 within `c_cheating'
L73:
	movq	%rcx, %rax
	popq	%rbx
	retq
	nop
; β””
@code_native sum(4:4:(1_000_000*2))
	.text
; β”Œ @ range.jl:1022 within `sum'
	pushq	%rbx
	movq	%rdi, %rbx
; β”‚ @ range.jl:1023 within `sum'
	movabsq	$length, %rax
	callq	*%rax
	movq	(%rbx), %rcx
; β”‚ @ range.jl:1025 within `sum'
; β”‚β”Œ @ int.jl:87 within `*'
	imulq	%rax, %rcx
; β”‚β””
; β”‚β”Œ @ int.jl:85 within `-'
	leaq	-1(%rax), %rdx
; β”‚β””
	testb	$1, %al
	jne	L45
; β”‚β”Œ @ int.jl:461 within `>>' @ int.jl:454
	sarq	%rax
; β”‚β””
; β”‚β”Œ @ int.jl:87 within `*'
	imulq	%rdx, %rax
	imulq	8(%rbx), %rax
; β”‚β””
	jmp	L60
; β”‚β”Œ @ int.jl:461 within `>>' @ int.jl:454
L45:
	sarq	%rdx
; β”‚β””
; β”‚β”Œ @ int.jl:87 within `*'
	imulq	%rax, %rdx
	imulq	8(%rbx), %rdx
	movq	%rdx, %rax
; β”‚β””
; β”‚β”Œ @ int.jl:86 within `+'
L60:
	addq	%rcx, %rax
; β”‚β””
	popq	%rbx
	retq
	nopw	%cs:(%rax,%rax)
	nopl	(%rax,%rax)
; β””

And for

@code_native c_loop(1_000_000)

is just too big to place here.

In general for nanosecond timings, it is better to profile it using a setup code like this:

julia> @btime c_cheating(x) setup=(x=1_000_000)
  443.892 ΞΌs (0 allocations: 0 bytes)
500001000000

Compared with

julia> @btime c_loop(1_000_000)
  284.405 ΞΌs (0 allocations: 0 bytes)
500001000000
4 Likes