Why does @simd here make the loop run faster even if the while version has shorter generated assembly? (Thiscode is part of a prime sieve implementation.)
using BenchmarkTools
# Auxillary functions
begin
const _uint_bit_length = sizeof(UInt) * 8
const _div_uint_size_shift = Int(log2(_uint_bit_length))
@inline _mul2(i::Integer) = i << 1
@inline _div2(i::Integer) = i >> 1
@inline _map_to_index(i::Integer) = _div2(i - 1)
@inline _map_to_factor(i::Integer) = _mul2(i) + 1
@inline _mod_uint_size(i::Integer) = i & (_uint_bit_length - 1)
@inline _div_uint_size(i::Integer) = i >> _div_uint_size_shift
@inline _get_chunk_index(i::Integer) = _div_uint_size(i + (_uint_bit_length - 1))
@inline _get_bit_index_mask(i::Integer) = UInt(1) << _mod_uint_size(i - 1)
end
# Main code
function clear_factors_while!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
factor = _map_to_factor(factor_index)
index = _div2(factor * factor)
while index <= max_index
@inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
index += factor
end
return arr
end
function clear_factors_simd!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
factor = _map_to_factor(factor_index)
@simd for index in _div2(factor * factor):factor:max_index
@inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
end
return arr
end
println(
clear_factors_while!(zeros(UInt, cld(500_000, _uint_bit_length)), 1, 500_000) ==
clear_factors_simd!(zeros(UInt, cld(500_000, _uint_bit_length)), 1, 500_000)
)
const x = zeros(UInt, cld(500_000, sizeof(UInt) * 8))
@benchmark clear_factors_while!(x, 1, 500_000)
@benchmark clear_factors_simd!(x, 1, 500_000)
Benchmark results:
julia> @benchmark clear_factors_while!(x, 1, 500_000)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min β¦ max): 133.000 ΞΌs β¦ 3.780 ms β GC (min β¦ max): 0.00% β¦ 0.00%
Time (median): 141.100 ΞΌs β GC (median): 0.00%
Time (mean Β± Ο): 142.086 ΞΌs Β± 40.572 ΞΌs β GC (mean Β± Ο): 0.00% Β± 0.00%
ββ ββββββββββ β
βββββββββββββββββββββββββββ
ββ
β
ββ
βββ
β
β
ββ
ββ
β
β
β
β
ββββββββββββ
βββ
β
133 ΞΌs Histogram: log(frequency) by time 221 ΞΌs <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark clear_factors_simd!(x, 1, 500_000)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min β¦ max): 133.100 ΞΌs β¦ 1.549 ms β GC (min β¦ max): 0.00% β¦ 0.00%
Time (median): 138.600 ΞΌs β GC (median): 0.00%
Time (mean Β± Ο): 141.710 ΞΌs Β± 30.068 ΞΌs β GC (mean Β± Ο): 0.00% Β± 0.00%
β ββββββββββ β
βββββββββββββββββββββββββ
β
β
β
βββββββ
β
ββββββ
ββ
ββββββ
βββ
ββ
ββ
β
β
β
β
133 ΞΌs Histogram: log(frequency) by time 234 ΞΌs <
Memory estimate: 0 bytes, allocs estimate: 0.
And, the generated assembly:
julia> @code_native clear_factors_simd!(x, 1, 500_000)
.text
; β @ REPL[4]:1 within `clear_factors_simd!'
pushq %r15
pushq %r14
pushq %rbx
subq $32, %rsp
movq %rdi, %r14
; β @ REPL[4]:2 within `clear_factors_simd!'
; ββ @ REPL[2]:8 within `_map_to_factor'
; βββ @ int.jl:87 within `+'
leaq (%rsi,%rsi), %r15
incq %r15
; βββ
; β @ REPL[4]:3 within `clear_factors_simd!'
; ββ @ simdloop.jl:69 within `macro expansion'
; βββ @ int.jl:88 within `*'
movq %r15, %rbx
imulq %r15, %rbx
; βββ
; βββ @ REPL[2]:6 within `_div2'
; ββββ @ int.jl:462 within `>>' @ int.jl:455
sarq %rbx
; ββββ
; βββ @ range.jl:22 within `Colon'
; ββββ @ range.jl:24 within `_colon'
; βββββ @ range.jl:263 within `StepRange' @ range.jl:208
movabsq $steprange_last, %rax
movq %rbx, %rdi
movq %r15, %rsi
callq *%rax
movq %rbx, 8(%rsp)
movq %r15, 16(%rsp)
movq %rax, 24(%rsp)
; βββββ
; ββ @ simdloop.jl:71 within `macro expansion'
; βββ @ simdloop.jl:51 within `simd_inner_length'
movabsq $length, %rax
leaq 8(%rsp), %rdi
callq *%rax
; βββ
; ββ @ simdloop.jl:72 within `macro expansion'
; βββ @ int.jl:83 within `<'
testq %rax, %rax
; βββ
jle L121
; ββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[4]:4
; βββ @ array.jl within `getindex'
movq (%r14), %rcx
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
addq $63, %rbx
movl $1, %edx
; ββ
; ββ @ simdloop.jl:77 within `macro expansion' @ REPL[4]:4
; βββ @ REPL[2]:11 within `_get_chunk_index'
; ββββ @ REPL[2]:10 within `_div_uint_size'
; βββββ @ int.jl:462 within `>>' @ int.jl:455
L96:
movq %rbx, %rsi
sarq $6, %rsi
; βββββ
; βββ @ REPL[2]:12 within `_get_bit_index_mask'
; ββββ @ int.jl:464 within `<<' @ int.jl:457
shlxq %rbx, %rdx, %rdi
; ββββ
; βββ @ array.jl:839 within `setindex!'
orq %rdi, -8(%rcx,%rsi,8)
; βββ
; ββ @ simdloop.jl:75 within `macro expansion'
; βββ @ int.jl:83 within `<'
addq %r15, %rbx
decq %rax
; βββ
jne L96
; ββ
; β @ REPL[4]:6 within `clear_factors_simd!'
L121:
movq %r14, %rax
addq $32, %rsp
popq %rbx
popq %r14
popq %r15
retq
nopw %cs:(%rax,%rax)
; β
julia> @code_native clear_factors_while!(x, 1, 500_000)
.text
; β @ REPL[3]:2 within `clear_factors_while!'
movq %rdi, %rax
; β @ REPL[3]:3 within `clear_factors_while!'
; ββ @ REPL[2]:8 within `_map_to_factor'
; βββ @ int.jl:87 within `+'
leaq (%rsi,%rsi), %r9
incq %r9
; βββ
; β @ REPL[3]:4 within `clear_factors_while!'
; ββ @ int.jl:88 within `*'
movq %r9, %rsi
imulq %r9, %rsi
; ββ
; ββ @ REPL[2]:6 within `_div2'
; βββ @ int.jl:462 within `>>' @ int.jl:455
sarq %rsi
; βββ
; β @ REPL[3]:5 within `clear_factors_while!'
; ββ @ int.jl:442 within `<='
cmpq %rdx, %rsi
; ββ
jg L74
; β @ REPL[3]:6 within `clear_factors_while!'
; ββ @ array.jl within `getindex'
movq (%rax), %r10
movl $1, %r8d
nopw %cs:(%rax,%rax)
; ββ
; ββ @ REPL[2]:11 within `_get_chunk_index'
; βββ @ int.jl:87 within `+'
L48:
leaq 63(%rsi), %rcx
; βββ
; ββ @ REPL[2]:12 within `_get_bit_index_mask'
; βββ @ int.jl:464 within `<<' @ int.jl:457
shlxq %rcx, %r8, %rdi
; βββ
; ββ @ REPL[2]:11 within `_get_chunk_index'
; βββ @ REPL[2]:10 within `_div_uint_size'
; ββββ @ int.jl:462 within `>>' @ int.jl:455
sarq $6, %rcx
; ββββ
; ββ @ array.jl:839 within `setindex!'
orq %rdi, -8(%r10,%rcx,8)
; ββ
; β @ REPL[3]:7 within `clear_factors_while!'
; ββ @ int.jl:87 within `+'
addq %r9, %rsi
; ββ
; β @ REPL[3]:5 within `clear_factors_while!'
; ββ @ int.jl:442 within `<='
cmpq %rdx, %rsi
; ββ
jle L48
; β @ REPL[3]:9 within `clear_factors_while!'
L74:
retq
nopl (%rax,%rax)
; β