Why is this @simd loop faster than a while loop even if it has longer assembly?

Why does @simd here make the loop run faster even if the while version has shorter generated assembly? (Thiscode is part of a prime sieve implementation.)

using BenchmarkTools

# Auxillary functions
begin
const _uint_bit_length = sizeof(UInt) * 8
const _div_uint_size_shift = Int(log2(_uint_bit_length))
@inline _mul2(i::Integer) = i << 1
@inline _div2(i::Integer) = i >> 1
@inline _map_to_index(i::Integer) = _div2(i - 1)
@inline _map_to_factor(i::Integer) = _mul2(i) + 1
@inline _mod_uint_size(i::Integer) = i & (_uint_bit_length - 1)
@inline _div_uint_size(i::Integer) = i >> _div_uint_size_shift
@inline _get_chunk_index(i::Integer) = _div_uint_size(i + (_uint_bit_length - 1))
@inline _get_bit_index_mask(i::Integer) = UInt(1) << _mod_uint_size(i - 1)
end

# Main code
function clear_factors_while!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
    factor = _map_to_factor(factor_index)
    index = _div2(factor * factor)
    while index <= max_index
        @inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
        index += factor
    end
    return arr
end

function clear_factors_simd!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
    factor = _map_to_factor(factor_index)
    @simd for index in _div2(factor * factor):factor:max_index
        @inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
    end
    return arr
end

println(
    clear_factors_while!(zeros(UInt, cld(500_000, _uint_bit_length)), 1, 500_000) ==
    clear_factors_simd!(zeros(UInt, cld(500_000, _uint_bit_length)), 1, 500_000)
)

const x = zeros(UInt, cld(500_000, sizeof(UInt) * 8))
@benchmark clear_factors_while!(x, 1, 500_000)
@benchmark clear_factors_simd!(x, 1, 500_000)

Benchmark results:

julia> @benchmark clear_factors_while!(x, 1, 500_000)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  133.000 ΞΌs …  3.780 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     141.100 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   142.086 ΞΌs Β± 40.572 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

  β–ˆβ–‡  β–β–ˆβ–†β–ƒβ–ƒβ–ƒβ–ƒβ–„β–‚β–‚                                               β–‚
  β–ˆβ–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–‡β–‡β–‡β–‡β–†β–†β–†β–‡β–…β–„β–…β–…β–†β–…β–„β–„β–…β–…β–…β–„β–…β–„β–…β–…β–…β–…β–…β–„β–†β–†β–„β–„β–„β–„β–„β–β–„β–„β–…β–„β–„β–… β–ˆ
  133 ΞΌs        Histogram: log(frequency) by time       221 ΞΌs <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark clear_factors_simd!(x, 1, 500_000)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  133.100 ΞΌs …  1.549 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     138.600 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   141.710 ΞΌs Β± 30.068 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

  β–ˆ β–β–ˆβ–β–‚β–ƒβ–ƒβ–„β–ƒβ–‚β–                                                 ▁
  β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–‡β–†β–‡β–†β–‡β–†β–†β–„β–†β–„β–…β–…β–…β–…β–„β–„β–„β–„β–„β–„β–…β–…β–„β–„β–„β–„β–„β–…β–„β–…β–„β–„β–†β–„β–†β–…β–„β–„β–…β–†β–…β–„β–…β–…β–…β–… β–ˆ
  133 ΞΌs        Histogram: log(frequency) by time       234 ΞΌs <

 Memory estimate: 0 bytes, allocs estimate: 0.

And, the generated assembly:

julia> @code_native clear_factors_simd!(x, 1, 500_000)
        .text
; β”Œ @ REPL[4]:1 within `clear_factors_simd!'
        pushq   %r15
        pushq   %r14
        pushq   %rbx
        subq    $32, %rsp
        movq    %rdi, %r14
; β”‚ @ REPL[4]:2 within `clear_factors_simd!'
; β”‚β”Œ @ REPL[2]:8 within `_map_to_factor'
; β”‚β”‚β”Œ @ int.jl:87 within `+'
        leaq    (%rsi,%rsi), %r15
        incq    %r15
; β”‚β””β””
; β”‚ @ REPL[4]:3 within `clear_factors_simd!'
; β”‚β”Œ @ simdloop.jl:69 within `macro expansion'
; β”‚β”‚β”Œ @ int.jl:88 within `*'
        movq    %r15, %rbx
        imulq   %r15, %rbx
; β”‚β”‚β””
; β”‚β”‚β”Œ @ REPL[2]:6 within `_div2'
; β”‚β”‚β”‚β”Œ @ int.jl:462 within `>>' @ int.jl:455
        sarq    %rbx
; β”‚β”‚β””β””
; β”‚β”‚β”Œ @ range.jl:22 within `Colon'
; β”‚β”‚β”‚β”Œ @ range.jl:24 within `_colon'
; β”‚β”‚β”‚β”‚β”Œ @ range.jl:263 within `StepRange' @ range.jl:208
        movabsq $steprange_last, %rax
        movq    %rbx, %rdi
        movq    %r15, %rsi
        callq   *%rax
        movq    %rbx, 8(%rsp)
        movq    %r15, 16(%rsp)
        movq    %rax, 24(%rsp)
; β”‚β””β””β””β””
; β”‚β”Œ @ simdloop.jl:71 within `macro expansion'
; β”‚β”‚β”Œ @ simdloop.jl:51 within `simd_inner_length'
        movabsq $length, %rax
        leaq    8(%rsp), %rdi
        callq   *%rax
; β”‚β””β””
; β”‚β”Œ @ simdloop.jl:72 within `macro expansion'
; β”‚β”‚β”Œ @ int.jl:83 within `<'
        testq   %rax, %rax
; β”‚β”‚β””
        jle     L121
; β”‚β””
; β”‚β”Œ @ simdloop.jl:77 within `macro expansion' @ REPL[4]:4
; β”‚β”‚β”Œ @ array.jl within `getindex'
        movq    (%r14), %rcx
; β”‚β””β””
; β”‚β”Œ @ simdloop.jl:75 within `macro expansion'
        addq    $63, %rbx
        movl    $1, %edx
; β”‚β””
; β”‚β”Œ @ simdloop.jl:77 within `macro expansion' @ REPL[4]:4
; β”‚β”‚β”Œ @ REPL[2]:11 within `_get_chunk_index'
; β”‚β”‚β”‚β”Œ @ REPL[2]:10 within `_div_uint_size'
; β”‚β”‚β”‚β”‚β”Œ @ int.jl:462 within `>>' @ int.jl:455
L96:
        movq    %rbx, %rsi
        sarq    $6, %rsi
; β”‚β”‚β””β””β””
; β”‚β”‚β”Œ @ REPL[2]:12 within `_get_bit_index_mask'
; β”‚β”‚β”‚β”Œ @ int.jl:464 within `<<' @ int.jl:457
        shlxq   %rbx, %rdx, %rdi
; β”‚β”‚β””β””
; β”‚β”‚β”Œ @ array.jl:839 within `setindex!'
        orq     %rdi, -8(%rcx,%rsi,8)
; β”‚β””β””
; β”‚β”Œ @ simdloop.jl:75 within `macro expansion'
; β”‚β”‚β”Œ @ int.jl:83 within `<'
        addq    %r15, %rbx
        decq    %rax
; β”‚β”‚β””
        jne     L96
; β”‚β””
; β”‚ @ REPL[4]:6 within `clear_factors_simd!'
L121:
        movq    %r14, %rax
        addq    $32, %rsp
        popq    %rbx
        popq    %r14
        popq    %r15
        retq
        nopw    %cs:(%rax,%rax)
; β””

julia> @code_native clear_factors_while!(x, 1, 500_000)
        .text
; β”Œ @ REPL[3]:2 within `clear_factors_while!'
        movq    %rdi, %rax
; β”‚ @ REPL[3]:3 within `clear_factors_while!'
; β”‚β”Œ @ REPL[2]:8 within `_map_to_factor'
; β”‚β”‚β”Œ @ int.jl:87 within `+'
        leaq    (%rsi,%rsi), %r9
        incq    %r9
; β”‚β””β””
; β”‚ @ REPL[3]:4 within `clear_factors_while!'
; β”‚β”Œ @ int.jl:88 within `*'
        movq    %r9, %rsi
        imulq   %r9, %rsi
; β”‚β””
; β”‚β”Œ @ REPL[2]:6 within `_div2'
; β”‚β”‚β”Œ @ int.jl:462 within `>>' @ int.jl:455
        sarq    %rsi
; β”‚β””β””
; β”‚ @ REPL[3]:5 within `clear_factors_while!'
; β”‚β”Œ @ int.jl:442 within `<='
        cmpq    %rdx, %rsi
; β”‚β””
        jg      L74
; β”‚ @ REPL[3]:6 within `clear_factors_while!'
; β”‚β”Œ @ array.jl within `getindex'
        movq    (%rax), %r10
        movl    $1, %r8d
        nopw    %cs:(%rax,%rax)
; β”‚β””
; β”‚β”Œ @ REPL[2]:11 within `_get_chunk_index'
; β”‚β”‚β”Œ @ int.jl:87 within `+'
L48:
        leaq    63(%rsi), %rcx
; β”‚β””β””
; β”‚β”Œ @ REPL[2]:12 within `_get_bit_index_mask'
; β”‚β”‚β”Œ @ int.jl:464 within `<<' @ int.jl:457
        shlxq   %rcx, %r8, %rdi
; β”‚β””β””
; β”‚β”Œ @ REPL[2]:11 within `_get_chunk_index'
; β”‚β”‚β”Œ @ REPL[2]:10 within `_div_uint_size'
; β”‚β”‚β”‚β”Œ @ int.jl:462 within `>>' @ int.jl:455
        sarq    $6, %rcx
; β”‚β””β””β””
; β”‚β”Œ @ array.jl:839 within `setindex!'
        orq     %rdi, -8(%r10,%rcx,8)
; β”‚β””
; β”‚ @ REPL[3]:7 within `clear_factors_while!'
; β”‚β”Œ @ int.jl:87 within `+'
        addq    %r9, %rsi
; β”‚β””
; β”‚ @ REPL[3]:5 within `clear_factors_while!'
; β”‚β”Œ @ int.jl:442 within `<='
        cmpq    %rdx, %rsi
; β”‚β””
        jle     L48
; β”‚ @ REPL[3]:9 within `clear_factors_while!'
L74:
        retq
        nopl    (%rax,%rax)
; β””

you have a loop. Absolute length of the assembly is not so meaningful.

Btw to me it seems they are about same speed. (I only look at min time for this kind of tight loop benchmark)

1 Like

It seems like a statistically insignificant difference, but in the context of the entire prime sieve, it gives a significant speed difference.

I’ve simulated the prime sieve below by clearing factor indices from 1 to 500.

function clear_many_factors_simd!(arr::Vector{UInt}, max_index::Integer)
    for i in 1:500
        clear_factors_simd!(arr, i, max_index)
    end
    return arr
end

function clear_many_factors_while!(arr::Vector{UInt}, max_index::Integer)
    for i in 1:500
        clear_factors_while!(arr, i, max_index)
    end
    return arr
end

Results:

julia> @btime clear_many_factors_while!(x, 400_000);
  679.600 ΞΌs (0 allocations: 0 bytes)

julia> @btime clear_many_factors_simd!(x, 400_000);
  628.900 ΞΌs (0 allocations: 0 bytes)

julia> @benchmark clear_many_factors_while!(x, 400_000)
BenchmarkTools.Trial: 6864 samples with 1 evaluation.
 Range (min … max):  671.800 ΞΌs …  5.021 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     717.800 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   723.917 ΞΌs Β± 66.558 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

            β–ƒβ–ˆβ–†β–…β–†β–†β–ƒ
  β–‚β–‚β–‚β–‚β–ƒβ–ƒβ–…β–‡β–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–†β–…β–„β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–β–‚ β–ƒ
  672 ΞΌs          Histogram: frequency by time          874 ΞΌs <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark clear_many_factors_simd!(x, 400_000)
BenchmarkTools.Trial: 7472 samples with 1 evaluation.
 Range (min … max):  629.600 ΞΌs …  1.359 ms  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     657.800 ΞΌs              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   664.742 ΞΌs Β± 46.172 ΞΌs  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

     β–β–„β–…β–‡β–ˆβ–†β–ƒ
  β–‚β–ƒβ–„β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–†β–„β–„β–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–‚β–‚β–‚β–‚β–‚β–‚β–β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–‚β–‚β–‚β–‚β–‚β–‚β–β–‚ β–ƒ
  630 ΞΌs          Histogram: frequency by time          858 ΞΌs <

 Memory estimate: 0 bytes, allocs estimate: 0.

Looking at the generated assembly, it seems that the inner loops in clear_factors_simd! and clear_factors_while! only differ by two instructions, while swapping two others:

Inner loop in clear_factors_simd!:

L96:
        mov     rsi, rbx
        sar     rsi, 6
        shlx    rdi, rdx, rbx
        or      qword ptr [rcx + 8*rsi - 8], rdi
        add     rbx, r15
        dec     rax
        jne     L96

Inner loop in clear_factors_while!:

L48:
        lea     rcx, [rsi + 63]
        shlx    rdi, r8, rcx
        sar     rcx, 6
        or      qword ptr [r10 + 8*rcx - 8], rdi
        add     rsi, r9
        cmp     rsi, rdx
        jle     L48

I also wrote a version that gets the number of iterations, then loops that many times, and it seems to produce the same output for its inner loop:

function clear_factors_while_itercount!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
    factor = _map_to_factor(factor_index)
    start_index = _div2(factor * factor)
    iterations = cld(max_index - start_index, factor)
    while !iszero(iterations)
        iterations -= 1
        index = start_index + (factor * iterations)
        @inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
    end
    return arr
end

julia> @code_native syntax=:intel debuginfo=:none clear_factors_while_itercount!(x, 1, 500_000)
...
    L160:
        mov     rdi, rax
        sar     rdi, 6
        shlx    rdx, r9, rax
        or      qword ptr [r11 + 8*rdi - 8], rdx
        add     rax, rsi
        dec     rcx
        jne     L160
...

The simd and manual iteration count versions use a mov instead of a lea, reorder the sar and shlx (bit shifts), and use a dec instead of a cmp. I don’t know if that makes a significant difference.

Is lea more expensive than mov, especially when the numbers get larger? Actually, why does @simd generate a mov instead of a lea? Compiler is too smart, I guess?

And why does the simd version (and manual iteration count version) generate dec instead of cmp? That I really cannot understand.

I still don’t know why the @simd version seems to be faster.

The simd version of course isn’t actually SIMD here. Often, @simd will unroll code, making the assembly longer (along with encouraging the code to SIMD in some cases), but this will improve performance more often than not. In particular, @simd will help SIMD-ing floating point reductions (by allowing the compiler to reorder, and when you have reductions, unrolling is likely to improve performance dramatically by breaking up the dependency chains between operations. Breaking up dependency chains allows the CPU to take advantage of super-scalar parallelism.

In your example, the @simd version is using the rcx register as a counter for how many loop iterations are remaining. You can imagine the loop looks like this:

rcx = length(_div2(factor * factor):factor:max_index)
@label L96
# do stuff
rcx -= 1  # dec
rcx == 0 || @goto L96 # jne (jump if not zero)

I think creating an iteration counter like this is a result of the @simd macro, but I’ve never actually looked at how it works/is implemented.

Meanwhile, your while loop acts more as written. Therefore it has to cmp to compare rsi = index with rdx = max_index on each iteration.

Using llvm-mva to analyze these loops, to see what it predicts. The @simd version:

# llvm-mca --bottleneck-analysis -mcpu=cascadelake atsimd.s
Iterations:        100
Instructions:      600
Total Cycles:      161
Total uOps:        800

Dispatch Width:    6
uOps Per Cycle:    4.97
IPC:               3.73
Block RThroughput: 1.3


No resource or data dependency bottlenecks discovered.


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        mov   rsi, rbx
 1      1     0.50                        sar   rsi, 6
 1      1     0.50                        shlx  rdi, rdx, rbx
 3      7     1.00    *      *            or    qword ptr [rcx + 8*rsi - 8], rdi
 1      1     0.25                        add   rbx, r15
 1      1     0.25                        dec   rax


Resources:
[0]   - SKXDivider
[1]   - SKXFPDivider
[2]   - SKXPort0
[3]   - SKXPort1
[4]   - SKXPort2
[5]   - SKXPort3
[6]   - SKXPort4
[7]   - SKXPort5
[8]   - SKXPort6
[9]   - SKXPort7


Resource pressure per iteration:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
 -      -     1.50   1.50   0.66   0.67   1.00   1.50   1.50   0.67

Resource pressure by instruction:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 -      -      -     0.99    -      -      -      -     0.01    -     mov       rsi, rbx
 -      -     0.99    -      -      -      -      -     0.01    -     sar       rsi, 6
 -      -     0.51    -      -      -      -      -     0.49    -     shlx      rdi, rdx, rbx
 -      -      -     0.01   0.66   0.67   1.00   0.99    -     0.67   or        qword ptr [rcx + 8*rsi - 8], rdi
 -      -      -     0.49    -      -      -     0.01   0.50    -     add       rbx, r15
 -      -      -     0.01    -      -      -     0.50   0.49    -     dec       rax

The while version:

# llvm-mca --bottleneck-analysis -mcpu=cascadelake while.s
Iterations:        100
Instructions:      600
Total Cycles:      178
Total uOps:        800

Dispatch Width:    6
uOps Per Cycle:    4.49
IPC:               3.37
Block RThroughput: 1.3


Cycles with backend pressure increase [ 29.21% ]
Throughput Bottlenecks:
  Resource Pressure       [ 28.09% ]
  - SKXPort0  [ 28.09% ]
  - SKXPort6  [ 28.09% ]
  Data Dependencies:      [ 29.21% ]
  - Register Dependencies [ 29.21% ]
  - Memory Dependencies   [ 0.00% ]

Critical sequence based on the simulation:

              Instruction                                 Dependency Information
 +----< 4.    add       rsi, r9
 |
 |    < loop carried >
 |
 +----> 0.    lea       rcx, [rsi + 63]                   ## REGISTER dependency:  rsi
 +----> 1.    shlx      rdi, r8, rcx                      ## REGISTER dependency:  rcx
 |      2.    sar       rcx, 6
 +----> 3.    or        qword ptr [r10 + 8*rcx - 8], rdi          ## REGISTER dependency:  rdi
 +----> 4.    add       rsi, r9                           ## RESOURCE interference:  SKXPort6 [ probability: 33% ]
 +----> 5.    cmp       rsi, rdx                          ## REGISTER dependency:  rsi


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.50                        lea   rcx, [rsi + 63]
 1      1     0.50                        shlx  rdi, r8, rcx
 1      1     0.50                        sar   rcx, 6
 3      7     1.00    *      *            or    qword ptr [r10 + 8*rcx - 8], rdi
 1      1     0.25                        add   rsi, r9
 1      1     0.25                        cmp   rsi, rdx


Resources:
[0]   - SKXDivider
[1]   - SKXFPDivider
[2]   - SKXPort0
[3]   - SKXPort1
[4]   - SKXPort2
[5]   - SKXPort3
[6]   - SKXPort4
[7]   - SKXPort5
[8]   - SKXPort6
[9]   - SKXPort7


Resource pressure per iteration:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
 -      -     1.66   1.01   0.66   0.67   1.00   1.66   1.67   0.67

Resource pressure by instruction:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 -      -      -     0.65    -      -      -     0.35    -      -     lea       rcx, [rsi + 63]
 -      -     0.98    -      -      -      -      -     0.02    -     shlx      rdi, r8, rcx
 -      -     0.02    -      -      -      -      -     0.98    -     sar       rcx, 6
 -      -      -     0.33   0.66   0.67   1.00    -     0.67   0.67   or        qword ptr [r10 + 8*rcx - 8], rdi
 -      -     0.34   0.01    -      -      -     0.65    -      -     add       rsi, r9
 -      -     0.32   0.02    -      -      -     0.66    -      -     cmp       rsi, rdx

Two things to note:

  1. Doing 100 iterations of the @simd version is estimated to take 161 cycles on a cascadelake CPU, vs 178 cycles for the while version. llvm-mca thinks the @simd version will be faster on that CPU model.
  2. For the @simd version, LLVM’s bottleneck analysis failed to identify any bottlenecks.
  3. For the while version, it does, but the dependencies it points out don’t seem to be fundamentally different, just deeper, than what we have in the @simd version.

To elaborate, lets look at the indices.
For @simd, on the previous iteration, after we just calculated rbx…

  1. Assign rsi via moving rbx there
  2. Update rsi via sar rsi, 6
    β†’ Use rsi for indexing.

While for the while version, after we just updated rsi with an add…

  1. Assign rcx with lea
    β†’ Use rcx for shlx.
  2. Update rcx via sar rcx, 6
    β†’ Use rcx for indexing.

It looks like the bigger problem might be that peak resource uses are higher, e.g. the bottleneck analysis points to SKXPort6 ([8] on the "Resource pressure by instruction:table), which gets hit by bothsarand thenor` one after the other.

Note that these results differ by CPU. For znver2, llvm-mca (version 11) predicts that the @simd and while versions will be equally fast (and the bottleneck analysis doesn’t highlight any issues in either version).

9 Likes

simdloop.jl is actually quite short. It’s under 200 lines of code, half of which are comments. It seems that it creates two loops, an outer scalar loop and an inner loop, and it does keep track of the β€œtrip count” of the inner loop.

I don’t claim to fully understand the output you posted, but from what I gather, this is wandering near the realm of CPU-specific quirks rather than overall performance? I’ve ran similar tests on the Raspberry Pi, and I’ve always thought it was because of @simd’s overhead that it was so much slower there, but faster on powerful machines.

Thanks for the help, by the way, that was very insightful!

1 Like

Ah, yes, quite clear. Also, note that the outerloop is normally 0:0, the primary exception being CartesianIndices:

julia> Base.SimdLoop.simd_outer_range(1:10)
0:0

julia> Base.SimdLoop.simd_outer_range(CartesianIndices((1:2,1:3,1:4)))
3Γ—4 CartesianIndices{2, Tuple{UnitRange{Int64}, UnitRange{Int64}}}:
 CartesianIndex(1, 1)  CartesianIndex(1, 2)  CartesianIndex(1, 3)  CartesianIndex(1, 4)
 CartesianIndex(2, 1)  CartesianIndex(2, 2)  CartesianIndex(2, 3)  CartesianIndex(2, 4)
 CartesianIndex(3, 1)  CartesianIndex(3, 2)  CartesianIndex(3, 3)  CartesianIndex(3, 4)

In this case, yes.
In many cases, where @simd actually allows for SIMD, the amount of performance gained will vary a lot by CPU, but will normaly do better in most benchmarks, including your raspberry pi.
I don’t have a raspberry pi to test, but if you try this on your laptop and an x86_64 CPU without AVX512:

function mydot(x,y)
    s = zero(promote_type(eltype(x),eltype(y)))
    @inbounds @simd for i ∈ eachindex(x,y)
        s += x[i]*y[i]
    end
    s
end
function mydot_nosimd(x,y)
    s = zero(promote_type(eltype(x),eltype(y)))
    @inbounds for i ∈ eachindex(x,y)
        s += x[i]*y[i]
    end
    s
end
x = rand(256); y = Int64(-120):Int64(135);
@btime mydot($x,$y)
@btime mydot_nosimd($x,$y)

the SIMD version will probably be faster on the pi, while they’ll probably be equally fast on the non-AVX512 x86_64 CPU. I’d be interested in seeing the results if you benchmark it.
(ARM NEON can SIMD convert 64 bit integers into Float64, as can AVX512, but AVX and AVX2 cannot.)

But this sort of thing is more typical. Seeing the change in how iteration is represented causing an increase in performance is a bit more of an odd quirk, and something that I’d hope LLVM would do a better job optimizing.

1 Like