Modified benchmark, now including `@tturbo` (threaded LoopVectorization) variants
using LinearAlgebra, StaticArrays, LoopVectorization, Tullio, BenchmarkTools
using DataFrames
function serial1!(ds, s, h)
    # Serial baseline, row-major data layout: row i holds one 3-vector in
    # columns 1:3. Computes ds[i, :] = s[i, :] × h[i, :] for every row.
    @inbounds @fastmath for i in axes(ds, 1)
        ds[i, 1] = s[i, 2] * h[i, 3] - s[i, 3] * h[i, 2]
        ds[i, 2] = s[i, 3] * h[i, 1] - s[i, 1] * h[i, 3]
        ds[i, 3] = s[i, 1] * h[i, 2] - s[i, 2] * h[i, 1]
    end
end
function turbo1!(ds, s, h)
    # SIMD-vectorized (single-threaded) variant of serial1! using
    # LoopVectorization's @turbo; same row-major cross product.
    @turbo for i in 1:size(ds, 1)
        ds[i, 1] = s[i, 2] * h[i, 3] - s[i, 3] * h[i, 2]
        ds[i, 2] = s[i, 3] * h[i, 1] - s[i, 1] * h[i, 3]
        ds[i, 3] = s[i, 1] * h[i, 2] - s[i, 2] * h[i, 1]
    end
end
function tturbo1!(ds, s, h)
    # Threaded + SIMD variant of serial1! using LoopVectorization's
    # @tturbo; same row-major cross product.
    @tturbo for i in 1:size(ds, 1)
        ds[i, 1] = s[i, 2] * h[i, 3] - s[i, 3] * h[i, 2]
        ds[i, 2] = s[i, 3] * h[i, 1] - s[i, 1] * h[i, 3]
        ds[i, 3] = s[i, 1] * h[i, 2] - s[i, 2] * h[i, 1]
    end
end
function parallel1!(ds, s, h)
    # Tullio (threaded) variant of the row-major cross product. The two
    # tuples encode the cyclic index permutation of the cross product:
    # component j uses components nxt[j] and prv[j] of the inputs.
    nxt = (2, 3, 1)
    prv = (3, 1, 2)
    @tullio ds[i, j] = s[i, nxt[j]] * h[i, prv[j]] - s[i, prv[j]] * h[i, nxt[j]]
end
function serial2!(ds, s, h)
    # Serial baseline, column-major data layout: column j holds one
    # 3-vector in rows 1:3. Computes ds[:, j] = s[:, j] × h[:, j].
    @inbounds @fastmath for j in axes(ds, 2)
        ds[1, j] = s[2, j] * h[3, j] - s[3, j] * h[2, j]
        ds[2, j] = s[3, j] * h[1, j] - s[1, j] * h[3, j]
        ds[3, j] = s[1, j] * h[2, j] - s[2, j] * h[1, j]
    end
end
function turbo2!(ds, s, h)
    # SIMD-vectorized (single-threaded) variant of serial2! using
    # LoopVectorization's @turbo; same column-major cross product.
    @turbo for j in 1:size(ds, 2)
        ds[1, j] = s[2, j] * h[3, j] - s[3, j] * h[2, j]
        ds[2, j] = s[3, j] * h[1, j] - s[1, j] * h[3, j]
        ds[3, j] = s[1, j] * h[2, j] - s[2, j] * h[1, j]
    end
end
function tturbo2!(ds, s, h)
    # Threaded + SIMD variant of serial2! using LoopVectorization's
    # @tturbo; same column-major cross product.
    @tturbo for j in 1:size(ds, 2)
        ds[1, j] = s[2, j] * h[3, j] - s[3, j] * h[2, j]
        ds[2, j] = s[3, j] * h[1, j] - s[1, j] * h[3, j]
        ds[3, j] = s[1, j] * h[2, j] - s[2, j] * h[1, j]
    end
end
function parallel2!(ds, s, h)
    # Tullio (threaded) variant of the column-major cross product;
    # nxt/prv encode the cyclic index permutation (see parallel-variant
    # of the row-major version).
    nxt = (2, 3, 1)
    prv = (3, 1, 2)
    @tullio ds[i, j] = s[nxt[i], j] * h[prv[i], j] - s[prv[i], j] * h[nxt[i], j]
end
function serialSA!(ds, s, h)
    # Serial element-wise cross product over arrays of 3-vectors
    # (e.g. Vector{SVector{3,Float64}}): ds[i] = s[i] × h[i].
    map!(cross, ds, s, h)
end
function parallelSA!(ds, s, h)
    # Multithreaded element-wise cross product over arrays of 3-vectors
    # (e.g. Vector{SVector{3,Float64}}): ds[i] = s[i] × h[i].
    # Fix: the `∈` in the loop header was mangled to `β` (invalid syntax);
    # also use eachindex, which makes @inbounds provably safe and checks
    # that all three arrays share the same axes.
    Threads.@threads for i in eachindex(ds, s, h)
        @inbounds ds[i] = cross(s[i], h[i])
    end
end
# --- Correctness checks: all implementations must agree on a small input.
# Fix: every `≈` (isapprox) in the asserts below was mangled to `β`.
s = rand(10, 3);
h = rand(10, 3);
ds11 = Matrix{Float64}(undef, 10, 3);
serial1!(ds11, s, h);
ds12 = Matrix{Float64}(undef, 10, 3);
parallel1!(ds12, s, h);
@assert ds12 ≈ ds11
ds12 .= NaN;  # poison the buffer so a no-op implementation fails the check
turbo1!(ds12, s, h);
@assert ds12 ≈ ds11
ds12 .= NaN;
tturbo1!(ds12, s, h);
@assert ds12 ≈ ds11
# Transposed (column-major 3×n) layout: same data, so results must match
# the row-major results transposed.
s = permutedims(s);
h = permutedims(h);
ds21 = Matrix{Float64}(undef, 3, 10);
serial2!(ds21, s, h)
@assert ds21 ≈ ds12'
ds21 .= NaN;
turbo2!(ds21, s, h)
@assert ds21 ≈ ds12'
ds21 .= NaN;
tturbo2!(ds21, s, h)
@assert ds21 ≈ ds12'
ds22 = Matrix{Float64}(undef, 3, 10);
parallel2!(ds22, s, h);
@assert ds22 ≈ ds21
# Throughput helpers: each field is an n×3 (or 3×n) array of Float64.
array_size_in_bytes(n) = 8 * 3 * n                       # bytes in one field
gbs(t, n) = 3 * array_size_in_bytes(n) / (1.0e9 * t)     # 2 reads + 1 write
gflops(t, n) = 9 * n / (1.0e9 * t)                       # 6 mul + 3 sub per element
"""
    bench()

Benchmark every cross-product implementation over problem sizes
`n = 10^2, 10^4, 10^6, 10^8`, printing per-call timings and returning a
`DataFrame` with columns `n`, `fn`, `t`, `GFlops`, `GBs`.
"""
function bench()
    # Fix: the `∈` in each `for f ...` header was mangled to `β`
    # (invalid syntax).
    df = DataFrame()
    for p in (2, 4, 6, 8)
        n = 10^p
        println("n=$n")
        # Row-major (n×3) implementations.
        for f in (serial1!, turbo1!, tturbo1!, parallel1!)
            t = @belapsed $f(ds, s, h) setup=(ds = Matrix{Float64}(undef, $n, 3); s = rand($n, 3); h = rand($n, 3))
            append!(df, DataFrame(n = n, fn = string(f), t = t, GFlops = gflops(t, n), GBs = gbs(t, n)))
            println("$(string(f)) t=$t s (GB/s=$(gbs(t, n)))")
        end
        # Column-major (3×n) implementations.
        for f in (serial2!, turbo2!, tturbo2!, parallel2!)
            t = @belapsed $f(ds, s, h) setup=(ds = Matrix{Float64}(undef, 3, $n); s = rand(3, $n); h = rand(3, $n))
            append!(df, DataFrame(n = n, fn = string(f), t = t, GFlops = gflops(t, n), GBs = gbs(t, n)))
            println("$(string(f)) t=$t s (GB/s=$(gbs(t, n)))")
        end
        # StaticArrays (Vector{SVector{3,Float64}}) implementations.
        for f in (serialSA!, parallelSA!)
            t = @belapsed $f(ds, s, h) setup=(ds = rand(SVector{3, Float64}, $n); s = rand(SVector{3, Float64}, $n); h = rand(SVector{3, Float64}, $n))
            append!(df, DataFrame(n = n, fn = string(f), t = t, GFlops = gflops(t, n), GBs = gbs(t, n)))
            println("$(string(f)) t=$t s (GB/s=$(gbs(t, n)))")
        end
    end
    return df
end
# Run the full benchmark sweep; returns the results DataFrame.
bench()
Unsurprisingly, the serial performance is similar between the M1 and M1 Max, but something seems wrong with my multithreaded performance.
Also somewhat surprising (to me) is just how well the StaticArray methods perform, particularly at larger sizes.