[ANN] ThreadsX.jl: Parallelized Base functions

Hey @tkf,

I updated my container with perf, so now I can run your snippet:

"""
    perf(f, args)

Call `f()` while an external `perf` process is attached to the current Julia process.

The command `perf \$args --pid=<pid of this process>` is started without waiting for
it, with its standard output and standard error forwarded to this process's `stdout`
and `stderr`. Whatever `f()` returns is returned.

Regardless of whether `f()` returns or throws, both streams are flushed, the `perf`
process is sent `SIGINT`, and this function waits for it to exit.
"""
function perf(f, args)
    julia_pid = getpid()
    perf_cmd = pipeline(`perf $args --pid=$julia_pid`; stdout=stdout, stderr=stderr)
    # wait=false: perf keeps running in the background while `f` executes.
    perf_proc = run(perf_cmd; wait=false)
    try
        return f()
    finally
        # Flush our own output first, then interrupt perf and wait for it to exit.
        foreach(flush, (stdout, stderr))
        kill(perf_proc, Base.SIGINT)
        wait(perf_proc)
    end
end

using BenchmarkTools, ThreadsX
# Benchmark: ThreadsX merge sort of 1,000,000 values sampled from 0:0.01:1.
b = @benchmarkable ThreadsX.sort($(rand(0:0.01:1, 1_000_000)); alg=MergeSort)
tune!(b)

# Run the benchmark while `perf stat -ddd` is attached, then display what `perf` returns.
display(perf(() -> run(b), `stat -ddd`))

Here’s the result (using the Manifest.toml in the ThreadsXBenchmarks opt branch):

 Performance counter stats for process id 'xxxxxx':

        320,806.28 msec task-clock:u              #   61.441 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
           780,457      page-faults:u             #    0.002 M/sec
   998,915,051,084      cycles:u                  #    3.114 GHz                      (33.17%)
   105,151,583,800      stalled-cycles-frontend:u #   10.53% frontend cycles idle     (33.24%)
   759,847,656,100      stalled-cycles-backend:u  #   76.07% backend cycles idle      (33.32%)
   469,608,517,210      instructions:u            #    0.47  insn per cycle
                                                  #    1.62  stalled cycles per insn  (33.39%)
    82,639,767,893      branches:u                #  257.600 M/sec                    (33.47%)
     1,999,816,791      branch-misses:u           #    2.42% of all branches          (33.49%)
   123,771,767,763      L1-dcache-loads:u         #  385.815 M/sec                    (33.49%)
     3,366,246,284      L1-dcache-load-misses:u   #    2.72% of all L1-dcache hits    (33.50%)
   <not supported>      LLC-loads:u
   <not supported>      LLC-load-misses:u
     3,417,363,890      L1-icache-loads:u         #   10.652 M/sec                    (33.48%)
        27,569,790      L1-icache-load-misses:u   #    0.81% of all L1-icache hits    (33.42%)
       179,455,568      dTLB-loads:u              #    0.559 M/sec                    (33.36%)
        56,062,290      dTLB-load-misses:u        #   31.24% of all dTLB cache hits   (33.28%)
         1,656,153      iTLB-loads:u              #    0.005 M/sec                    (33.20%)
           580,824      iTLB-load-misses:u        #   35.07% of all iTLB cache hits   (33.14%)
     1,401,532,572      L1-dcache-prefetches:u    #    4.369 M/sec                    (33.12%)
   <not supported>      L1-dcache-prefetch-misses:u

       5.221355168 seconds time elapsed

BenchmarkTools.Trial:
  memory estimate:  22.30 MiB
  allocs estimate:  41828
  --------------
  minimum time:     7.516 ms (0.00% GC)
  median time:      9.815 ms (0.00% GC)
  mean time:        10.559 ms (10.14% GC)
  maximum time:     16.400 ms (42.81% GC)
  --------------
  samples:          474
  evals/sample:     1

> I wish I had access to this monster machine so that I can debug the saturation/degradation in the performance.

That may be possible, I sent you an email.

2 Likes