This is the current master, with a few small bugs in compression fixed and a slightly different benchmark using Polyester
for lighter threading.
in_same_set(ds::DataStructures.IntDisjointSets, x, y) = DS.in_same_set(ds, x, y)
in_same_set(ds::CDS.AbstractDisjointSet, x, y) = CDS.same_set(ds, x, y)
function union_bench(ds, r)
ans = [true for _ in axes(r, 2)]
for i in axes(r, 2)
x, y = r[1, i], r[2, i]
union!(ds, x, y)
ans[i] = in_same_set(ds, x, y)
end
@assert all(ans)
return ds
end
function union_bench_poly(ds, r)
ans = [true for _ in axes(r, 2)]
Polyester.@batch minbatch = 2^7 for i in axes(r, 2)
x, y = r[1, i], r[2, i]
union!(ds, x, y)
ans[i] = in_same_set(ds, x, y)
end
@assert all(ans)
return ds
end
timings = map(2:1:20) do N
Random.seed!(1234)
r = rand(1:2^N, 2, 2^(N - 1))
cds = @benchmark union_bench_poly(x, $r) setup =
(x = CDS.ConcurrentDisjointSet($(2^N))) evals = 1
cds1 = @benchmark union_bench(x, $r) setup =
(x = CDS.ConcurrentDisjointSet($(2^N))) evals = 1
ds = @benchmark union_bench(x, $r) setup = (x = CDS.DisjointSet($(2^N))) evals = 1
baseline = @benchmark union_bench(x, $r) setup =
(x = DS.IntDisjointSets($(2^N))) evals = 1
# @info N median(cds) median(ds) median(baseline)
return cds, cds1, ds, baseline
end
results
(threaded execution, concurrent ds + serial execution, serial ds, baseline (DataStructures)
(120.000 ns, 120.000 ns, 80.000 ns, 60.000 ns)
(240.000 ns, 240.000 ns, 140.000 ns, 100.000 ns)
(440.000 ns, 450.000 ns, 270.000 ns, 180.000 ns)
(871.000 ns, 881.000 ns, 561.000 ns, 341.000 ns)
(2.183 μs, 1.863 μs, 1.192 μs, 691.000 ns)
(4.608 μs, 3.576 μs, 2.455 μs, 1.352 μs)
(5.250 μs, 7.143 μs, 5.170 μs, 2.724 μs)
(8.285 μs, 14.276 μs, 10.339 μs, 5.550 μs)
(15.378 μs, 28.862 μs, 22.009 μs, 11.371 μs)
(27.640 μs, 57.544 μs, 45.713 μs, 23.373 μs)
(51.463 μs, 119.898 μs, 100.753 μs, 49.189 μs)
(96.475 μs, 244.163 μs, 350.346 μs, 103.848 μs)
(181.830 μs, 665.529 μs, 926.805 μs, 325.151 μs)
(366.777 μs, 1.431 ms, 1.993 ms, 849.415 μs)
(816.837 μs, 3.060 ms, 4.495 ms, 1.920 ms)
(1.693 ms, 6.773 ms, 10.365 ms, 4.345 ms)
(3.497 ms, 15.192 ms, 24.507 ms, 9.430 ms)
(6.849 ms, 32.876 ms, 55.914 ms, 21.628 ms)
(13.822 ms, 72.533 ms, 129.908 ms, 78.798 ms)
- concurrent faster than sequential for N > 2^7;
- concurrent faster than baseline for N > 2^13;
- nice scaling on 8 threads: up to 5 times faster N = 2^20;
- there’s something wrong with the serial data structure as it becomes 2 times slower than it should (the concurrent in single-threaded execution! is 2× faster).