Recently when working on sorting data with Hilbert curves, I noticed something very peculiar when benchmarking.
function hilbert_sort(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
return m[:, hilbert_sortperm(m, b; rev)]
end
function hilbert_sort!(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
return Base.permutecols!!(m, hilbert_sortperm(m, b; rev))
end
function hilbert_sortperm(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
h = mapslices(__with_bit(b), m; dims = 1) |> vec
return sortperm(h; rev)
end
function hilbert_sortperm!(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
h = eachcol(m) .|> __with_bit(b)
return sortperm(h; rev)
end
Where __with_bit is a function factory that returns an expensive-ish computation function (Skilling’s method returning a distance) that mutates. Naturally, I would expect the performance of hilbert_sortperm! > hilbert_sortperm > hilbert_sort! > hilbert_sort. However, the result is the last I would expect:
Large m benchmark
julia> m = rand(Bool, (10, 1000))
10×1000 Matrix{Bool}:
[...]
julia> @benchmark hilbert_sort($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 56.600 μs … 16.335 ms ┊ GC (min … max): 0.00% … 99.36%
Time (median): 62.400 μs ┊ GC (median): 0.00%
Time (mean ± σ): 68.468 μs ± 163.403 μs ┊ GC (mean ± σ): 2.37% ± 0.99%
▇▄▁▇█▃▁
▅███████▇▆▅▅▄▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
56.6 μs Histogram: frequency by time 116 μs <
Memory estimate: 33.89 KiB, allocs estimate: 23.
julia> @benchmark hilbert_sort!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 26.300 μs … 926.400 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 29.000 μs ┊ GC (median): 0.00%
Time (mean ± σ): 30.798 μs ± 11.887 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃▇█▇▇▇▅▄▄▃▃▃▂▂▂▁▁ ▂
██████████████████▇▇▇▆▇▆▆▇▆▅▄▄▃▄▄▁▃▅▃▁▁▃▃▄▁▆▇▆▇▇▇▇▇▆▆▇▆▅▅▆▆▆ █
26.3 μs Histogram: log(frequency) by time 68.2 μs <
Memory estimate: 16.11 KiB, allocs estimate: 17.
julia> @benchmark hilbert_sortperm($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 52.600 μs … 15.688 ms ┊ GC (min … max): 0.00% … 99.41%
Time (median): 56.900 μs ┊ GC (median): 0.00%
Time (mean ± σ): 61.121 μs ± 156.932 μs ┊ GC (mean ± σ): 2.55% ± 0.99%
▅█▃▆▃▁
▃██████▆▆▅▄▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
52.6 μs Histogram: frequency by time 103 μs <
Memory estimate: 23.99 KiB, allocs estimate: 20.
julia> @benchmark hilbert_sortperm!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 72.700 μs … 11.514 ms ┊ GC (min … max): 0.00% … 99.06%
Time (median): 76.700 μs ┊ GC (median): 0.00%
Time (mean ± σ): 80.416 μs ± 115.160 μs ┊ GC (mean ± σ): 1.42% ± 0.99%
▃█▅▁▁
▁▅█████▇▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
72.7 μs Histogram: frequency by time 119 μs <
Memory estimate: 23.65 KiB, allocs estimate: 9.
While the performance and allocation of hilbert_sort! beats hilbert_sort as expected, the rest is puzzling.
Although hilbert_sortperm! allocates 9 times instead of 20 times as in hilbert_sortperm, the memory allocation is roughly the same, and the performance is drastically worse. When I tested with smaller m, this result is then reversed:
Small m benchmark
julia> m = rand(Bool, (10, 10))
10×10 Matrix{Bool}:
[...]
julia> @benchmark hilbert_sortperm($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 159 evaluations per sample.
Range (min … max): 664.780 ns … 303.486 μs ┊ GC (min … max): 0.00% … 99.56%
Time (median): 782.390 ns ┊ GC (median): 0.00%
Time (mean ± σ): 857.347 ns ± 4.029 μs ┊ GC (mean ± σ): 7.89% ± 1.72%
█
██▆▄▅▅▃▂▁▁▃▃▃▄▃▃▃▄▄▅▆▇▅▄▄▃▃▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
665 ns Histogram: frequency by time 1.21 μs <
Memory estimate: 640 bytes, allocs estimate: 15.
julia> @benchmark hilbert_sortperm!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 258 evaluations per sample.
Range (min … max): 296.899 ns … 164.754 μs ┊ GC (min … max): 0.00% … 99.65%
Time (median): 366.279 ns ┊ GC (median): 0.00%
Time (mean ± σ): 398.245 ns ± 1.924 μs ┊ GC (mean ± σ): 6.62% ± 1.41%
▂▄▅▆▅▂▂▂ ▅█▇▆▆▅▅▄▄▄▃▂▂▂▁▁ ▂
█████████▇▇█████████████████▇▇▇▇▅▅▅▅▅▂▄▄▄▅▄▄▄▄▆▄▄▆█▆▄▅▅▅▄▃▄▄▅ █
297 ns Histogram: log(frequency) by time 621 ns <
Memory estimate: 288 bytes, allocs estimate: 4.
Apart from that, hilbert_sort!, which calls hilbert_sortperm, somehow beats the latter in both performance and allocation. I tried using JET to detect type instabilities but no errors. I also tried using a simplified scenario:
function some_sort!(m::AbstractMatrix{<:Integer})
return Base.permutecols!!(m, some_sortperm(m))
end
function some_sortperm(m::AbstractMatrix{<:Integer})
return mapslices(example!, m; dims = 1) |> vec |> sortperm
end
function example!(x::AbstractVector{<:Integer})
for _ in 1:10 # expensive-ish
x .⊻= x[end]
end
return sum(x)
end
But cannot replicate the peculiar result:
Simplified benchmark
julia> m = rand(Bool, (10, 1000))
10×1000 Matrix{Bool}:
[...]
julia> @benchmark some_sort!($(copy(m)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 66.900 μs … 417.600 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 68.400 μs ┊ GC (median): 0.00%
Time (mean ± σ): 70.269 μs ± 10.250 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▇█▇▆▆▄▃▃▂▂▂▂▁▁▁▁ ▂
██████████████████▇▇▆▆▅▅▅▄▅▅▅▄▄▃▄▅▄▄▁▄▁▄▁▄▄▁▁▅▆▇██▆▆▄▅▅▄▄▅▅▄ █
66.9 μs Histogram: log(frequency) by time 105 μs <
Memory estimate: 16.25 KiB, allocs estimate: 19.
julia> @benchmark some_sortperm($(copy(m)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 64.100 μs … 922.100 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 68.000 μs ┊ GC (median): 0.00%
Time (mean ± σ): 70.893 μs ± 18.556 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▅▇█▇▇▇▇▆▄▃▂▂▂▁▁ ▂
█████████████████▆▇▅▅▅▅▄▅▅▃▄▄▅▄▃▇▆█▇▇▆▇▆▇▆▇▆▆▆▇▇▆▆▆▆▅▅▆▆▆▆▆▅ █
64.1 μs Histogram: log(frequency) by time 120 μs <
Memory estimate: 16.25 KiB, allocs estimate: 19.
This really left me puzzled. I am still new to Julia so any help would be greatly appreciated!