Contradictory nested function performance and allocation

lilachint · April 4, 2026, 3:49am

Recently when working on sorting data with Hilbert curves, I noticed something very peculiar when benchmarking.

function hilbert_sort(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    return m[:, hilbert_sortperm(m, b; rev)]
end

function hilbert_sort!(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    return Base.permutecols!!(m, hilbert_sortperm(m, b; rev))
end

function hilbert_sortperm(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    h = mapslices(__with_bit(b), m; dims = 1) |> vec

    return sortperm(h; rev)
end

function hilbert_sortperm!(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    h = eachcol(m) .|> __with_bit(b)

    return sortperm(h; rev)
end

Where __with_bit is a function factory that returns an expensive-ish computation function (Skilling’s method returning a distance) that mutates. Naturally, I would expect the performance of hilbert_sortperm! > hilbert_sortperm > hilbert_sort! > hilbert_sort. However, the result is the last I would expect:

Large m benchmark

julia> m = rand(Bool, (10, 1000))
10×1000 Matrix{Bool}:
[...]

julia> @benchmark hilbert_sort($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  56.600 μs …  16.335 ms  ┊ GC (min … max): 0.00% … 99.36%
 Time  (median):     62.400 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   68.468 μs ± 163.403 μs  ┊ GC (mean ± σ):  2.37% ±  0.99%

   ▇▄▁▇█▃▁
  ▅███████▇▆▅▅▄▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  56.6 μs         Histogram: frequency by time          116 μs <

 Memory estimate: 33.89 KiB, allocs estimate: 23.

julia> @benchmark hilbert_sort!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  26.300 μs … 926.400 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     29.000 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   30.798 μs ±  11.887 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▃▇█▇▇▇▅▄▄▃▃▃▂▂▂▁▁                                            ▂
  ██████████████████▇▇▇▆▇▆▆▇▆▅▄▄▃▄▄▁▃▅▃▁▁▃▃▄▁▆▇▆▇▇▇▇▇▆▆▇▆▅▅▆▆▆ █
  26.3 μs       Histogram: log(frequency) by time      68.2 μs <

 Memory estimate: 16.11 KiB, allocs estimate: 17.

julia> @benchmark hilbert_sortperm($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  52.600 μs …  15.688 ms  ┊ GC (min … max): 0.00% … 99.41%
 Time  (median):     56.900 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   61.121 μs ± 156.932 μs  ┊ GC (mean ± σ):  2.55% ±  0.99%

   ▅█▃▆▃▁
  ▃██████▆▆▅▄▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  52.6 μs         Histogram: frequency by time          103 μs <

 Memory estimate: 23.99 KiB, allocs estimate: 20.

julia> @benchmark hilbert_sortperm!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  72.700 μs …  11.514 ms  ┊ GC (min … max): 0.00% … 99.06%
 Time  (median):     76.700 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   80.416 μs ± 115.160 μs  ┊ GC (mean ± σ):  1.42% ±  0.99%

    ▃█▅▁▁
  ▁▅█████▇▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  72.7 μs         Histogram: frequency by time          119 μs <

 Memory estimate: 23.65 KiB, allocs estimate: 9.

While the performance and allocation of hilbert_sort! beats hilbert_sort as expected, the rest is puzzling.

Although hilbert_sortperm! allocates 9 times instead of 20 times as in hilbert_sortperm, the memory allocation is roughly the same, and the performance is drastically worse. When I tested with smaller m, this result is then reversed:

Small m benchmark

julia> m = rand(Bool, (10, 10))
10×10 Matrix{Bool}:
[...]

julia> @benchmark hilbert_sortperm($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 159 evaluations per sample.
 Range (min … max):  664.780 ns … 303.486 μs  ┊ GC (min … max): 0.00% … 99.56%
 Time  (median):     782.390 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   857.347 ns ±   4.029 μs  ┊ GC (mean ± σ):  7.89% ±  1.72%

   █
  ██▆▄▅▅▃▂▁▁▃▃▃▄▃▃▃▄▄▅▆▇▅▄▄▃▃▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  665 ns           Histogram: frequency by time         1.21 μs <

 Memory estimate: 640 bytes, allocs estimate: 15.

julia> @benchmark hilbert_sortperm!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 258 evaluations per sample.
 Range (min … max):  296.899 ns … 164.754 μs  ┊ GC (min … max): 0.00% … 99.65%
 Time  (median):     366.279 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   398.245 ns ±   1.924 μs  ┊ GC (mean ± σ):  6.62% ±  1.41%

  ▂▄▅▆▅▂▂▂   ▅█▇▆▆▅▅▄▄▄▃▂▂▂▁▁                                   ▂
  █████████▇▇█████████████████▇▇▇▇▅▅▅▅▅▂▄▄▄▅▄▄▄▄▆▄▄▆█▆▄▅▅▅▄▃▄▄▅ █
  297 ns        Histogram: log(frequency) by time        621 ns <

 Memory estimate: 288 bytes, allocs estimate: 4.

Apart from that, hilbert_sort!, which calls hilbert_sortperm, somehow beats the latter in both performance and allocation. I tried using JET to detect type instabilities but no errors. I also tried using a simplified scenario:

function some_sort!(m::AbstractMatrix{<:Integer})
    return Base.permutecols!!(m, some_sortperm(m))
end

function some_sortperm(m::AbstractMatrix{<:Integer})
    return mapslices(example!, m; dims = 1) |> vec |> sortperm
end

function example!(x::AbstractVector{<:Integer})
    for _ in 1:10 # expensive-ish
        x .⊻= x[end] 
    end
    return sum(x)
end

But cannot replicate the peculiar result:

Simplified benchmark

julia> m = rand(Bool, (10, 1000))
10×1000 Matrix{Bool}:
[...]

julia> @benchmark some_sort!($(copy(m)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  66.900 μs … 417.600 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     68.400 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.269 μs ±  10.250 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▇█▇▆▆▄▃▃▂▂▂▂▁▁▁▁                                             ▂
  ██████████████████▇▇▆▆▅▅▅▄▅▅▅▄▄▃▄▅▄▄▁▄▁▄▁▄▄▁▁▅▆▇██▆▆▄▅▅▄▄▅▅▄ █
  66.9 μs       Histogram: log(frequency) by time       105 μs <

 Memory estimate: 16.25 KiB, allocs estimate: 19.

julia> @benchmark some_sortperm($(copy(m)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  64.100 μs … 922.100 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     68.000 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.893 μs ±  18.556 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅▇█▇▇▇▇▆▄▃▂▂▂▁▁                                              ▂
  █████████████████▆▇▅▅▅▅▄▅▅▃▄▄▅▄▃▇▆█▇▇▆▇▆▇▆▇▆▆▆▇▇▆▆▆▆▅▅▆▆▆▆▆▅ █
  64.1 μs       Histogram: log(frequency) by time       120 μs <

 Memory estimate: 16.25 KiB, allocs estimate: 19.

This really left me puzzled. I am still new to Julia so any help would be greatly appreciated!

Benny · April 4, 2026, 8:24am

This stood out to me, and turns out it’s not a public function to begin with. It appears to mutate both inputs and return the 1st, so your benchmark is mutating the input copy(m) every call. That usually makes for very unrepresentative benchmarks, especially when it converges on a result and do a lot less work in the long run. Sorting tends to do that. The setup option of @benchmark can reset the input per sample, but each sample may have many calls, so you’ll want to also manually set the evals=1 option (which happens to match your benchmark anyway). This may not be the reason, but fixing a mutating benchmark is a start.

It’s not unusual for different iteration mechanisms e.g. map vs broadcast to have different performance, but I’d check if the indirection of .|> is making a difference. That’s not syntactic sugar for piping, it’s an actual higher-order function being broadcasted. Make a variant with the more conventional h = (__with_bit(b)).(eachcol(m)) and benchmark both hilbert_sortperm! and the variant back to back.

lilachint · April 4, 2026, 8:38am

Thank you very much! Indeed, I seem to have misunderstood the usage of $ interpolation and though it re-evaluates the expression everytime it’s ran. Now I am able to get this:

New benchmark

julia> const m = rand(Bool, (10, 1000))
10×1000 Matrix{Bool}:
[...]  

julia> @benchmark hilbert_sort(n, $1) setup=(n = copy($m)) evals=1
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  59.000 μs …  6.247 ms  ┊ GC (min … max): 0.00% … 98.24%
 Time  (median):     65.500 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   74.022 μs ± 97.924 μs  ┊ GC (mean ± σ):  6.20% ±  6.54%

  ▄▅▅▆██▇▆▆▅▅▄▄▃▂▁▁           ▁▁▂▂▂▁▂▂▂▁▁                     ▂
  ███████████████████▇▇▆▆▇██▇████████████▇█▇▇▇▇▅▅▆▅▃▅▄▂▅▃▄▄▃▄ █
  59 μs        Histogram: log(frequency) by time       123 μs <

 Memory estimate: 33.89 KiB, allocs estimate: 23.

julia> @benchmark hilbert_sort!(n, $1) setup=(n = copy($m)) evals=1
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  64.200 μs …  5.999 ms  ┊ GC (min … max): 0.00% … 98.12%
 Time  (median):     70.200 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   75.020 μs ± 80.601 μs  ┊ GC (mean ± σ):  1.69% ±  1.68%

   ▂▄▇██▇▇▆▆▆▅▅▅▅▄▃▃▂▁▁                 ▁▁▁▁▁ ▁▁▁▁▁▁▁         ▃
  ▇██████████████████████▇▇▇▅▅▅▆▄▄▆▄▅▆▇▇██████████████▇▇▇▇▆▆▆ █
  64.2 μs      Histogram: log(frequency) by time       112 μs <

 Memory estimate: 23.99 KiB, allocs estimate: 20.

julia> @benchmark hilbert_sortperm(n, $1) setup=(n = copy($m)) evals=1
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  55.600 μs …  4.894 ms  ┊ GC (min … max): 0.00% … 98.30%
 Time  (median):     61.100 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   67.047 μs ± 85.246 μs  ┊ GC (mean ± σ):  2.65% ±  2.17%

  ▂▃▃▆█▇▆▆▅▅▅▄▄▄▂▃▂▁                        ▁▁▁▂▂▁            ▂
  ████████████████████▇▇▆▅▆▆▅▂▂▅▆▇▇▆▇▇▇████████████▇▇▇▇▆▇▆▅▄▅ █
  55.6 μs      Histogram: log(frequency) by time       107 μs <

 Memory estimate: 23.99 KiB, allocs estimate: 20.

julia> @benchmark hilbert_sortperm!(n, $1) setup=(n = copy($m)) evals=1
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  53.900 μs …   8.961 ms  ┊ GC (min … max): 0.00% … 98.91%
 Time  (median):     61.000 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   65.985 μs ± 108.955 μs  ┊ GC (mean ± σ):  3.71% ±  3.16%

      ▄▆█▇▃▅▂▂
  ▁▄▅▇█████████▆▆▆▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂▂▂▁▂▂▁▁▂▁▁▁▁▁▁ ▃
  53.9 μs         Histogram: frequency by time         99.7 μs <

 Memory estimate: 23.65 KiB, allocs estimate: 9.

Which is exactly what I expect. Thank you for helping!

Topic		Replies	Views
Fast permutation vector: code needed Performance sorting	38	1030	March 21, 2023
Allocation / performance for 2d array sorts Performance	5	675	November 10, 2020
How to sort a vector by an expensive function in place efficiently? Performance sorting	14	1921	March 8, 2022
Why is sorting matrices allocating and why is it sometimes slower than the naive approach? Performance sort , matrices	11	638	November 6, 2021
10x faster sortperm() Performance sortperm	10	2915	October 29, 2024

Contradictory nested function performance and allocation

Related topics