Contradictory nested function performance and allocation

Recently when working on sorting data with Hilbert curves, I noticed something very peculiar when benchmarking.

function hilbert_sort(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    return m[:, hilbert_sortperm(m, b; rev)]
end

function hilbert_sort!(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    return Base.permutecols!!(m, hilbert_sortperm(m, b; rev))
end

function hilbert_sortperm(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    h = mapslices(__with_bit(b), m; dims = 1) |> vec

    return sortperm(h; rev)
end

function hilbert_sortperm!(m::AbstractMatrix{<:Integer}, b::Integer; rev::Bool = false)
    h = eachcol(m) .|> __with_bit(b)

    return sortperm(h; rev)
end

Where __with_bit is a function factory that returns an expensive-ish computation function (Skilling’s method returning a distance) that mutates. Naturally, I would expect the performance of hilbert_sortperm! > hilbert_sortperm > hilbert_sort! > hilbert_sort. However, the result is the last I would expect:

Large m benchmark
julia> m = rand(Bool, (10, 1000))
10×1000 Matrix{Bool}:
[...]

julia> @benchmark hilbert_sort($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  56.600 μs …  16.335 ms  ┊ GC (min … max): 0.00% … 99.36%
 Time  (median):     62.400 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   68.468 μs ± 163.403 μs  ┊ GC (mean ± σ):  2.37% ±  0.99%

   ▇▄▁▇█▃▁
  ▅███████▇▆▅▅▄▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  56.6 μs         Histogram: frequency by time          116 μs <

 Memory estimate: 33.89 KiB, allocs estimate: 23.

julia> @benchmark hilbert_sort!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  26.300 μs … 926.400 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     29.000 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   30.798 μs ±  11.887 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▃▇█▇▇▇▅▄▄▃▃▃▂▂▂▁▁                                            ▂
  ██████████████████▇▇▇▆▇▆▆▇▆▅▄▄▃▄▄▁▃▅▃▁▁▃▃▄▁▆▇▆▇▇▇▇▇▆▆▇▆▅▅▆▆▆ █
  26.3 μs       Histogram: log(frequency) by time      68.2 μs <

 Memory estimate: 16.11 KiB, allocs estimate: 17.

julia> @benchmark hilbert_sortperm($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  52.600 μs …  15.688 ms  ┊ GC (min … max): 0.00% … 99.41%
 Time  (median):     56.900 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   61.121 μs ± 156.932 μs  ┊ GC (mean ± σ):  2.55% ±  0.99%

   ▅█▃▆▃▁
  ▃██████▆▆▅▄▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  52.6 μs         Histogram: frequency by time          103 μs <

 Memory estimate: 23.99 KiB, allocs estimate: 20.

julia> @benchmark hilbert_sortperm!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  72.700 μs …  11.514 ms  ┊ GC (min … max): 0.00% … 99.06%
 Time  (median):     76.700 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   80.416 μs ± 115.160 μs  ┊ GC (mean ± σ):  1.42% ±  0.99%

    ▃█▅▁▁
  ▁▅█████▇▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  72.7 μs         Histogram: frequency by time          119 μs <

 Memory estimate: 23.65 KiB, allocs estimate: 9.

While the performance and allocation of hilbert_sort! beats hilbert_sort as expected, the rest is puzzling.

Although hilbert_sortperm! allocates 9 times instead of 20 times as in hilbert_sortperm, the memory allocation is roughly the same, and the performance is drastically worse. When I tested with smaller m, this result is then reversed:

Small m benchmark
julia> m = rand(Bool, (10, 10))
10×10 Matrix{Bool}:
[...]

julia> @benchmark hilbert_sortperm($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 159 evaluations per sample.
 Range (min … max):  664.780 ns … 303.486 μs  ┊ GC (min … max): 0.00% … 99.56%
 Time  (median):     782.390 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   857.347 ns ±   4.029 μs  ┊ GC (mean ± σ):  7.89% ±  1.72%

   █
  ██▆▄▅▅▃▂▁▁▃▃▃▄▃▃▃▄▄▅▆▇▅▄▄▃▃▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  665 ns           Histogram: frequency by time         1.21 μs <

 Memory estimate: 640 bytes, allocs estimate: 15.

julia> @benchmark hilbert_sortperm!($(copy(m)), $1)
BenchmarkTools.Trial: 10000 samples with 258 evaluations per sample.
 Range (min … max):  296.899 ns … 164.754 μs  ┊ GC (min … max): 0.00% … 99.65%
 Time  (median):     366.279 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   398.245 ns ±   1.924 μs  ┊ GC (mean ± σ):  6.62% ±  1.41%

  ▂▄▅▆▅▂▂▂   ▅█▇▆▆▅▅▄▄▄▃▂▂▂▁▁                                   ▂
  █████████▇▇█████████████████▇▇▇▇▅▅▅▅▅▂▄▄▄▅▄▄▄▄▆▄▄▆█▆▄▅▅▅▄▃▄▄▅ █
  297 ns        Histogram: log(frequency) by time        621 ns <

 Memory estimate: 288 bytes, allocs estimate: 4.

Apart from that, hilbert_sort!, which calls hilbert_sortperm, somehow beats the latter in both performance and allocation. I tried using JET to detect type instabilities but no errors. I also tried using a simplified scenario:

function some_sort!(m::AbstractMatrix{<:Integer})
    return Base.permutecols!!(m, some_sortperm(m))
end

function some_sortperm(m::AbstractMatrix{<:Integer})
    return mapslices(example!, m; dims = 1) |> vec |> sortperm
end

function example!(x::AbstractVector{<:Integer})
    for _ in 1:10 # expensive-ish
        x .⊻= x[end] 
    end
    return sum(x)
end

But cannot replicate the peculiar result:

Simplified benchmark
julia> m = rand(Bool, (10, 1000))
10×1000 Matrix{Bool}:
[...]

julia> @benchmark some_sort!($(copy(m)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  66.900 μs … 417.600 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     68.400 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.269 μs ±  10.250 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▇█▇▆▆▄▃▃▂▂▂▂▁▁▁▁                                             ▂
  ██████████████████▇▇▆▆▅▅▅▄▅▅▅▄▄▃▄▅▄▄▁▄▁▄▁▄▄▁▁▅▆▇██▆▆▄▅▅▄▄▅▅▄ █
  66.9 μs       Histogram: log(frequency) by time       105 μs <

 Memory estimate: 16.25 KiB, allocs estimate: 19.

julia> @benchmark some_sortperm($(copy(m)))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  64.100 μs … 922.100 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     68.000 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.893 μs ±  18.556 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅▇█▇▇▇▇▆▄▃▂▂▂▁▁                                              ▂
  █████████████████▆▇▅▅▅▅▄▅▅▃▄▄▅▄▃▇▆█▇▇▆▇▆▇▆▇▆▆▆▇▇▆▆▆▆▅▅▆▆▆▆▆▅ █
  64.1 μs       Histogram: log(frequency) by time       120 μs <

 Memory estimate: 16.25 KiB, allocs estimate: 19.

This really left me puzzled. I am still new to Julia so any help would be greatly appreciated!