Documenting performance model of `empty!`, `sizehint!`, `push!` & friends

FWIW,

julia> function pushtest_hint!(n, v = Vector{Int}())
       empty!(v)
       sizehint!(v, n)
       for i=1:n
       push!(v, i)
       end
       v
       end;
julia> function filltest!(n, v = Vector{Int}())
       resize!(v, n)
       for i=1:n
       v[i]=i
       end
       v
       end;
julia> function pushtest_nohint!(n, v = Vector{Int}())
       empty!(v)
       for i=1:n
       push!(v, i)
       end
       v
       end;

tested with

julia> N1=10_000; N2 = 10_000_000; N3 = 50_000_000; v1=zeros(Int, N1); v2=zeros(Int, N2); v3=zeros(Int, N3);
julia> using BenchmarkTools
julia> begin 
       run(`cat /sys/kernel/mm/transparent_hugepage/enabled`)
       for (N,v) in [(N1,v1), (N2,v2), (N3,v3)]
       @show N, :no_reuse
       @btime pushtest_nohint!($N)
       @btime pushtest_hint!($N)
       @btime filltest!($N)
       @show N, :reuse
       @btime pushtest_nohint!($N, $v)
       @btime pushtest_hint!($N, $v)
       @btime filltest!($N, $v)
       end; end;

yields on my machine

[always] madvise never
(N, :no_reuse) = (10000, :no_reuse)
  79.359 μs (14 allocations: 256.64 KiB)
  62.778 μs (2 allocations: 78.27 KiB)
  8.214 μs (2 allocations: 78.27 KiB)
(N, :reuse) = (10000, :reuse)
  61.888 μs (0 allocations: 0 bytes)
  61.914 μs (0 allocations: 0 bytes)
  5.836 μs (0 allocations: 0 bytes)
(N, :no_reuse) = (10000000, :no_reuse)
  105.401 ms (24 allocations: 129.00 MiB)
  71.133 ms (2 allocations: 76.29 MiB)
  14.669 ms (2 allocations: 76.29 MiB)
(N, :reuse) = (10000000, :reuse)
  63.381 ms (0 allocations: 0 bytes)
  63.283 ms (0 allocations: 0 bytes)
  8.419 ms (0 allocations: 0 bytes)
(N, :no_reuse) = (50000000, :no_reuse)
  537.729 ms (26 allocations: 416.61 MiB)
  356.954 ms (2 allocations: 381.47 MiB)
  141.562 ms (2 allocations: 381.47 MiB)
(N, :reuse) = (50000000, :reuse)
  317.353 ms (0 allocations: 0 bytes)
  316.974 ms (0 allocations: 0 bytes)
  42.260 ms (0 allocations: 0 bytes)

and

always [madvise] never
(N, :no_reuse) = (10000, :no_reuse)
  82.518 μs (14 allocations: 256.64 KiB)
  62.817 μs (2 allocations: 78.27 KiB)
  8.026 μs (2 allocations: 78.27 KiB)
(N, :reuse) = (10000, :reuse)
  65.603 μs (0 allocations: 0 bytes)
  61.832 μs (0 allocations: 0 bytes)
  5.481 μs (0 allocations: 0 bytes)
(N, :no_reuse) = (10000000, :no_reuse)
  176.516 ms (24 allocations: 129.00 MiB)
  95.101 ms (2 allocations: 76.29 MiB)
  39.220 ms (2 allocations: 76.29 MiB)
(N, :reuse) = (10000000, :reuse)
  66.751 ms (0 allocations: 0 bytes)
  63.399 ms (0 allocations: 0 bytes)
  9.020 ms (0 allocations: 0 bytes)
(N, :no_reuse) = (50000000, :no_reuse)
  837.384 ms (26 allocations: 416.61 MiB)
  476.236 ms (2 allocations: 381.47 MiB)
  193.821 ms (2 allocations: 381.47 MiB)
(N, :reuse) = (50000000, :reuse)
  334.521 ms (0 allocations: 0 bytes)
  319.596 ms (0 allocations: 0 bytes)
  44.946 ms (0 allocations: 0 bytes)

Take-away: Tuning THP in the linux kernel is just as relevant as sizehint! (I learned that yesterday; many distros use madvise as default; defrag is also a relevant tunable). The foreign function call overhead from push! is more important than sizehint!. If you insist about performance, then don’t use push!, instead run your own overallocation scheme (resize! to something big, fill up tracking the current index, afterwards resize! and sizehint! to shrink back; effectively what IOBuffer does).