FWIW,
julia> function pushtest_hint!(n, v = Vector{Int}())
empty!(v)
sizehint!(v, n)
for i=1:n
push!(v, i)
end
v
end;
julia> function filltest!(n, v = Vector{Int}())
resize!(v, n)
for i=1:n
v[i]=i
end
v
end;
julia> function pushtest_nohint!(n, v = Vector{Int}())
empty!(v)
for i=1:n
push!(v, i)
end
v
end;
tested with
julia> N1=10_000; N2 = 10_000_000; N3 = 50_000_000; v1=zeros(Int, N1); v2=zeros(Int, N2); v3=zeros(Int, N3);
julia> using BenchmarkTools
julia> begin
run(`cat /sys/kernel/mm/transparent_hugepage/enabled`)
for (N,v) in [(N1,v1), (N2,v2), (N3,v3)]
@show N, :no_reuse
@btime pushtest_nohint!($N)
@btime pushtest_hint!($N)
@btime filltest!($N)
@show N, :reuse
@btime pushtest_nohint!($N, $v)
@btime pushtest_hint!($N, $v)
@btime filltest!($N, $v)
end; end;
yields on my machine
[always] madvise never
(N, :no_reuse) = (10000, :no_reuse)
79.359 Ī¼s (14 allocations: 256.64 KiB)
62.778 Ī¼s (2 allocations: 78.27 KiB)
8.214 Ī¼s (2 allocations: 78.27 KiB)
(N, :reuse) = (10000, :reuse)
61.888 Ī¼s (0 allocations: 0 bytes)
61.914 Ī¼s (0 allocations: 0 bytes)
5.836 Ī¼s (0 allocations: 0 bytes)
(N, :no_reuse) = (10000000, :no_reuse)
105.401 ms (24 allocations: 129.00 MiB)
71.133 ms (2 allocations: 76.29 MiB)
14.669 ms (2 allocations: 76.29 MiB)
(N, :reuse) = (10000000, :reuse)
63.381 ms (0 allocations: 0 bytes)
63.283 ms (0 allocations: 0 bytes)
8.419 ms (0 allocations: 0 bytes)
(N, :no_reuse) = (50000000, :no_reuse)
537.729 ms (26 allocations: 416.61 MiB)
356.954 ms (2 allocations: 381.47 MiB)
141.562 ms (2 allocations: 381.47 MiB)
(N, :reuse) = (50000000, :reuse)
317.353 ms (0 allocations: 0 bytes)
316.974 ms (0 allocations: 0 bytes)
42.260 ms (0 allocations: 0 bytes)
and
always [madvise] never
(N, :no_reuse) = (10000, :no_reuse)
82.518 Ī¼s (14 allocations: 256.64 KiB)
62.817 Ī¼s (2 allocations: 78.27 KiB)
8.026 Ī¼s (2 allocations: 78.27 KiB)
(N, :reuse) = (10000, :reuse)
65.603 Ī¼s (0 allocations: 0 bytes)
61.832 Ī¼s (0 allocations: 0 bytes)
5.481 Ī¼s (0 allocations: 0 bytes)
(N, :no_reuse) = (10000000, :no_reuse)
176.516 ms (24 allocations: 129.00 MiB)
95.101 ms (2 allocations: 76.29 MiB)
39.220 ms (2 allocations: 76.29 MiB)
(N, :reuse) = (10000000, :reuse)
66.751 ms (0 allocations: 0 bytes)
63.399 ms (0 allocations: 0 bytes)
9.020 ms (0 allocations: 0 bytes)
(N, :no_reuse) = (50000000, :no_reuse)
837.384 ms (26 allocations: 416.61 MiB)
476.236 ms (2 allocations: 381.47 MiB)
193.821 ms (2 allocations: 381.47 MiB)
(N, :reuse) = (50000000, :reuse)
334.521 ms (0 allocations: 0 bytes)
319.596 ms (0 allocations: 0 bytes)
44.946 ms (0 allocations: 0 bytes)
Take-away: Tuning THP in the linux kernel is just as relevant as sizehint!
(I learned that yesterday; many distros use madvise
as default; defrag
is also a relevant tunable). The foreign function call overhead from push!
is more important than sizehint!
. If you insist about performance, then donāt use push!
, instead run your own overallocation scheme (resize!
to something big, fill up tracking the current index, afterwards resize!
and sizehint!
to shrink back; effectively what IOBuffer
does).