When a Vector{Bool}
is deserialized, stdlib/Serialization/src/Serialization.jl:1338
allocates for each element deserialized. I changed the A = Array{Bool, length(dims)}(undef, dims)
to A = Vector{Bool}(undef, n)
and I was able to get a 6x speedup. This doesn’t quite make sense to me (@infiltrate shows the same types are created), but the following script shows the results of the change:
using Serialization, Infiltrator
N = 1000000
x = rand(Bool, N)
buffer = Vector{UInt8}(undef, Int(3e9));
io = IOBuffer(buffer; read=true, write=true)
run_test(x; warmup=true) = begin
empty!(buffer)
mark(io)
@assert typeof(x) <: Vector{Bool}
if warmup
serialize(io, x)
else
@time serialize(io, x)
end
reset(io)
if warmup
x2 = deserialize(io)
else
@time x2 = deserialize(io)
end
@assert typeof(x2) <: Vector{Bool} && x2 == x
end;
run_test(x)
println("Testing Vector{Bool} with default deserialize_array")
run_test(x; warmup=false)
function Serialization.deserialize_array(s::AbstractSerializer)
slot = s.counter; s.counter += 1
d1 = deserialize(s)
if isa(d1, Type)
elty = d1
d1 = deserialize(s)
else
elty = UInt8
end
if isa(d1, Int32) || isa(d1, Int64)
if elty !== Bool && isbitstype(elty)
a = Vector{elty}(undef, d1)
s.table[slot] = a
return read!(s.io, a)
end
dims = (Int(d1),)
elseif d1 isa Dims
dims = d1::Dims
else
dims = convert(Dims, d1::Tuple{Vararg{OtherInt}})::Dims
end
if isbitstype(elty)
n = prod(dims)::Int
if elty === Bool && n > 0
# @infiltrate
A = Vector{Bool}(undef, n)
i = 1
while i <= n
b = read(s.io, UInt8)::UInt8
v::Bool = (b >> 7) != 0
count = b & 0x7f
nxt = i + count
while i < nxt
A[i] = v
i += 1
end
end
else
A = read!(s.io, Array{elty}(undef, dims))
end
s.table[slot] = A
return A
end
A = Array{elty, length(dims)}(undef, dims)
s.table[slot] = A
sizehint!(s.table, s.counter + div(length(A)::Int,4))
Serialization.deserialize_fillarray!(A, s)
return A
end
run_test(x)
println("Testing Vector{Bool} with custom deserialize_array")
run_test(x; warmup=false)
I get the following output on my Windows machine:
Testing Vector{Bool} with default deserialize_array
0.006477 seconds (10 allocations: 1.047 KiB)
0.028968 seconds (999.50 k allocations: 16.206 MiB)
Testing Vector{Bool} with custom deserialize_array
0.006436 seconds (10 allocations: 1.047 KiB)
0.004163 seconds (13 allocations: 977.688 KiB)