Slow deserialization of booleans

When a `Vector{Bool}` is deserialized, `stdlib/Serialization/src/Serialization.jl:1338` allocates for each element deserialized. I changed `A = Array{Bool, length(dims)}(undef, dims)` to `A = Vector{Bool}(undef, n)` and got a 6x speedup. This doesn’t quite make sense to me (`@infiltrate` shows the same types are created), but the following script shows the results of the change:

using Serialization, Infiltrator

# Number of Bool elements to round-trip through serialize/deserialize.
N = 1000000
x = rand(Bool, N)
# Preallocate a large (3 GB) backing byte vector so the IOBuffer never needs
# to grow during the timed runs; opened read+write so the same buffer serves
# both the serialize and the deserialize direction.
buffer = Vector{UInt8}(undef, Int(3e9));
io = IOBuffer(buffer; read=true, write=true)

# Round-trip `x` through the shared global IOBuffer `io`: serialize, rewind,
# deserialize, and check the result matches. When `warmup` is true the calls
# run untimed (to compile everything); otherwise both directions are `@time`d.
function run_test(x; warmup=true)
	empty!(buffer)
	mark(io)                       # remember the write position for reset()
	@assert x isa Vector{Bool}
	if warmup
		serialize(io, x)
	else
		@time serialize(io, x)
	end
	reset(io)                      # rewind to the marked position to read back
	roundtripped = if warmup
		deserialize(io)
	else
		@time deserialize(io)
	end
	@assert roundtripped isa Vector{Bool} && roundtripped == x
end;

# Warm up first (compiles the whole path), then report timings for the
# stdlib implementation of deserialize_array.
run_test(x)
println("Testing Vector{Bool} with default deserialize_array")
run_test(x; warmup=false)

# Replacement for the stdlib `deserialize_array` that avoids the per-element
# allocation seen when deserializing Bool arrays: the stdlib constructs the
# destination via `Array{Bool, length(dims)}(undef, dims)`, whose return type
# is not inferred (the constructor call has too many applicable methods), so
# every `A[i] = v` in the decode loop is dynamically dispatched. Decoding the
# run-length-encoded Bool stream into a concretely-typed `Vector{Bool}` keeps
# the hot loop type-stable; the vector is then reshaped to the original dims
# so multi-dimensional Bool arrays round-trip with their shape intact.
function Serialization.deserialize_array(s::AbstractSerializer)
	slot = s.counter; s.counter += 1
	d1 = deserialize(s)
	if isa(d1, Type)
		 elty = d1
		 d1 = deserialize(s)
	else
		 # No element type tag in the stream: default element type.
		 elty = UInt8
	end
	if isa(d1, Int32) || isa(d1, Int64)
		 # 1-D case serialized as a bare length. Non-Bool isbits vectors can
		 # be read back in one bulk read.
		 if elty !== Bool && isbitstype(elty)
			  a = Vector{elty}(undef, d1)
			  s.table[slot] = a
			  return read!(s.io, a)
		 end
		 dims = (Int(d1),)
	elseif d1 isa Dims
		 dims = d1::Dims
	else
		 # `OtherInt` is internal to the Serialization module and must be
		 # qualified when this method is defined outside that module.
		 dims = convert(Dims, d1::Tuple{Vararg{Serialization.OtherInt}})::Dims
	end
	if isbitstype(elty)
		 n = prod(dims)::Int
		 if elty === Bool && n > 0
			  # Decode the RLE stream (high bit = run value, low 7 bits = run
			  # length) into a flat Vector{Bool}; the concrete type makes the
			  # inner loop allocation-free.
			  V = Vector{Bool}(undef, n)
			  i = 1
			  while i <= n
					b = read(s.io, UInt8)::UInt8
					v::Bool = (b >> 7) != 0
					count = b & 0x7f
					nxt = i + count
					while i < nxt
						 V[i] = v
						 i += 1
					end
			  end
			  # Restore the serialized shape. `reshape` of a Vector yields an
			  # Array{Bool,N} sharing the same memory, so this is cheap.
			  A = length(dims) == 1 ? V : reshape(V, dims)
		 else
			  A = read!(s.io, Array{elty}(undef, dims))
		 end
		 s.table[slot] = A
		 return A
	end
	# Non-isbits element types: allocate, register for cycle handling, then
	# fill element-by-element via the stdlib helper.
	A = Array{elty, length(dims)}(undef, dims)
	s.table[slot] = A
	sizehint!(s.table, s.counter + div(length(A)::Int,4))
	Serialization.deserialize_fillarray!(A, s)
	return A
end

# Warm up again so the redefined deserialize_array method is compiled, then
# report timings for the custom implementation.
run_test(x)
println("Testing Vector{Bool} with custom deserialize_array")
run_test(x; warmup=false)

I get the following output on my Windows machine:

Testing Vector{Bool} with default deserialize_array
  0.006477 seconds (10 allocations: 1.047 KiB)
  0.028968 seconds (999.50 k allocations: 16.206 MiB)
Testing Vector{Bool} with custom deserialize_array
  0.006436 seconds (10 allocations: 1.047 KiB)
  0.004163 seconds (13 allocations: 977.688 KiB)
1 Like

I’d hazard a guess that `dims` is only known at runtime in general, so your change hardcoded the concrete type for your tested input `x = rand(Bool, N)`. The thing is, the relevant line with improved type stability would be `A[i] = v`, which I would have assumed to be optimized even with runtime `dims`, since linear set-indexing has only one method.

1 Like

A work-around can be to reshape the array after creating it with single dimension.

begin
	# The dims are "obfuscated": rand(10:10, 6) always produces
	# [10, 10, 10, 10, 10, 10], but the compiler only sees a runtime
	# Vector{Int}, so the array rank is not statically known.

	# Construct the array with runtime-unknown rank and fill it directly;
	# the constructor's return type is not inferred, so each store dispatches
	# dynamically.
	function create_array()
		ds = rand(10:10, 6)
		out = Array{Bool, length(ds)}(undef, tuple(ds...))
		for k in 1:length(out)
			out[k] = k % 2 == 0
		end
		return out
	end

	# Fill a concretely-typed Vector{Bool} first, then reshape to the target
	# dims; the fill loop stays type-stable.
	function create_array2()
		ds = rand(10:10, 6)
		flat = Vector{Bool}(undef, prod(ds))
		for k in 1:length(flat)
			flat[k] = k % 2 == 0
		end
		return reshape(flat, tuple(ds...))
	end

	# Baseline: same type-stable fill, no reshape at all.
	function create_array3()
		ds = rand(10:10, 6)
		flat = Vector{Bool}(undef, prod(ds))
		for k in 1:length(flat)
			flat[k] = k % 2 == 0
		end
		return flat
	end

	# Warm-up calls so compilation is excluded from the timings below.
	create_array();
	create_array2();
	create_array3();

	println("Testing assigning to obfuscated dims")
	a1 = @time create_array();
	println("Testing obfuscated dims with reshaping array")
	a2 = @time create_array2();
	println("Testing flat array with no reshaping	")
	a3 = @time create_array3();
	@assert a1 == a2
	@assert a1 == reshape(a3, size(a1))
end

I get the following output:

Testing assigning to obfuscated dims
  0.089560 seconds (3.00 M allocations: 61.974 MiB, 2.66% gc time)
Testing obfuscated dims with reshaping array
  0.000231 seconds (14 allocations: 977.375 KiB)
Testing flat array with no reshaping
  0.000205 seconds (11 allocations: 977.156 KiB)
1 Like

That’s clever: `reshape` on a `Vector` returns an `Array` rather than a `ReshapedArray`, and it is relatively cheap too because the underlying `Memory` buffer is reused. I don’t know if that’s something we can count on, though, so maybe directly making a `Memory` to be wrapped is the way to go now?

Also, I figured out why the optimization doesn’t happen — I made a bad assumption. The instantiation call you replaced is inferred as `::Any`, not `::Array{Bool, N} where N`, so linear set-indexing no longer has a single method to consider. That’s because the call can dispatch to any of the 5 methods returned by `methods(Array{Bool}.body, (typeof(undef), Dims))`, and the compiler gives up on inference when there are more than 3 applicable methods, inferring `::Any`. Julia doesn’t guarantee that constructors return their own type (this is one of the many listed wants of a v2 that may never come), so it would need an extra, seemingly redundant type assertion.