Slow deserialization of booleans

When a Vector{Bool} is deserialized, stdlib/Serialization/src/Serialization.jl:1338 allocates for each element deserialized. I changed A = Array{Bool, length(dims)}(undef, dims) to A = Vector{Bool}(undef, n) and got a 6x speedup. This doesn’t quite make sense to me (@infiltrate shows the same types being created either way). Concretely, the one-line change in the Bool branch of deserialize_array was:
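
# before (stdlib):
A = Array{Bool, length(dims)}(undef, dims)
# after (local edit; n = prod(dims), as in the full function further down):
A = Vector{Bool}(undef, n)

The following script shows the results of the change: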

using Serialization, Infiltrator

N = 1000000
x = rand(Bool, N)
buffer = Vector{UInt8}(undef, Int(3e9));
io = IOBuffer(buffer; read=true, write=true)

run_test(x; warmup=true) = begin
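	# round-trip x through io: serialize, then deserialize; time both steps unless this is a warmup run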
	empty!(buffer)
	mark(io)
	@assert typeof(x) <: Vector{Bool}
	if warmup
		serialize(io, x)
	else
		@time serialize(io, x)
	end
	reset(io)
	if warmup
		x2 = deserialize(io)
	else
		@time x2 = deserialize(io)
	end
	@assert typeof(x2) <: Vector{Bool} && x2 == x
end;

run_test(x)
println("Testing Vector{Bool} with default deserialize_array")
run_test(x; warmup=false)

function Serialization.deserialize_array(s::AbstractSerializer)
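	# copy of the stdlib deserialize_array; only the allocation in the Bool branch is changed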
	slot = s.counter; s.counter += 1
	d1 = deserialize(s)
	if isa(d1, Type)
		 elty = d1
		 d1 = deserialize(s)
	else
		 elty = UInt8
	end
	if isa(d1, Int32) || isa(d1, Int64)
		 if elty !== Bool && isbitstype(elty)
			  a = Vector{elty}(undef, d1)
			  s.table[slot] = a
			  return read!(s.io, a)
		 end
		 dims = (Int(d1),)
	elseif d1 isa Dims
		 dims = d1::Dims
	else
		 dims = convert(Dims, d1::Tuple{Vararg{Serialization.OtherInt}})::Dims
	end
	if isbitstype(elty)
		 n = prod(dims)::Int
		 if elty === Bool && n > 0
			# @infiltrate
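			  # Bool arrays are serialized run-length encoded: the high bit of each byte is the run's value, the low 7 bits are the run length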
			  A = Vector{Bool}(undef, n)
			  i = 1
			  while i <= n
					b = read(s.io, UInt8)::UInt8
					v::Bool = (b >> 7) != 0
					count = b & 0x7f
					nxt = i + count
					while i < nxt
						 A[i] = v
						 i += 1
					end
			  end
		 else
			  A = read!(s.io, Array{elty}(undef, dims))
		 end
		 s.table[slot] = A
		 return A
	end
	A = Array{elty, length(dims)}(undef, dims)
	s.table[slot] = A
	sizehint!(s.table, s.counter + div(length(A)::Int,4))
	Serialization.deserialize_fillarray!(A, s)
	return A
end

run_test(x)
println("Testing Vector{Bool} with custom deserialize_array")
run_test(x; warmup=false)

I get the following output on my Windows machine:

Testing Vector{Bool} with default deserialize_array
  0.006477 seconds (10 allocations: 1.047 KiB)
  0.028968 seconds (999.50 k allocations: 16.206 MiB)
Testing Vector{Bool} with custom deserialize_array
  0.006436 seconds (10 allocations: 1.047 KiB)
  0.004163 seconds (13 allocations: 977.688 KiB)

I’d hazard a guess that dims is only known at runtime in general, so your change hardcodes the concrete type for your tested input x = rand(Bool, N). The thing is, the line that benefits from the improved type stability would be A[i] = v, which I would have assumed gets optimized even with runtime dims, given the one method for linear set-indexing.
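
For reference, the single linear set-indexing method I have in mind can be looked up like this (just a sketch to check the assumption; output omitted):

which(setindex!, (Array{Bool, 3}, Bool, Int))   # the one Base method handling linear assignment into an Array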

A workaround can be to create the array with a single dimension and then reshape it afterwards.

begin
	function create_array() 
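		# rand(10:10, 6) always yields six 10s; it only hides the dims from the compiler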
		dims = rand(10:10, 6)
		A = Array{Bool, length(dims)}(undef, tuple(dims...))
		for i in 1:length(A)
			A[i] = i % 2 == 0
		end
		return A
	end

	function create_array2() 
		dims = rand(10:10, 6)
		A = Vector{Bool}(undef, prod(dims))
		for i in 1:length(A)
			A[i] = i % 2 == 0
		end
		return reshape(A, tuple(dims...))
	end

	function create_array3() 
		dims = rand(10:10, 6)
		A = Vector{Bool}(undef, prod(dims))
		for i in 1:length(A)
			A[i] = i % 2 == 0
		end
		return A
	end

	create_array();
	create_array2();
	create_array3();

	println("Testing assigning to obfuscated dims")
	a1 = @time create_array();
	println("Testing obfuscated dims with reshaping array")
	a2 = @time create_array2();
	println("Testing flat array with no reshaping	")
	a3 = @time create_array3();
	@assert a1 == a2 
	@assert a1 == reshape(a3, size(a1))
end

I get the following output:

Testing assigning to obfuscated dims
  0.089560 seconds (3.00 M allocations: 61.974 MiB, 2.66% gc time)
Testing obfuscated dims with reshaping array
  0.000231 seconds (14 allocations: 977.375 KiB)
Testing flat array with no reshaping
  0.000205 seconds (11 allocations: 977.156 KiB)

That’s clever: the reshape turns the Vector into an Array rather than a ReshapedArray, and it is relatively cheap too because the underlying Memory buffer is reused. I don’t know if that’s something we can count on, so maybe directly making a Memory to be wrapped is the way to go now?
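
One way to check both points on Julia 1.11+ (the .ref and .mem fields are internals, so this is only for inspection, not something to rely on):

v = Vector{Bool}(undef, 8)
A = reshape(v, 2, 2, 2)
typeof(A)                  # Array{Bool, 3}, not a ReshapedArray
A.ref.mem === v.ref.mem    # true: the reshape reuses v's underlying Memory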

I also figured out why the optimization doesn’t happen; I made a bad assumption. That instantiation call you replaced is inferred as ::Any, not ::Array{Bool, N} where N, so the linear setindexing no longer has just 1 method to look at. That’s because the call can go to 5 methods (see methods(Array{Bool}.body, (typeof(undef), Dims))), so the compiler gives up at more than 3 methods and infers ::Any. Julia doesn’t guarantee that a type’s constructor returns an instance of that type (this is one of the many listed wants for a v2 that may never come), so it would need an extra, seemingly redundant type assertion.
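
A quick sketch to reproduce that (the method count and inference result can differ across Julia versions):

mk(dims::Dims) = Array{Bool, length(dims)}(undef, dims)
Base.return_types(mk, (Dims,))                      # reported as Any, as described above
methods(Array{Bool}.body, (typeof(undef), Dims))    # the candidate constructor methods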

I found that a type assertion works if the number of dims is known at compile time. The allocations are still there if the number of dims is dynamic.

begin
	function create_array4() 
		dims = rand(10:10, 6)
		A = Array{Bool, length(dims)}(undef, Tuple(dims))::Array{Bool, 6}
		for i in 1:length(A)
			A[i] = i % 2 == 0
		end
		return A
	end
	function create_array5() 
		dims = rand(10:10, 6)
		A = Array{Bool, length(dims)}(undef, Tuple(dims))::Array{Bool, length(dims)}
		for i in 1:length(A)
			A[i] = i % 2 == 0
		end
		return A
	end
	create_array4();
	create_array5();
	
	println("Testing hardcoded type assert")
	a4 = @time create_array4();
	println("Testing runtime type assert")
	a5 = @time create_array5();
	@assert a4 == a5 
end

outputs:

Testing hardcoded type assert
  0.000247 seconds (13 allocations: 977.312 KiB)
Testing runtime type assert
  0.013480 seconds (999.50 k allocations: 16.205 MiB)

I stand corrected again, and it jogged my memory about something I learned just last week: improving inference to Array{Bool}, plus any optimizations from knowing the only setindex! method, does not eliminate the runtime dispatch, and that dispatch adds allocations by boxing unboxed inputs or outputs, even when similar techniques improved type inference all the way to concrete types. Your code is actually doing fewer allocations than I’d expect, but I’m still terrible at predicting dispatch boxing. To save a click on the link: it’s hypothetically possible for runtime dispatch not to box, but that’s not how it’s implemented right now, and changing it would require a significant change to core Julia and the GC.
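
As a minimal illustration of that boxing effect, decoupled from Serialization (a sketch; Base.inferencebarrier is an internal helper that hides the concrete type from inference, standing in here for the runtime-dispatched constructor):

function fill_hidden(n)
	A = Base.inferencebarrier(Vector{Bool}(undef, n))   # inferred as Any from here on
	for i in 1:n
		A[i] = isodd(i)   # dynamically dispatched setindex!; boxing here is what drives the per-element allocations
	end
	return A
end
fill_hidden(10)                # compile first
@time fill_hidden(1_000_000)   # allocates on most iterations, much like the runtime-dims case above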

That assertion doesn’t help because, as far as the compiler is concerned, Array{Bool, length(dims)} is still an abstract type: the dimensionality is unknown at compile time.

julia -O3 is still unable to recognize length(dims) as a value known at compile time.

Furthermore, it seems to have gotten worse in Julia 1.12. Using @btime instead of @time:

with 1.11.5

Testing hardcoded type assert
  574.006 μs (6 allocations: 976.85 KiB)
Testing runtime type assert
  69.579 ms (2000007 allocations: 61.99 MiB)

with 1.12.0-beta3

Testing hardcoded type assert
  648.211 μs (6 allocations: 976.85 KiB)
Testing runtime type assert
  133.855 ms (2998986 allocations: 61.97 MiB)

In the test scripts the number of dims was obfuscated to simulate deserialization, where dims comes from the bitstream and cannot be known at compile time.

When you deserialize a Vector{Int}, the array is read directly from the bitstream and there is no per-element allocation:

using Serialization, Infiltrator

T = Int
N = 1000000
x = rand(T, N)
buffer = Vector{UInt8}(undef, Int(3e9));
io = IOBuffer(buffer; read=true, write=true)

run_test(x; warmup=true) = begin
	empty!(buffer)
	mark(io)
	@assert typeof(x) <: Vector{T}
	if warmup
		serialize(io, x)
	else
		@time serialize(io, x)
	end
	reset(io)
	if warmup
		x2 = deserialize(io)
	else
		@time x2 = deserialize(io)
	end
	@assert typeof(x2) <: Vector{T} && x2 == x
end;

run_test(x)
println("Testing Vector{$T} with default deserialize_array")
run_test(x; warmup=false)

outputs:

Testing Vector{Int64} with default deserialize_array
  0.001174 seconds (11 allocations: 1.062 KiB)
  0.000774 seconds (11 allocations: 7.630 MiB)

Vector{Bool} needs to unpack the bits, and that’s when it runs into the assignment issue. It sounds like the issue is not knowing which method to call. Perhaps setindex! could be dispatched on the type of the index (a single linear index in our case) and let the array object deal with dimensionality?
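
Putting the earlier observations together, here is a sketch (decode_bools is a made-up name, not stdlib code) of a Bool branch that decodes the run-length format shown above into a flat Vector{Bool} and only reshapes at the end:

function decode_bools(io::IO, dims::Dims)
	n = prod(dims)
	v = Vector{Bool}(undef, n)      # concrete container, so the loop below needs no dynamic dispatch
	i = 1
	while i <= n
		b = read(io, UInt8)
		val = (b >> 7) != 0         # high bit: the value of this run
		nxt = i + Int(b & 0x7f)     # low 7 bits: the run length
		while i < nxt
			v[i] = val
			i += 1
		end
	end
	return reshape(v, dims)         # an Array{Bool, N} sharing v's memory
end

The final reshape is still dimension-generic, but it is a single dynamic call instead of one per element.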