When a Vector{Bool} is deserialized, stdlib/Serialization/src/Serialization.jl:1338 allocates for each element deserialized. I changed the A = Array{Bool, length(dims)}(undef, dims) to A = Vector{Bool}(undef, n) and I was able to get a 6x speedup. This doesn’t quite make sense to me (@infiltrate shows the same types are created), but the following script shows the results of the change:
using Serialization, Infiltrator
# Number of Bool elements to round-trip through the serializer.
N = 1000000
x = rand(Bool, N)
# Preallocated backing storage for the IOBuffer so serialization never has to
# grow the buffer mid-benchmark (3 GB is comfortably larger than the payload).
buffer = Vector{UInt8}(undef, Int(3e9));
io = IOBuffer(buffer; read=true, write=true)
# Round-trip `x` through the shared globals `io`/`buffer`: serialize, rewind,
# deserialize, and check the result. With `warmup=true` (the default) the calls
# run untimed so a later timed call measures runtime rather than compilation;
# with `warmup=false` both halves are wrapped in @time.
function run_test(x; warmup = true)
    empty!(buffer)
    mark(io)
    @assert typeof(x) <: Vector{Bool}
    if warmup
        serialize(io, x)
        reset(io)
        x2 = deserialize(io)
    else
        @time serialize(io, x)
        reset(io)
        @time x2 = deserialize(io)
    end
    # Round-trip must preserve both the container type and the contents.
    @assert typeof(x2) <: Vector{Bool} && x2 == x
end;
run_test(x)  # warmup: compile everything before the timed run below
println("Testing Vector{Bool} with default deserialize_array")
run_test(x; warmup=false)
# Type-pirated replacement for Serialization.deserialize_array. It avoids the
# per-element allocations of the stdlib Bool path by decoding the run-length
# encoded stream into a flat Vector{Bool} (whose constructor is concretely
# inferrable, so setindex! is statically dispatched), then reshaping to the
# serialized dims — reshape reuses the underlying buffer, so multi-dimensional
# Bool arrays keep their shape at essentially no extra cost.
function Serialization.deserialize_array(s::AbstractSerializer)
    # Reserve a back-reference slot *before* recursing so self-referential
    # structures resolve to this array.
    slot = s.counter; s.counter += 1
    d1 = deserialize(s)
    if isa(d1, Type)
        elty = d1
        d1 = deserialize(s)
    else
        elty = UInt8  # no element type tag in the stream: raw byte array
    end
    if isa(d1, Int32) || isa(d1, Int64)
        if elty !== Bool && isbitstype(elty)
            # 1-D bits array: bulk-read the bytes straight into the result.
            a = Vector{elty}(undef, d1)
            s.table[slot] = a
            return read!(s.io, a)
        end
        dims = (Int(d1),)
    elseif d1 isa Dims
        dims = d1::Dims
    else
        # OtherInt is an unexported Serialization internal; it must be
        # qualified or this branch throws UndefVarError when reached.
        dims = convert(Dims, d1::Tuple{Vararg{Serialization.OtherInt}})::Dims
    end
    if isbitstype(elty)
        n = prod(dims)::Int
        if elty === Bool && n > 0
            # Bool arrays are serialized run-length encoded: the high bit of
            # each byte is the value, the low 7 bits the run length.
            V = Vector{Bool}(undef, n)
            i = 1
            while i <= n
                b = read(s.io, UInt8)::UInt8
                v::Bool = (b >> 7) != 0
                count = b & 0x7f
                nxt = i + count
                while i < nxt
                    V[i] = v
                    i += 1
                end
            end
            # Restore the original dimensionality (no-op copy for vectors;
            # the Vector's memory is shared with the reshaped result).
            A = reshape(V, dims)
        else
            A = read!(s.io, Array{elty}(undef, dims))
        end
        s.table[slot] = A
        return A
    end
    # Non-bits element type: fill element by element via recursion.
    A = Array{elty, length(dims)}(undef, dims)
    s.table[slot] = A
    sizehint!(s.table, s.counter + div(length(A)::Int, 4))
    Serialization.deserialize_fillarray!(A, s)
    return A
end
run_test(x)  # warmup: compile the freshly pirated method before timing it
println("Testing Vector{Bool} with custom deserialize_array")
run_test(x; warmup=false)
I’d hazard a guess that dims is only known at runtime in general, so your change hardcoded the concrete type for your tested input x = rand(Bool, N). Thing is, the relevant line with improved type stability would be A[i] = v, which I would have assumed to be optimized even with runtime dims due to the one method for linear set-indexing.
A workaround can be to create the array with a single dimension and then reshape it to the target dims afterwards.
begin
# Baseline: construct the array with a runtime-dependent number of dimensions.
# Per the discussion above, `Array{Bool, length(dims)}(undef, ...)` infers as
# ::Any, so every `A[i] = ...` in the loop goes through runtime dispatch.
# (rand(10:10, 6) always yields [10,10,10,10,10,10]; it only hides the dims
# from the compiler.)
function create_array()
dims = rand(10:10, 6)
A = Array{Bool, length(dims)}(undef, tuple(dims...))
for i in 1:length(A)
A[i] = i % 2 == 0
end
return A
end
# Workaround: fill a flat Vector{Bool} (concretely inferred, fast setindex!)
# and reshape to the target dims only at the end.
function create_array2()
dims = rand(10:10, 6)
A = Vector{Bool}(undef, prod(dims))
for i in 1:length(A)
A[i] = i % 2 == 0
end
return reshape(A, tuple(dims...))
end
# Control: same flat fill, but without the final reshape, to isolate its cost.
function create_array3()
dims = rand(10:10, 6)
A = Vector{Bool}(undef, prod(dims))
for i in 1:length(A)
A[i] = i % 2 == 0
end
return A
end
# Warmup calls so the @time runs below measure runtime, not compilation.
create_array();
create_array2();
create_array3();
println("Testing assigning to obfuscated dims")
a1 = @time create_array();
println("Testing obfuscated dims with reshaping array")
a2 = @time create_array2();
println("Testing flat array with no reshaping ")
a3 = @time create_array3();
# All three variants must produce identical contents.
@assert a1 == a2
@assert a1 == reshape(a3, size(a1))
end
I get the following output:
Testing assigning to obfuscated dims
0.089560 seconds (3.00 M allocations: 61.974 MiB, 2.66% gc time)
Testing obfuscated dims with reshaping array
0.000231 seconds (14 allocations: 977.375 KiB)
Testing flat array with no reshaping
0.000205 seconds (11 allocations: 977.156 KiB)
That’s clever, the reshape changes vectors to Arrays instead of ReshapedArrays, and it is relatively cheap too because the underlying Memory buffer is reused. I don’t know if that’s something we can count on, so maybe directly making a Memory to be wrapped is the way to go now?
Also figured out why the optimization doesn’t happen, I made a bad assumption. That instantiation call you replaced is inferred as ::Any, not ::Array{Bool, N} where N, so the linear setindexing wouldn’t have 1 method to look at anymore. That’s because the call can dispatch to 5 different methods — methods(Array{Bool}.body, (typeof(undef), Dims)) lists them — and the compiler gives up on union-splitting at >3 candidate methods, so it infers ::Any. Julia doesn’t guarantee that called types return their own types (this is one of the many listed wants of a v2 that may never come), so it would need an extra seemingly redundant type assertion.
I found that type asserting works if the number of dims is known at compile time. The allocation is still there if number of dims is dynamic.
begin
# Type assert with a hard-coded dimension count: inference then knows
# A::Array{Bool, 6} after the assert, so the fill loop is fully specialized.
function create_array4()
dims = rand(10:10, 6)
A = Array{Bool, length(dims)}(undef, Tuple(dims))::Array{Bool, 6}
for i in 1:length(A)
A[i] = i % 2 == 0
end
return A
end
# Type assert whose dimension count is itself a runtime value: per the timing
# output below, this does not help inference, so the per-element allocations
# remain.
function create_array5()
dims = rand(10:10, 6)
A = Array{Bool, length(dims)}(undef, Tuple(dims))::Array{Bool, length(dims)}
for i in 1:length(A)
A[i] = i % 2 == 0
end
return A
end
# Warmup to exclude compilation from the timings.
create_array4();
create_array5();
println("Testing hardcoded type assert")
a4 = @time create_array4();
println("Testing runtime type assert")
a5 = @time create_array5();
@assert a4 == a5
end
outputs:
Testing hardcoded type assert
0.000247 seconds (13 allocations: 977.312 KiB)
Testing runtime type assert
0.013480 seconds (999.50 k allocations: 16.205 MiB)
I stand corrected again, and it jogged my memory about something I learned just last week. Improving inference to Array{Bool} and any optimizations from knowing the only setindex! method does not eliminate the runtime dispatch, and that contributes allocations by boxing unboxed inputs or outputs, even when similar techniques improved type inference all the way to concrete types. Your code is actually doing fewer allocations than I’d expect, but I’m still terrible at predicting dispatch boxing. To save a click of the link, it’s hypothetically possible for runtime dispatch to not do that, but that’s not how it’s implemented right now and would require a significant change to Core Julia and the GC.
In the test scripts the number of dims was obfuscated to simulate deserialization, where dims comes from the bitstream and cannot be known at compile time.
When you deserialize Vector{Int} the array is directly read from the bitstream and does not allocate for each element:
using Serialization, Infiltrator
# Element type under test: Int takes the bulk read! fast path in
# deserialize_array instead of the per-element Bool RLE decode.
T = Int
N = 1000000
x = rand(T, N)
# Preallocated backing storage so the IOBuffer never grows mid-benchmark.
buffer = Vector{UInt8}(undef, Int(3e9));
io = IOBuffer(buffer; read=true, write=true)
# Round-trip `x` through the shared globals `io`/`buffer` and verify the
# result; the element type comes from the global `T`. `warmup=true` (default)
# runs untimed so a later timed call measures runtime, not compilation.
function run_test(x; warmup = true)
    empty!(buffer)
    mark(io)
    @assert typeof(x) <: Vector{T}
    warmup ? serialize(io, x) : @time(serialize(io, x))
    reset(io)
    x2 = warmup ? deserialize(io) : @time(deserialize(io))
    # Round-trip must preserve both the container type and the contents.
    @assert typeof(x2) <: Vector{T} && x2 == x
end;
run_test(x)  # warmup: compile before the timed run below
println("Testing Vector{$T} with default deserialize_array")
run_test(x; warmup=false)
Vector{Bool} needs to unpack the run-length-encoded bits — that’s when it runs into the assignment issue. It sounds like the problem is not knowing which setindex! method to call. Perhaps setindex! can be dispatched on the type of the index (a single linear index in our case), and the array object itself can deal with dimensionality?