How to dispatch this serializer function?

"""
    bytes = serialize(v)

Convert a Julia object into a series of bytes (returned as a `Vector{UInt8}`).

An exercise in multiple dispatch style.

Based on
https://de.mathworks.com/matlabcentral/fileexchange/29457-serialize-deserialize
and "julianized" with the experts on
https://discourse.julialang.org/

"""

# Type tags: the one-byte code written in front of every serialized value.
# Each (type, tag) pair below becomes a method `type2byte(::type) = UInt8(tag)`.
for (T, tag) in (
    (Float64, 0), (Float32, 1), (Float16, 2),
    (Bool, 3), (Char, 4), (String, 5),
    (Int8, 6), (UInt8, 7),
    (Int16, 8), (UInt16, 9),
    (Int32, 10), (UInt32, 11),
    (Int64, 12), (UInt64, 13),
    (Tuple, 100),
)
    @eval type2byte(::$T) = UInt8($tag)
end

# Anything without a dedicated tag shares 255: a struct, or an array
# (e.g. an array of structs) passed to `type2byte` as a whole.
type2byte(::Any)           = UInt8(255)
type2byte(::AbstractArray) = UInt8(255)

# Header written in front of every value: a type tag, a dimension count and,
# where applicable, one UInt32 extent per dimension.

"""
    prefix(io, x)

Write the header for a scalar-like value: its type tag followed by a
dimension count of zero.
"""
function prefix(io, x)
    tag = type2byte(x)
    write(io, tag)
    return write(io, UInt8(0))
end

"""
    prefix(io, x::Union{String, Tuple})

Write the header for a string or tuple: type tag, a dimension count of one,
and `length(x)` as a `UInt32`.
"""
function prefix(io, x::Union{String, Tuple})
    tag = type2byte(x)
    write(io, tag)
    write(io, UInt8(1))
    # NOTE(review): for a String, `length` counts characters, not UTF-8 bytes;
    # confirm which of the two the deserializer expects.
    return write(io, UInt32(length(x)))
end

"""
    prefix(io, x::AbstractArray)

Write the header for an array: the type tag of its *first element*,
`ndims(x)` as one byte, then every extent as a `UInt32`.

Because the tag is taken from `first(x)`, `x` must not be empty.
"""
function prefix(io, x::AbstractArray)
    tag = type2byte(first(x))
    write(io, tag)
    write(io, UInt8(ndims(x)))
    extents = map(UInt32, size(x))
    return write(io, extents...)
end

# Writable (homogeneous) object types: values whose payload is emitted with a
# single `write(io, v)` call — real numbers, characters, strings, and arrays
# whose element type is one of those.
#
# Declared `const` because it is used as a type bound in method signatures;
# a non-const global could be rebound later and silently diverge from the
# methods that were already defined against it.
const Writable = Union{<:Real, Char, String,
                       AbstractArray{<:Real}, AbstractArray{Char}, AbstractArray{String}}

"""
    serialize(v)

Entry point: serialize `v` into a fresh in-memory buffer via `_serialize`
and return the accumulated bytes.
"""
function serialize(v)
    buffer = IOBuffer(UInt8[]; append = true)
    _serialize(buffer, v)
    bytes = take!(buffer)
    return bytes
end

"""
    _serialize(io, v::Writable)

Leaf case: print a trace of the value, write its header with `prefix`, then
write the payload with one `write` call. Returns what that `write` returns.
"""
function _serialize(io, v::T) where {T <: Writable}
    # Trace output: the value framed by "(data)" / "(\data)" marker lines.
    println("(data)\n", v, "\n(\\data)")
    prefix(io, v)
    return write(io, v)
end
"""
    _serialize(io, v::Union{Tuple, AbstractArray{Any}})

Heterogeneous container case: write the container header, then serialize
every element in order with whichever `_serialize` method fits it.
"""
function _serialize(io, v::T) where {T <: Union{Tuple, AbstractArray{Any}}}
    println("(Array of Any / Tuple)")
    prefix(io, v)
    # The closure captures `io`, so no `Ref` wrapper is needed.
    return map(item -> _serialize(io, item), v)
end
"""
    _serialize(io, v::AbstractArray)

Array-of-struct case, stored as struct-of-arrays: after the array header,
each field of the element type is gathered across all elements and
serialized as one array of its own.

The field list is taken from the type of `first(v)`.
"""
function _serialize(io, v::AbstractArray)
    println("(Array of Struct)")
    prefix(io, v)
    foreach(fieldnames(typeof(first(v)))) do field
        println(string(field))
        # One column per field: that field's value from every element.
        _serialize(io, getfield.(v, field))
    end
    return nothing
end
"""
    _serialize(io, v)

Fallback for a single struct: write its header, then serialize each field
in declaration order, printing the field name as a trace line first.
"""
function _serialize(io, v)
    println("(Struct)")
    prefix(io, v)
    foreach(fieldnames(typeof(v))) do field
        println(string(field))
        _serialize(io, getfield(v, field))
    end
    return nothing
end

#############################################################

# Test data: one sample per kind of value the serializer distinguishes.
Array_of_Int = [1 ,2]                  # homogeneous numeric array
Array_of_Tuple = [(1, 2), (2, 3)]      # concrete tuple element type: neither Writable nor Array{Any}
Array_of_Any = ["Ab", (1, 2)]          # mixed String/Tuple elements

# Plain three-field struct used for the struct and array-of-struct cases.
mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end
# Convenience constructor: a point with three random coordinates.
Coords() = Coords(rand(), rand(), rand())
Array_of_Struct = [Coords() for i in 1:5]
Single_Struct = Coords()

# Test: serialize each sample. `Int64.(…)` converts the returned bytes to Int64
# values; the result is not assigned or printed here, so it is only visible when
# the statements are evaluated interactively. The `_serialize` methods print
# their own trace lines.
println("Test Array_of_Int")
Int64.(serialize(Array_of_Int))
println()
println("Test Array_of_Tuple")
Int64.(serialize(Array_of_Tuple))
println()
println("Test Array_of_Any")
Int64.(serialize(Array_of_Any))
println()
println("Test Single_Struct")
Int64.(serialize(Single_Struct))
println()
println("Test Array_of_Struct")
Int64.(serialize(Array_of_Struct))

#@code_warntype serialize(Array_of_Any)


Here would be my start. (there might be some typos, but I think you’ll get the gist).

# Encode number types.
#
# The class code is a property of the *type*, so the table dispatches on
# `Type{...}`. `classToByte(Float64)` and `classToByte(1.0)` both work: the
# last method forwards any instance to the code of its type.
classToByte(::Type{Float64}) = Int8(0)
classToByte(::Union{Type{Float32}, Type{Float16}}) = Int8(1)
classToByte(::Type{Bool}) = Int8(2)
classToByte(::Type{Char}) = Int8(3)
classToByte(::Type{Int8}) = Int8(4)
classToByte(::Type{UInt8}) = Int8(5)
classToByte(::Type{Int16}) = Int8(6)
classToByte(::Type{UInt16}) = Int8(7)
classToByte(::Type{Int32}) = Int8(8)
classToByte(::Type{UInt32}) = Int8(9)
classToByte(::Type{Int64}) = Int8(10)
classToByte(::Type{UInt64}) = Int8(11)
# Any other type has no class code. The message names the offending type
# (the original interpolated an undefined variable and failed with an
# `UndefVarError` instead of this error).
classToByte(T::Type) = error("Unknown number type: $T")
# Instances are encoded by their type.
classToByte(x) = classToByte(typeof(x))

"""
    serializej2m(v)

Reinterpret a Julia object into a Matlab object's series of bytes and
return them as a byte vector.

The value is serialized into a fresh in-memory buffer by the two-argument
`serializej2m(io, v)` methods; this method only creates the buffer and
hands back its contents.

Based on
https://de.mathworks.com/matlabcentral/fileexchange/29457-serialize-deserialize
"""
function serializej2m(v)
    io = IOBuffer(UInt8[]; append = true)
    # The buffer comes first, matching the `serializej2m(io, v)` methods.
    serializej2m(io, v)
    return take!(io)
end

"""
    serializej2m(io, v::Real)

Write a real scalar as a 1×1 matrix: class byte, dimension count 2, both
extents (1 and 1) as `UInt32`, then the value itself. Returns the result of
the final `write`.
"""
function serializej2m(io, v::Real)
    # Data type. Pass the value (not its type): `classToByte` dispatches on it.
    write(io, classToByte(v))
    # Number of dimensions.
    write(io, UInt8(2))
    # Dimensions.
    write(io, UInt32(1))
    write(io, UInt32(1))
    # Data.
    return write(io, v)
end

"""
    serializej2m(io, v::Union{Char,String})

Write a character or string as a 1×n character row: the `Char` class byte,
dimension count 2, extents 1 and n as `UInt32`, then the text.

`n` is the number of bytes that `write` emits (the UTF-8 code units), so the
header always matches the payload, including for non-ASCII text.
"""
function serializej2m(io, v::Union{Char,String})
    # Data type. Strings share the character class; a sample Char is passed
    # because `classToByte` dispatches on a value.
    write(io, classToByte(' '))
    # Number of dimensions.
    write(io, UInt8(2))
    # Dimensions: 1 x (number of bytes written below).
    nbytes = ncodeunits(string(v))
    write(io, UInt32(1))
    write(io, UInt32(nbytes))
    # Data.
    return write(io, v)
end
"""
    serializej2m(io, v::AbstractArray{<:Real})

Matlab matrix case: write a numeric array as class byte, dimension count
(at least 2), one `UInt32` extent per dimension, then the elements.

A vector of length n is reported as n×1, since `size(v, d)` is 1 for any
`d` beyond `ndims(v)`.

Throws an `ArgumentError` when the element type is abstract, because such
an array has no single class code.
"""
function serializej2m(io, v::AbstractArray{T}) where {T<:Real}
    isconcretetype(T) ||
        throw(ArgumentError("cannot store an array with abstract element type $T as a matrix"))
    # Data type. `zero(T)` supplies a value of the element type for
    # `classToByte` to dispatch on, and works for empty arrays too.
    write(io, classToByte(zero(T)))
    # Number of dimensions.
    nd = max(2, ndims(v))
    write(io, UInt8(nd))
    # Dimensions.
    for d in 1:nd
        write(io, UInt32(size(v, d)))
    end
    # Data.
    return write(io, v)
end

"""
    serializej2m(io, v::AbstractArray)

Matlab cell array case, for any content not fitting into a Matlab matrix:
write the cell class byte (254), the dimension count (at least 2), one
`UInt32` extent per dimension, then every element serialized in iteration
order straight into `io`.
"""
function serializej2m(io, v::AbstractArray)
    # Data type.
    write(io, UInt8(254)) # 254 = cell.
    # Number of dimensions.
    nd = max(2, ndims(v))
    write(io, UInt8(nd))
    # Dimensions: exactly one extent per reported dimension.
    for d in 1:nd
        write(io, UInt32(size(v, d)))
    end
    # Serialize each member directly into `io`; no temporary buffer per element.
    for item in v
        serializej2m(io, item)
    end
    return nothing
end
2 Likes

Basically, any time you’re saying

if typeof(x) == Y
    #choose which function to run
end

, you mean dispatch.

1 Like

This one

    # Just serialize each member.
    for ii = 1:numel(v)
        write(io, serializej2m(v[ii]))
    end

can be slow. Probably it’s better to expand it with union splitting.

Yes, there are two bad things on those lines: unnecessary linear indexing, and unnecessarily creating an intermediate result that is then copied into `io`.

serializej2m.(Ref(io), v)
3 Likes

Whoa, thank you!
Does the order in which the methods are defined matter? Or is the order always from specific (::type) to general (cls)?
I haven’t found this covered in the manual or in textbooks.

Only found Tim’s explanation, but could you elaborate for this case?

OK for the element-wise operator. But what is the Ref needed for?

It chooses its methods from specific to general.

Ref is like a container for a value, that (in this instance) tells Julia you don’t want to broadcast across it.

println.(stdout) gives an error, (no method matching length(::Base.TTY)). Essentially, you can’t broadcast across an IO. But you can broadcast across a Ref{IO}, and the result is to always return that one IO. So

serializej2m.(io, v) # errors, because you can't take "every element of io"
serializej2m.(Ref(io), v) # works, takes every element of v and uses it with io
2 Likes

Thx, clear now.

The order does not matter. Julia will always pick the most specific method, no matter the order they were defined in (assuming no method is overwritten by another, that is).

2 Likes

Lots of repetitions in the code. I think this covers Numbers, Characters and Strings, and arrays thereof.

It doesn’t do Arrays of Any, because I have to leave the house. There probably needs to be some special handling of that, an extra _serialize method. But there would be virtually no code repetition, I’ll wager.

Warning: Totally untested:

# Encode number types.
#
# The class code is a property of the type, so the table dispatches on
# `Type{...}`; instances are forwarded to the code of their type. All codes
# are `UInt8`, so the function returns one type whatever the input (the byte
# written to the stream is the same as before).
classToByte(::Type{Float64}) = UInt8(0)
classToByte(::Union{Type{Float32}, Type{Float16}}) = UInt8(1)
classToByte(::Type{Bool}) = UInt8(2)
classToByte(::Union{Type{Char}, Type{String}}) = UInt8(3)
classToByte(::Type{Int8}) = UInt8(4)
classToByte(::Type{UInt8}) = UInt8(5)
classToByte(::Type{Int16}) = UInt8(6)
classToByte(::Type{UInt16}) = UInt8(7)
classToByte(::Type{Int32}) = UInt8(8)
classToByte(::Type{UInt32}) = UInt8(9)
classToByte(::Type{Int64}) = UInt8(10)
classToByte(::Type{UInt64}) = UInt8(11)
# Every other type gets the catch-all code 254.
classToByte(::Type) = UInt8(254)
# Instances are encoded by their type.
classToByte(x) = classToByte(typeof(x))
# Arrays of numbers or characters carry the code of their element type
# rather than the catch-all; all other arrays fall through to 254.
classToByte(x::AbstractArray{<:Union{Real, Char}}) = classToByte(eltype(x))

"""
    dimensions(x)

Return the size header for `x` as a tuple: the dimension count as `UInt8`
followed by one `UInt32` extent per dimension.

Scalars and other non-array values are described as a 1×`length(x)` row.
"""
dimensions(x) = (UInt8(2), UInt32(1), UInt32(length(x)))

# Text is described by the number of bytes `write` emits (UTF-8 code units),
# so the header matches the payload for non-ASCII characters too.
dimensions(x::Union{Char, AbstractString}) =
    (UInt8(2), UInt32(1), UInt32(ncodeunits(string(x))))

# Arrays report at least two dimensions and emit exactly that many extents.
# `size(x, d)` is 1 for `d > ndims(x)`, so a length-n vector becomes n×1.
function dimensions(x::AbstractArray)
    nd = max(2, ndims(x))
    return (UInt8(nd), ntuple(d -> UInt32(size(x, d)), nd)...)
end

"""
    prefix(v)

Return the complete header for `v`: its class byte followed by the fields
from `dimensions(v)`.
"""
prefix(v) = (classToByte(v), dimensions(v)...)

"""
    serializej2m(v)

Serialize `v` into a fresh in-memory buffer via `_serialize` and return the
bytes that were written.
"""
function serializej2m(v)
    buffer = IOBuffer(UInt8[]; append = true)
    _serialize(buffer, v)
    bytes = take!(buffer)
    return bytes
end

"""
    _serialize(io, v)

Write the header fields of `v` (from `prefix(v)`) one after another, then the
value itself. Returns what the final `write` of the data returns.
"""
function _serialize(io, v)
    # Meta information: class byte, dimension count, extents.
    for field in prefix(v)
        write(io, field)
    end
    # Data
    return write(io, v)
end

My code may be buggy or even wrong, but I think this can be expressed extremely concisely with multiple dispatch, using functions like prefix etc.

But frankly, I think this could have been expressed almost as concisely in Matlab, too. The original code was just not well organized.

1 Like

Actually, I think it would be cleaner to let the serializer report the correct dimensionality, e.g. zero for scalars, instead of max(2,ndims). Then the deserializer could take care of adjusting it if that language cannot represent scalars.

Now, the serializer is throwing away information needlessly, and misrepresenting the data, and even storing it less efficiently.

2 Likes

Right.

A lot to learn from, thx!

Remaining from @Oscar_Smith’s code, how could you dispatch this?

if isstructtype(typeof(v))

You can’t. (Although `isstructtype` probably doesn’t do what you think it does, and probably isn’t the right thing to use.)

2 Likes

You could put applicable(propertynames, x) in the fallback function. Or assume that you catch everything that is not a composite type, so that the fallback simply uses propertynames.

Edit: all of my guesses were wrong, but the principle holds.

1 Like

Indeed, I have some doubts:
isstructtype(typeof(v)) # gives true for v=[1], issue?

good to know applicable(), thx.