(Not) Giving up on dispatch

return reinterpret(T, read_vals)[1]

Thx for the only(), what a rich language.
Would be nice to have grouped lists of one-liner doc for an overview.

Agreed - in that context, why try to handle mixed element arrays more than necessary? As I understand it, the large matrices and vectors should all be homogenous anyway, for performance reasons. You certainly don’t want to dispatch dynamically in a hot loop.

However, I think this can throw a wrench into your serialization:

julia> a = Any[1,2,4] 
3-element Vector{Any}:
 1                    
 2                    
 4                    

Even though the vector’s element type is Any, all its elements have the same type. If you don’t want to lose this information, you’ll have to handle cases like this specially.

Well, you could branch on whether the eltype of the vector is a union:

julia> a = Union{Float64, UInt8}[0x1, 2.0]
2-element Vector{Union{Float64, UInt8}}:  
 0x01                                     
    2.0                                   
                                          
julia> eltype(a) isa Union
true                      

Ugly, but admittedly it would work :man_shrugging: I see why you’d say that you lose assumed homogeneity, but in that case, again why handle mixed vectors at all? Is this really the common case or is this trying to anticipate something you don’t know you’ll hit? And if you want to assume homogeneity anyway, only writing the type tag per element in the case of actual mixed types is a must, since you don’t know which type any given element will have.

I’d propose using a flag to store the information whether it’s a homogenous array (and thus the elements have no type tag) or not. That would make reading of homogenous arrays the fast & easy case, and in the non-homogenous case you have to parse either way.

It’s not an argument for performance or such (though maybe one could be made, since I see you recursing… may not be impactful enough in terms of parsing, would have to be tested, as there’s no tail-call optimization in julia and type stability may come into play here…), it’s an argument for maintainability. Separating different methods into their own implementations definitely makes it easier to extend the code in the future, because instead of having to modify two places (your type lookup table as well as this big function) you only have to modify one (the type table) and add the new method wherever you want. You don’t have to touch old & trusted code at all when adding a new method.


I guess we’re getting down into the weeds of what data you expect to see and what you want to be able to exchange over the wire. A lot of julia software doesn’t return naive clean Vectors, because specialized structures for holding simulation results are often more efficient. One example is SVector from StaticArrays.jl, which powers almost all scientific software packages in julia. Right now, you’d assume that this high performance data structure is just another struct, and sending & receiving loses a lot of type information, probably leading to abysmal performance once the data is back on the julia side.

Bought.

And thx for the reference to SVector.

Small update with your recommendations.

"""
io = serialize(v) reinterprets a Julia object into a series of bytes.
v = deserialize(io) recreates the data from a byte stream

As much information as possible is written and read together, using the following grouping:
- NumChar: writable
- Mixed: Any and Tuple types, serialize elements
- Struct: struct types, serialize fields.

An exercise in dispatch style, based on
https://de.mathworks.com/matlabcentral/fileexchange/29457-serialize-deserialize
and "julianized" with the experts on
https://discourse.julialang.org/

"""

# read/write routines
import Base.write
"""
    write(io::IO, x::Tuple)

Write every element of `x` to `io` in order and return the total number of
bytes written. An empty tuple writes nothing and returns `0`.
"""
# Splatting an empty tuple would call `write(io)`, which has no method, so the
# empty case is answered directly.
Base.write(io::IO, x::Tuple) = isempty(x) ? 0 : write(io, x...)
import Base.read
"""
    read(io, T::Type{<:Number})

Read `sizeof(T)` bytes from `io` and reinterpret them as a single value of
type `T`. `only` raises an error unless the bytes form exactly one value.
"""
function read(io, T::Type{<:Number})
    raw = Base.read(io, sizeof(T))
    decoded = reinterpret(T, raw)
    return only(decoded)
end
"""
    read(io, T::Type{<:Union{Number, Char}}, n)

Read `n` consecutive raw values of type `T` from `io` and return them as a
reinterpreted view over the bytes that were read.

`Char` is accepted alongside `Number` because dense `Char` arrays are written
as raw fixed-width values, just like numeric arrays.

# Throws
- `EOFError` if `io` ends before `n * sizeof(T)` bytes are available.
"""
function read(io, T::Type{<:Union{Number, Char}}, n)
    nbytes = n * sizeof(T)
    read_vals = Base.read(io, nbytes)
    # a short read would otherwise surface later as an unrelated
    # reinterpret/reshape error
    length(read_vals) == nbytes || throw(EOFError())
    return reinterpret(T, read_vals)
end

# Type encoding
"Tag type standing for \"a struct, stored field by field\" in the type table."
struct Struct end

"Tag type standing for \"an array whose elements each carry their own header\"."
struct Mixed end

# One row per supported type: the byte code written to the stream and the
# Julia type it stands for.
# All tables below are `const`: they are read on every serialize/deserialize
# call, and non-const globals force a dynamic lookup each time.
const tcode = [
0   Float64
1   Float32
2   Float16
3   Bool
4   Int8
5   UInt8
6   Int16
7   UInt16
8   Int32
9   UInt32
10  Int64
11  UInt64
12  Char
13  String
100 Tuple
200 Mixed
255 Struct
]

# Lookup in both directions, derived from the single table above.
const tcode2type = Dict(tcode[:,1] .=> tcode[:,2])
const type2tcode = Dict(tcode[:,2] .=> tcode[:,1])

# Element types that are handed to `write` directly.
const NumChar = Union{Number, Char}

"""
    serialize(io, v::NumChar)

Write a scalar number or character: type code byte, a zero dimension-count
byte, then the value itself. Returns what `write` returns.
"""
function serialize(io, v::T) where {T<:NumChar}
    println("NumChar")
    tag = UInt8(type2tcode[T])
    nodims = UInt8(0)  # zero dimensions marks a scalar
    return write(io, tag, nodims, v)
end
"""
    serialize(io, v::AbstractArray{<:NumChar})

Write an array of numbers or characters: type code byte, dimension-count byte,
each dimension as `UInt32`, then the elements as one raw block.
Returns what `write` returns.
"""
function serialize(io, v::AbstractArray{T}) where {T<:NumChar}
    println("NumChar")
    tag = UInt8(type2tcode[T])
    # The reader expects fixed-width raw elements. `write` only guarantees
    # that layout for a dense Array: a BitArray is written as packed chunks
    # and a generic Char container is written element-wise as UTF-8.
    # `convert` returns `v` itself when it already is an Array.
    dense = convert(Array{T}, v)
    return write(io, tag, UInt8(ndims(v)), UInt32.(size(v))..., dense)
end
"""
    serialize(io, v::Tuple)

Write a tuple: the Tuple type code, a dimension count of 1, the element count
as `UInt32`, then every element through its own `serialize` method.
Returns a tuple holding the result of each element's `serialize` call.
"""
function serialize(io, v::T) where {T<:Tuple}
    println("Tuple")
    header = (UInt8(type2tcode[Tuple]), UInt8(1), UInt32(length(v)))
    write(io, header...)
    return map(item -> serialize(io, item), v)
end
"""
    serialize(io, v::String)

Write a single string: the String type code, a dimension count of 1, the
number of code units as `UInt32`, then the string data.
Returns what `write` returns.
"""
function serialize(io, v::String)
    println("String")
    nunits = UInt32(ncodeunits(v))
    return write(io, UInt8(type2tcode[String]), UInt8(1), nunits, v)
end
"""
    serialize(io, v::AbstractArray{String})

Write an array of strings: a header with type code, dimension count and
dimensions, followed by every element through `serialize(io, ::String)`.

One-dimensional arrays are tagged `Mixed` instead of `String`, because a
String tag with a dimension count of 1 is the encoding of a single scalar
string (the dimension then holds its byte length) and would be read as such.
Returns the array of per-element `serialize` results.
"""
function serialize(io, v::AbstractArray{String})
    println("String")
    # a Mixed array is read element by element, which rebuilds the strings
    tag = ndims(v) == 1 ? type2tcode[Mixed] : type2tcode[String]
    write(io, UInt8(tag), UInt8(ndims(v)), UInt32.(size(v))...)
    return serialize.(Ref(io), v)
end
"""
    serialize(io, v::AbstractArray{<:Tuple})

Write an array of tuples under the Mixed type code: header with dimension
count and dimensions, then each tuple through `serialize`.
Returns the array of per-element `serialize` results.
"""
function serialize(io, v::AbstractArray{T}) where {T<:Tuple}
    println("Tuple")
    shape = UInt32.(size(v))
    write(io, UInt8(type2tcode[Mixed]), UInt8(ndims(v)), shape...)
    return map(item -> serialize(io, item), v)
end
"""
    serialize(io, v::AbstractArray{Any})

Write an array with element type `Any` under the Mixed type code: header with
dimension count and dimensions, then each element with its own header.
Returns the array of per-element `serialize` results.
"""
function serialize(io, v::AbstractArray{Any})
    println("Any")
    rank = UInt8(ndims(v))
    extents = UInt32.(size(v))
    write(io, UInt8(type2tcode[Mixed]), rank, extents...)
    return serialize.(Ref(io), v)
end
"""
    serialize(io, v::AbstractArray{T})

Fallback for arrays of any other element type, treated as arrays of structs.
Writes the Struct type code, dimension count and dimensions, the field count
as `UInt32`, and then, per field, the field name (length byte plus text)
followed by the serialized array of that field's values across `v`.
"""
function serialize(io, v::AbstractArray{T}) where T
    println("Struct, eltype=$T)")
    write(io, UInt8(type2tcode[Struct]), UInt8(ndims(v)), UInt32.(size(v))...)
    write(io, UInt32(fieldcount(T)))
    foreach(fieldnames(T)) do name
        label = String(name)
        write(io, UInt8(ncodeunits(label)), label)
        # one column per field: the values of `name` over all elements
        serialize(io, getfield.(v, name))
    end
end
"""
    serialize(io, v)

Fallback for a single value of any other type, treated as a struct.
Writes the Struct type code, a zero dimension count, the field count as
`UInt32`, and then, per field, the field name (length byte plus text)
followed by the serialized field value.
"""
function serialize(io, v::T) where T
    println("Struct, type=$T)")
    write(io, UInt8(type2tcode[Struct]), UInt8(0))
    write(io, UInt32(fieldcount(T)))
    foreach(fieldnames(T)) do name
        label = String(name)
        write(io, UInt8(ncodeunits(label)), label)
        serialize(io, getfield(v, name))
    end
end

"""
    deserialize(io)

Read one value from `io`: decode the header (type code byte, dimension-count
byte, and one `UInt32` per dimension) and hand over to the `deserialize`
method for the decoded type. A dimension count of zero yields `dms = 1`.
"""
function deserialize(io)
    code = Int(read(io, UInt8))
    type = tcode2type[code]
    ndms = Int(read(io, UInt8))
    if ndms == 0
        dms = 1
    else
        dms = Int.(read(io, UInt32, ndms))
    end
    return deserialize(io, type, ndms, dms)
end
"""
    deserialize(io, ::Type{T}, ndms, dms) where T<:NumChar

Read a number or character payload: a single value when `ndms == 0`,
otherwise `prod(dms)` values reshaped to the dimensions `dms`.
"""
function deserialize(io, ::Type{T}, ndms, dms) where T<:NumChar
    if ndms == 0
        return read(io, T)
    end
    flat = read(io, T, prod(dms))
    return reshape(flat, dms...)
end
"""
    deserialize(io, ::Type{Struct}, ndms, dms)

Read a struct payload: a `UInt32` field count, then per field a name (length
byte plus text) and a serialized value.

Returns a `NamedTuple` for a scalar struct (`ndms == 0`), otherwise an array of
`NamedTuple`s with dimensions `dms`, built from the per-field value arrays.
"""
function deserialize(io, ::Type{Struct}, ndms, dms)
    fname = Symbol[]
    fdata = []
    for _ in 1:read(io, UInt32)
        namelen = read(io, UInt8)
        push!(fname, Symbol(String(read(io, namelen))))
        push!(fdata, deserialize(io))
    end
    if ndms == 0
        # each field holds exactly one value: pair names with values directly
        return NamedTuple(zip(fname, fdata))
    else
        # with no fields there are no columns to iterate, so the element
        # count has to come from the dimensions
        isempty(fdata) && return fill(NamedTuple(), dms...)
        # each field holds one array; walk them in lockstep, one record per
        # element position
        records = [NamedTuple(zip(fname, vals)) for vals in zip(fdata...)]
        return reshape(records, dms...)
    end
end
"""
    deserialize(io, ::Type{String}, ndms, dms)

Read a string payload. With `ndms == 1` the single dimension is the byte
length of one string; otherwise `prod(dms)` serialized strings follow and are
returned as an array with dimensions `dms`.
"""
function deserialize(io, ::Type{String}, ndms, dms)
    ndms == 1 && return String(read(io, only(dms)))
    items = [deserialize(io) for _ in 1:prod(dms)]
    return reshape(String.(items), dms...)
end
"""
    deserialize(io, ::Type{Mixed}, ndms, dms)

Read `prod(dms)` values, each with its own header, and return them as an
array with dimensions `dms`.
"""
function deserialize(io, ::Type{Mixed}, ndms, dms)
    total = prod(dms)
    elements = [deserialize(io) for _ in 1:total]
    return reshape(elements, dms...)
end
"""
    deserialize(io, ::Type{Tuple}, ndms, dms)

Read a tuple: the single entry of `dms` is the element count, and that many
values, each with its own header, follow.
"""
function deserialize(io, ::Type{Tuple}, ndms, dms)
    len = only(dms)
    return Tuple(deserialize(io) for _ in 1:len)
end

############## Tests ##############

"""
    Coords(x, y, z)
    Coords()

Mutable point with three `Float64` coordinates, used as struct test data.
The zero-argument form fills each coordinate with a fresh `rand()` value.
"""
mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end

function Coords()
    return Coords(ntuple(_ -> rand(), 3)...)
end

# Sample values, one per kind of input the serialize methods distinguish.
# Assign any of them to `data` below to run it through round_trip.
Array_of_Int = [1, 2]
Array_of_Tuple = [(1, 2), (2, 3)]
Array_of_Any = ["Ab", (1, 2)]
# `pi` itself is an Irrational, which has no entry in the type-code table;
# store it as Float64 so this sample can actually be serialized
Single_Num = Float64(pi)
Array_of_Num = randn(3,3)
Single_Struct = Coords()
Array_of_Struct = [Coords() for i in 1:5]
Single_Tuple = ("Ab", [pi, 2.0])
Single_String = "toto"
Array_of_String = ["Ab" "toto"; "titi" "ok"]
Array_of_Char = ['a' 'b'; 'c' 'd']

"""
    round_trip(data)

Serialize `data` into the file `io.bin`, read it back with `deserialize`, and
return the reconstructed value. Prints a marker line between the two phases.
"""
function round_trip(data)
    path = "io.bin"
    open(io -> serialize(io, data), path, "w")
    println("..deserialize..")
    restored = open(deserialize, path, "r")
    return restored
end
# Pick a sample, send it through the file round trip, and show the original
# next to the reconstructed value.
data = Array_of_Struct
data2 = round_trip(data)
hcat(data, data2) # simple side-by-side