(Not) Giving up on dispatch

Bardo · August 22, 2021, 8:40am

In another post, I got the comment

…the code you’ve posted here was not one of the recommended ways of doing …

Reason is, I could not reasonably put all conditions into regular dispatch (functions with the same name but different type arguments), they were just too many.
Then, if I understand correctly, checks of the form `if x isa T` or `if typeof(x) == T` are handled by union splitting and get compiled anyway.

My vehicle to experiment with the Julia type and dispatch universe:

(de-)serializer

"""
io = serialize(v) reinterprets a Julia object into a series of bytes.
v = deserialize(io) recreates the data from a byte stream

An exercise in dispatch style.

Based on
https://de.mathworks.com/matlabcentral/fileexchange/29457-serialize-deserialize
and "julianized" with the experts on
https://discourse.julialang.org/

"""

# Type encoding: column 1 is the numeric code stored in the stream,
# column 2 is the Julia type that the code stands for.
tcode = [
    0    Float64
    1    Float32
    2    Float16
    3    Bool
    4    Int8
    5    UInt8
    6    Int16
    7    UInt16
    8    Int32
    9    UInt32
    10   Int64
    11   UInt64
    12   Char
    13   String
    100  Tuple
    200  Any
]
# Code used for structs (not part of the table above).
STRUCT = 255
# Codes whose values are read back directly as numbers/characters.
WRITABLE = 0:12

# Lookup tables in both directions, built from the two table columns.
tcode2type = Dict(Pair.(tcode[:, 1], tcode[:, 2]))
type2tcode = Dict(Pair.(tcode[:, 2], tcode[:, 1]))

# Write the common header: type code (UInt8), dimension count (UInt8),
# then one UInt32 per dimension. `dims` may be empty for scalars.
function _write_header(io, code, dims)
    write(io, UInt8(code), UInt8(length(dims)))
    foreach(d -> write(io, UInt32(d)), dims)
end

# Write the field count of `T`, then for every field its name (UInt8 byte
# length followed by the name bytes) and the serialized value returned by
# `fieldvalue(name)`.
function _serialize_fields(io, T, fieldvalue)
    write(io, UInt32(fieldcount(T)))
    for name in fieldnames(T)
        sname = String(name)
        write(io, UInt8(ncodeunits(sname)))
        write(io, sname)
        serialize(io, fieldvalue(name))
    end
end

"""
    serialize(io, v)

Write `v` to `io` as a header (type code, dimension count, extents) followed
by the payload.

- Tuples: header with the tuple length, then every element serialized in turn.
- Strings: header with the length in bytes, then the string bytes.
- `Real`/`Char` scalars and arrays: header, then the raw values.
- Arrays with element type `Any`, `String` or a tuple type: header with the
  `Any` code, then every element serialized in turn.
- Arrays of a concrete struct type: header with the `STRUCT` code, then each
  field stored as one array holding that field of every element.
- Any other value with at least one field: `STRUCT` header with zero
  dimensions, then each field serialized in turn.

Throws an error when none of these cases applies.
"""
function serialize(io, v)
    te = eltype(v)
    if v isa Tuple
        _write_header(io, type2tcode[Tuple], (length(v),))
        serialize.(Ref(io), v)
    elseif v isa String
        # The reader consumes this many *bytes*, so store the byte count
        # rather than the character count.
        _write_header(io, type2tcode[String], (ncodeunits(v),))
        write(io, v)
    elseif te <: Real || te == Char
        _write_header(io, type2tcode[te], size(v))
        # write(io, ::Char) emits variable-length UTF-8, while the reader takes
        # sizeof(Char) raw bytes; write the fixed-width bit pattern instead.
        write(io, v isa Char ? reinterpret(UInt32, v) : v)
    elseif v isa AbstractArray
        if te == Any || te <: Tuple || te == String
            _write_header(io, type2tcode[Any], size(v))
            serialize.(Ref(io), v)
        elseif isconcretetype(te) && fieldcount(te) > 0
            # Array of homogeneous structs, stored as a struct of arrays.
            _write_header(io, STRUCT, size(v))
            _serialize_fields(io, te, name -> getfield.(v, name))
        else
            error("no match for te=$te")
        end
    elseif fieldcount(typeof(v)) > 0
        # eltype of a plain struct is Any, so inspect typeof(v) instead.
        _write_header(io, STRUCT, ())
        _serialize_fields(io, typeof(v), name -> getfield(v, name))
    else
        error("expected struct, but did not find any field")
    end
end

"""
    deserialize(io)

Read one value from `io`: first the type code and dimension count (one byte
each), then one `UInt32` per dimension, then the payload selected by the type
code.

Structs come back as `NamedTuple`s (an array of them when dimensions were
stored), tuples as `Tuple`s, and mixed arrays as arrays of `Any`.
Throws an error for a type code that is not handled.
"""
function deserialize(io)
    ity = Int(readnum(io, UInt8))
    ndms = Int(readnum(io, UInt8))
    dms = ndms == 0 ? 1 : Int.(readnum(io, UInt32, ndms))
    if ity in WRITABLE
        cls = tcode2type[ity]
        return ndms == 0 ? readnum(io, cls) : reshape(readnum(io, cls, prod(dms)), dms...)
    elseif ity == STRUCT
        fname = Symbol[]
        nfld = readnum(io, UInt32)
        fdata = []
        for _ in 1:nfld
            fn = readstr(io, readnum(io, UInt8))
            push!(fname, Symbol(fn))
            push!(fdata, deserialize(io))
        end
        if ndms == 0
            # Single struct: every entry of fdata is the value of one field,
            # so pair names and values directly.
            return (; zip(fname, fdata)...)
        else
            # Struct of arrays: regroup the i-th entries of all field arrays
            # into one NamedTuple per element.
            return reshape(NamedTuple.(zip.(Ref(fname), zip(fdata...))), dms...)
        end
    elseif ity == type2tcode[String]
        if ndms == 1
            return String(read(io, dms[1]))
        else
            se = String[]
            for _ in 1:prod(dms)
                push!(se, deserialize(io))
            end
            return reshape(se, dms...)
        end
    elseif ity == type2tcode[Any] || ity == type2tcode[Tuple]
        istuple = ity == type2tcode[Tuple]
        ele = []
        for _ in 1:prod(dms)
            push!(ele, deserialize(io))
        end
        if istuple
            return Tuple(ele)
        else
            return reshape(ele, dms...)
        end
    else
        error("unknown type index $ity")
    end
end

############## IO read/write routines ##############
# Convert `num` to type `T` and write the converted value to `io`.
writenum(io, num, T) = write(io, T(num))
# Read `n` values of type `T` from `io` and return them as a `Vector{T}`.
# `read!` throws an `EOFError` when the stream ends early, instead of silently
# returning a partly zero-filled result.
function readnum(io, T, n)
    return read!(io, Vector{T}(undef, n))
end
# Read a single value of type `T` from its raw bytes in `io`.
# `read!` throws an `EOFError` when fewer than `sizeof(T)` bytes are left,
# instead of silently returning a zero-padded value.
function readnum(io, T)
    return read!(io, Vector{T}(undef, 1))[1]
end
# Write the code units (bytes) of `str` to `io`.
writestr(io, str) = write(io, codeunits(str))
# Read `n` bytes from `io` and turn them into a String.
function readstr(io, n)
    bytes = read(io, n)
    return String(bytes)
end

############## Tests ##############

# Serialize `data` to the file `path`, read it back and return the
# deserialized value. Both file handles are opened through the function form
# of `open`, so they are closed even when (de-)serialization throws.
function round_trip(data, path="io.bin")
    println(data)
    open(path, "w") do io
        serialize(io, data)
    end

    println("..deserialize..")

    return open(deserialize, path, "r")
end

# Three-field record used as the struct test case.
mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end
# Zero-argument constructor: fills all three fields with rand().
Coords() = Coords(rand(), rand(), rand())

# Sample inputs, named after the kind of value they hold.
Array_of_Int = [1, 2]
Array_of_Tuple = [(1, 2), (2, 3)]
Array_of_Any = ["Ab", (1, 2)]
# NOTE(review): `pi` is an Irrational, which has no entry in the type table — confirm serialize accepts it.
Single_Num = pi
Array_of_Num = randn(3,3)
Single_Struct = Coords()
Array_of_Struct = [Coords() for i in 1:5]
Single_Tuple = ("Ab", [pi, 2.0])
Single_String = "toto"
Array_of_String = ["Ab" "toto"; "titi" "ok"]
Array_of_Char = ['a' 'b'; 'c' 'd']

# Write one sample to disk and read it back.
round_trip(Array_of_Any)

Sukera · August 22, 2021, 8:58am

I mean, the function by @DNF here seems pretty much optimal to me? The union splitting mentioned further down is only relevant if you want to eke out the absolute maximum of performance (and wouldn’t be done how you’ve done it in your OP here anyway).

As has been mentioned in your original thread, trying to differentiate structs from Any is a fool’s errand - there is no semantic difference.

DNF · August 22, 2021, 10:08am

I don’t think the problems you’re having are that much to do with dispatch or not dispatch. The problem is that you are taking the wrong approach, by writing a sort of “stream of consciousness” code, with all the code written out in full for each combination of type and container. The complexity grows like a bush.

Try to divide the problem into logical units instead. In the toy example I wrote previously, I divided the work into ‘write metainformation’, and ‘write data’. That simplified the code tremendously. Can’t you take a similar approach?

Bardo · August 22, 2021, 10:36am

Thanks for joining. Your approach works indeed nicely for the dimension part of the metainformation:

mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end
Coords() = Coords(rand(), rand(), rand())

Array_of_Int = [1, 2]
Array_of_Tuple = [(1, 2), (2, 3)]
Array_of_Any = ["Ab", (1, 2)]
Single_Num = pi
Array_of_Num = randn(3,3)
Single_Struct = Coords()
Array_of_Struct = [Coords() for i in 1:5]
Single_Tuple = ("Ab", [pi, 2.0])
Single_String = "toto"
Array_of_String = ["Ab" "toto"; "titi" "ok"]
Array_of_Char = ['a' 'b'; 'c' 'd']

# Encode extents as bytes: one UInt8 dimension count followed by one UInt32
# per dimension. A plain array literal such as [UInt8(1), UInt32(n)] would
# promote every entry to UInt32, so the count would be written as 4 bytes.
_dimbytes(dims) = vcat(UInt8(length(dims)), reinterpret(UInt8, collect(UInt32, dims)))

# Dimension header written in front of the data of `v`.
dimensions(v) =                UInt8(0) # catches Real, Char and "Struct"
# Strings are written as bytes, so the extent is the byte count.
dimensions(v::String) =        _dimbytes((ncodeunits(v),))
dimensions(v::Tuple) =         _dimbytes((length(v),))
dimensions(v::AbstractArray) = _dimbytes(size(v))

But for the type and data part, I was not able to avoid the complexity growth.

One of the reasons is recognizing structs. I lack a dispatchable way to recognize if I am dealing with a structure. In the code above this is solved by excluding the other types.

The more general reason for the if typeof elseif form is that it seems to give more expressivity compared to regular dispatch where you have only one type (Union appears to create a performance hit).

Another multiplier were some “special” cases:

Assuming homogeneity, store an array of structs as a struct of arrays.
The duality of 1.0 and [1.0] (resolved in dimensions above) can cause a code doubling with and without . (broadcasting).

Bardo · August 22, 2021, 11:44am

With some knowledge picked up, it looks like I can dispatch using the combination Real, Char, String, Tuple, Any; the rest can safely(?) be regarded as a struct:

mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end
Coords() = Coords(rand(), rand(), rand())

Array_of_Int = [1, 2]
Array_of_Tuple = [(1, 2), (2, 3)]
Array_of_Any = ["Ab", (1, 2)]
Single_Num = pi
Array_of_Num = randn(3,3)
Single_Struct = Coords()
Array_of_Struct = [Coords() for i in 1:5]
Single_Tuple = ("Ab", [pi, 2.0])
Single_String = "toto"
Array_of_String = ["Ab" "toto"; "titi" "ok"]
Array_of_Char = ['a' 'b'; 'c' 'd']

# Encode extents as bytes: one UInt8 dimension count followed by one UInt32
# per dimension. A plain array literal such as [UInt8(1), UInt32(n)] would
# promote every entry to UInt32, so the count would be written as 4 bytes.
_dimbytes(dims) = vcat(UInt8(length(dims)), reinterpret(UInt8, collect(UInt32, dims)))

# Dimension header written in front of the data of `v`.
dimensions(v)                = UInt8(0) # catches Real, Char and "Struct"
# Strings are written as bytes, so the extent is the byte count.
dimensions(v::String)        = _dimbytes((ncodeunits(v),))
dimensions(v::Tuple)         = _dimbytes((length(v),))
dimensions(v::AbstractArray) = _dimbytes(size(v))

# Dispatch demo: each method only prints which case was selected for `v`.

# Fallback for anything not matched below.
treat(v) = println("$v: assuming a struct: do fields")

# Scalars and other non-array values.
treat(v::Real) = println("$v: Real: write")
treat(v::Char) = println("$v: Char: write")
treat(v::String) = println("$v: String: write")
treat(v::Tuple) = println("$v: Tuple: do elements")

# Arrays, selected by element type; the untyped method is the array fallback.
treat(v::AbstractArray) = println("$v: AbstractArray, assuming a struct: do fields")
treat(v::AbstractArray{<:Real}) = println("$v: AbstractArray Real: write")
treat(v::AbstractArray{Char}) = println("$v: AbstractArray Char: write")
treat(v::AbstractArray{String}) = println("$v: AbstractArray String : do elements")
treat(v::AbstractArray{<:Tuple}) = println("$v: AbstractArray Tuple : do elements")
treat(v::AbstractArray{Any}) = println("$v: AbstractArray Any : do elements")

treat(Array_of_Int)
treat(Array_of_Tuple)
treat(Array_of_Any)
treat(Single_Num)
treat(Array_of_Num)
treat(Single_Struct)
treat(Array_of_Struct)
treat(Single_Tuple)
treat(Single_String)
treat(Array_of_String)
treat(Array_of_Char)

[1, 2]: AbstractArray Real: write
[(1, 2), (2, 3)]: AbstractArray Tuple : do elements
Any["Ab", (1, 2)]: AbstractArray Any : do elements
π: Real: write
[-0.18624382139218243 1.038717810926417 -0.5721804761418097; -0.06887849768129478 -0.4326395234814043 -0.022823882778455668; -0.133008431713054 1.3726789886513664 -0.6307134490103047]: AbstractArray Real: write
Coords(0.10965409968505369, 0.23307490361219174, 0.08137189161289693): assuming a struct: do fields
Coords[Coords(0.6789887110914838, 0.05693354040637022, 0.6087297411679566), Coords(0.49142154289794515, 0.24507798209664022, 0.6440033833399594), Coords(0.9339641294552887, 0.16546616160339211, 0.18130844393725987), Coords(0.5467068555079191, 0.8363102501162505, 0.5541500301421649), Coords(0.40920449846117113, 0.7647041101687808, 0.7075667859287602)]: AbstractArray, assuming a struct: do fields
("Ab", [3.141592653589793, 2.0]): Tuple: do elements
toto: String: write
["Ab" "toto"; "titi" "ok"]: AbstractArray String : do elements
['a' 'b'; 'c' 'd']: AbstractArray Char: write

Bardo · August 22, 2021, 4:30pm

@Sukera

Use write(io, num, T::Number) instead of writenum(io, num, T) . write(io, str::String) instead of writestr(io, str) . Use dispatch to your advantage, not to build explicit ifelse trees.

In reading, having a table converting tags to types is fine, but again - use dispatch to your advantage. read(io, T::Number) instead of readnum(io, T) . Call it via tag = read(io, UInt8); read(io, tag2type[tag])

Sorry, I do not understand this syntax, don’t find it in the Manual. Is this for adding a method to read/write instead of creating a new function, or is it function calls. Lost.

What about reading/writing n numbers of type T? My intention for the defined functions was to have a more homogeneous call syntax.

I tried

data = randn(2,1)
write(io, typeof(data)(data)) # ok
write(io, data::typeof(data)) # ok
write(io, data, typeof(data)) # error

Sukera · August 22, 2021, 4:53pm

What I was talking about is defining methods specialized for e.g. Number types:

# Convert `num` to the numeric type `T`, then pass the converted value on to
# `Base.write`.
function write(io, num, ::Type{T}) where {T<:Number}
     # [...]
     converted = T(num)
     return Base.write(io, converted)
end

and so on for other types. Same goes for reading:

# Read one value of the numeric type `T` by forwarding to `Base.read`.
function read(io, ::Type{T}) where {T<:Number}
    return Base.read(io, T)
end

Writing would literally just be write(io, val), dispatch selects the correct method. That’s what the solution by @DNF does.

Reading is the same, except you need to have the indirection via the lookup table because you have to find out what type you should read, based on the type tag you’ve read from the stream.

Reading n numbers can be done similarly, though you’ll have to first find out that you have to read n numbers (e.g. by reading first a type tag, then the number of elements and finally the elements themselves):

# Read the raw bytes of `n` values of the numeric type `T` and view them as
# values of type `T`.
function read(io, n, ::Type{T}) where {T<:Number}
    nbytes = n * sizeof(T)
    raw = Base.read(io, nbytes)
    return reinterpret(T, raw)
end

This way, you only have to differentiate between reading one value and reading n values, not between arbitrary types, as dispatch handles that already for you.

This is slightly more complicated when dealing with generic structs, but in those cases you will have to know about the kinds of structs you want to de-/serialize or you have to at least save information about each field of the struct. This will most likely be done in a fallback method for your read, looking up a type tag, getting the fields in question via fieldtypes and recursing into read over all fieldtypes.

You may want to limit yourself to structs you know about, as it will make your life easier. Not all struct constructors take all fields of the objects they create during construction.

Bardo · August 22, 2021, 5:09pm

Thanks for the details.

Is the Base. necessary? Dispatch would recognize a different number of arguments?
Besides syntactic sugar, is there a difference to

write(io, typeof(data)(data))

Sukera · August 22, 2021, 5:15pm

Your write and read methods are presumably in their own module - to make sure you hit the correct methods from Base, prepending the module is essential. Especially since in

function read(io, T::Type{<:Number})
    Base.read(io, T)
end

the signatures are exactly the same.

You’re converting data to the type it already has, which is redundant. write(io, T(num)) converts num to whatever type T is (provided it’s a numeric type, since they have conversion built in). E.g.

write(io, 4, Float64)

would convert 4 to a Float64 and write its binary representation to io, which is a different representation than for an integer.

Bardo · August 22, 2021, 5:19pm

Redundant indeed, got confused. And clear now for the module context. Thx.

Bardo · August 23, 2021, 10:00am

Getting closer, could use Union to collect same-treatment objects.
Except for the last two, where small differences in single/array treatment appear.
Besides, could not combine last two dispatches in a Union.

# Values that can be handed to `write` directly: real numbers, characters,
# strings, and arrays of real numbers or characters.
# Writes the type code, the dimension header and then the value itself.
function serialize(io, v::T) where {T<:Union{<:Real, Char, String, AbstractArray{<:Real}, AbstractArray{Char}}} 
    println("$v: Union{<:Real, Char, String, AbstractArray{<:Real}, AbstractArray{Char}}")
    # eltype of a String is Char, so a String must be tagged explicitly;
    # otherwise it would be stored under the Char code.
    tag = v isa String ? String : eltype(v)
    write(io, UInt8(type2tcode[tag]))
    write(io, dimensions(v))
    write(io, v)
end
# Tuples and arrays of tuples: stored under the Tuple code, with every
# element serialized on its own after the dimension header.
function serialize(io, v::T) where {T<:Union{<:Tuple, AbstractArray{<:Tuple}}}
    println("$v: Union{<:Tuple, AbstractArray{<:Tuple}")
    tag = UInt8(type2tcode[Tuple])
    write(io, tag)
    header = dimensions(v)
    write(io, header)
    return serialize.(Ref(io), v)
end
# Arrays of strings and arrays of Any: stored under the Any code, with every
# element serialized on its own after the dimension header.
function serialize(io, v::T) where {T<:Union{AbstractArray{String}, AbstractArray{Any}}}
    println("$v: Union{AbstractArray{String}, AbstractArray{Any}}")
    tag = UInt8(type2tcode[Any])
    write(io, tag)
    header = dimensions(v)
    write(io, header)
    return serialize.(Ref(io), v)
end
# Fallback for arrays: treated as an array of structs and stored field by
# field, each field as one array holding that field of every element.
# Field count and field names both come from the element type, so the two
# always agree and an empty array does not fail on `first(v)`.
function serialize(io, v::AbstractArray)
    println("$v: AbstractArray -> array of struct")
    write(io, UInt8(STRUCT))
    write(io, dimensions(v))
    te = eltype(v)
    write(io, UInt32(fieldcount(te)))
    for name in fieldnames(te)
        sname = String(name)
        # Field name is stored as one length byte followed by the name bytes.
        write(io, UInt8(ncodeunits(sname)))
        write(io, sname)
        serialize(io, getfield.(v, name))
    end
end
# Fallback for everything else: treated as a single struct and stored as
# STRUCT code, dimension header, field count, then each field as
# (name length, name, serialized value).
function serialize(io, v) 
    println("$v: -> single struct")
    vtype = typeof(v)
    write(io, UInt8(STRUCT))
    write(io, dimensions(v))
    nfields_total = fieldcount(vtype)
    write(io, UInt32(nfields_total))
    for field in fieldnames(vtype)
        label = String(field)
        write(io, UInt8(ncodeunits(label)))
        write(io, label)
        value = getfield(v, field)
        serialize(io, value)
    end
end

Sukera · August 23, 2021, 10:07am

Are you aware that you can use the T from the function definition inside of your function? That’ll save you from using eltype, because you can dispatch on e.g. serialize(io, v::AbstractArray{T}) where T to catch any objects that look like an Array. You can then do write(io, UInt8(type2tcode[T])).

Personally, I’d have a serialize function that deals with Arrays, Tuples and Strings, in which case you write the dimensions/length etc. and then write the contents, which are then just a single mapping of serialize over the contents (may be handled differently for strings than for arrays).

Seems like you’re making progress towards understanding dispatch though, nice

Bardo · August 23, 2021, 10:23am

Thx for the T and other tips, and the encouragement.

And a big thank you to you and the other guys for the unpaid work.
In the office, I would owe you a beer or two.

Bardo · August 23, 2021, 1:44pm

As often, “in principle, yes”.
My grouping is according what needs to be done from a write perspective.

Union{<:Real, Char, String, AbstractArray{<:Real}, AbstractArray{Char}}
Whatever can be directly passed to write.
Union{<:Tuple, AbstractArray{<:Tuple}}
Need to handle elements individually - no common type.
Written and read as arrays, then made to Tuples, store Tuple as key
Union{AbstractArray{String}, AbstractArray{Any}}
Need to handle elements individually - no common size or type, store Any as key for array
AbstractArray
By exclusion, should be a (homogeneous) array of struct
Iterate over fields, store as struct of arrays
Without type
By exclusion, should be struct.
Iterate over fields.

In my understanding, for your setup, I would need dynamical dispatch again.

Bardo · August 29, 2021, 10:38am

Better now:

"""
io = serialize(v) reinterprets a Julia object into a series of bytes.
v = deserialize(io) recreates the data from a byte stream

An exercise in dispatch style.

Based on
https://de.mathworks.com/matlabcentral/fileexchange/29457-serialize-deserialize
and "julianized" with the experts on
https://discourse.julialang.org/

"""

# read/write routines
# NOTE(review): these add methods to Base.write / Base.read for Base types.
import Base.write
# Writing a tuple writes its elements one after another.
Base.write(io::IO, x::Tuple) = write(io, x...)
import Base.read
# Read the raw bytes of one value of numeric type T and return that value.
function read(io, T::Type{<:Number})
    raw = Base.read(io, sizeof(T))
    values = reinterpret(T, raw)
    return values[1]
end
# Read the raw bytes of n values of numeric type T and view them as T.
function read(io, T::Type{<:Number}, n)
    nbytes = n * sizeof(T)
    raw = Base.read(io, nbytes)
    return reinterpret(T, raw)
end

# Type encoding
# Empty marker type standing for "struct" in the table below.
struct Struct end

# Column 1 is the numeric code stored in the stream, column 2 the Julia type.
tcode = [
    0    Float64
    1    Float32
    2    Float16
    3    Bool
    4    Int8
    5    UInt8
    6    Int16
    7    UInt16
    8    Int32
    9    UInt32
    10   Int64
    11   UInt64
    12   Char
    13   String
    100  Tuple
    200  Any
    255  Struct
]
# Codes whose values are read back directly as numbers/characters.
WRITABLE = 0:12
# Lookup tables in both directions, built from the two table columns.
tcode2type = Dict(Pair.(tcode[:, 1], tcode[:, 2]))
type2tcode = Dict(Pair.(tcode[:, 2], tcode[:, 1]))

# Types handled by the "write the value directly" serialize methods.
NumChar = Union{Number, Char}

# Single number or character: type code, zero dimensions, then the value.
function serialize(io, v::T) where {T<:NumChar}
    tag = UInt8(type2tcode[T])
    rank = UInt8(0)
    return write(io, tag, rank, v)
end
# Array of numbers or characters: element type code, number of dimensions,
# one UInt32 per dimension, then the array itself.
function serialize(io, v::AbstractArray{T}) where {T<:NumChar}
    tag = UInt8(type2tcode[T])
    rank = UInt8(ndims(v))
    extents = UInt32.(size(v))
    return write(io, tag, rank, extents..., v)
end
# String: type code, one dimension, byte length, then the string bytes.
# The stored length is `ncodeunits(v)` because the reader consumes that many
# bytes; `length(v)` counts characters and is too small for non-ASCII text.
function serialize(io, v::T) where {T<:String}
    write(io, UInt8(type2tcode[T]), UInt8(1), UInt32(ncodeunits(v)), v)
end
# Tuple: Tuple type code, one dimension, element count, then every element
# serialized in turn.
# The table is keyed by `Tuple` itself; `T` is the concrete tuple type of `v`
# (e.g. Tuple{Int64, Int64}) and is not a key of `type2tcode`.
function serialize(io, v::T) where {T<:Tuple}
    write(io, UInt8(type2tcode[Tuple]), UInt8(1), UInt32(length(v)))
    serialize.(Ref(io), v)
end
# Write an array element by element: Any type code, number of dimensions,
# one UInt32 per dimension, then every element serialized in turn.
function _serialize_elementwise(io, v)
    write(io, UInt8(type2tcode[Any]), UInt8(ndims(v)), UInt32.(size(v))...)
    serialize.(Ref(io), v)
end

# Mixed arrays are stored element by element.
function serialize(io, v::T) where {T<:AbstractArray{Any}}
    _serialize_elementwise(io, v)
end
# Arrays of strings and arrays of tuples have no fixed-size element layout
# either; without these methods they would be picked up by the generic
# array method, which stores elements field by field.
serialize(io, v::AbstractArray{String}) = _serialize_elementwise(io, v)
serialize(io, v::AbstractArray{<:Tuple}) = _serialize_elementwise(io, v)
# Any other array: stored under the Struct code as a struct of arrays.
# After the header and the field count, each field is written as
# (name length, name) followed by the array of that field over all elements.
function serialize(io, v::AbstractArray{T}) where T
    tag = UInt8(type2tcode[Struct])
    rank = UInt8(ndims(v))
    extents = UInt32.(size(v))
    write(io, tag, rank, extents...)
    write(io, UInt32(fieldcount(T)))
    for field in fieldnames(T)
        label = String(field)
        write(io, UInt8(ncodeunits(label)), label)
        column = getfield.(v, field)
        serialize(io, column)
    end
end
# Any other value: stored under the Struct code with zero dimensions.
# After the field count, each field is written as (name length, name)
# followed by the serialized field value.
function serialize(io, v::T) where T
    tag = UInt8(type2tcode[Struct])
    write(io, tag, UInt8(0))
    write(io, UInt32(fieldcount(T)))
    for field in fieldnames(T)
        label = String(field)
        write(io, UInt8(ncodeunits(label)), label)
        value = getfield(v, field)
        serialize(io, value)
    end
end

"""
    deserialize(io)

Read one value from `io`: first the type code and dimension count (one byte
each), then one `UInt32` per dimension, then the payload selected by the type
code.

Structs come back as `NamedTuple`s (an array of them when dimensions were
stored), tuples as `Tuple`s, and mixed arrays as arrays of `Any`.
Throws an error for a type code that is not handled.
"""
function deserialize(io)
    ity = Int(read(io, UInt8))
    ndms = Int(read(io, UInt8))
    dms = ndms == 0 ? 1 : Int.(read(io, UInt32, ndms))
    if ity in WRITABLE
        cls = tcode2type[ity]
        ndms == 0 && return read(io, cls)
        # Arrays are written as raw memory, so read them back the same way.
        # This also covers Char, for which no read(io, Char, n) method exists.
        return reshape(read!(io, Vector{cls}(undef, prod(dms))), dms...)
    elseif ity == type2tcode[Struct]
        fname = Symbol[]
        fdata = []
        for _ in 1:read(io, UInt32)
            push!(fname, Symbol(String(read(io, read(io, UInt8)))))
            push!(fdata, deserialize(io))
        end
        if ndms == 0
            # Single struct: every entry of fdata is the value of one field,
            # so pair names and values directly.
            return (; zip(fname, fdata)...)
        else
            # Struct of arrays: regroup the i-th entries of all field arrays
            # into one NamedTuple per element.
            return reshape(NamedTuple.(zip.(Ref(fname), zip(fdata...))), dms...)
        end
    elseif ity == type2tcode[String]
        if ndms == 1
            return String(read(io, dms[1]))
        else
            se = String[]
            for _ in 1:prod(dms)
                push!(se, deserialize(io))
            end
            return reshape(se, dms...)
        end
    elseif ity == type2tcode[Any] || ity == type2tcode[Tuple]
        ele = []
        for _ in 1:prod(dms)
            push!(ele, deserialize(io))
        end
        if ity == type2tcode[Tuple]
            return Tuple(ele)
        else
            return reshape(ele, dms...)
        end
    else
        error("unknown type index $ity")
    end
end

############## Tests ##############

# Three-field record used as the struct test case.
mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end
# Zero-argument constructor: fills all three fields with rand().
Coords() = Coords(rand(), rand(), rand())

# Sample inputs, named after the kind of value they hold.
Array_of_Int = [1, 2]
Array_of_Tuple = [(1, 2), (2, 3)]
Array_of_Any = ["Ab", (1, 2)]
# NOTE(review): `pi` is an Irrational, which has no entry in the type table — confirm serialize accepts it.
Single_Num = pi
Array_of_Num = randn(3,3)
Single_Struct = Coords()
Array_of_Struct = [Coords() for i in 1:5]
Single_Tuple = ("Ab", [pi, 2.0])
Single_String = "toto"
Array_of_String = ["Ab" "toto"; "titi" "ok"]
Array_of_Char = ['a' 'b'; 'c' 'd']


# Serialize `data` to the file `path`, read it back and return the
# deserialized value. Both file handles are opened through the function form
# of `open`, so they are closed even when (de-)serialization throws.
function round_trip(data, path="io.bin")

    open(path, "w") do io
        serialize(io, data)
    end

    println("..deserialize..")

    return open(deserialize, path, "r")
end

data = Array_of_Struct
println("data = $data")
data2 = round_trip(data)
println("data2 = $data2")

Sukera · August 29, 2021, 11:22am

Looking better! A few points:

type2tcode[Any] doesn’t make sense to me - there is no mysterious type left over between tuples, arrays, strings, primitive types and structs. From Julia’s POV, generic structs are what should be caught by Any.

what about return only(reinterpret(T, read_vals))? This will throw if there are accidentally more values, catching parsing mistakes loudly and proudly instead of sweeping them under the rug.

Bardo:

function serialize(io, v::T) where {T<:AbstractArray{Any}}
    write(io, UInt8(type2tcode[Any]), UInt8(ndims(v)), UInt32.(size(v))...)
    serialize.(Ref(io), v)
end
function serialize(io, v::AbstractArray{T}) where T
    write(io, UInt8(type2tcode[Struct]), UInt8(ndims(v)), UInt32.(size(v))...)
    write(io, UInt32(fieldcount(T)))
    for name in fieldnames(T)
        sname = String(name)
        write(io, UInt8(ncodeunits(sname)), sname)
        serialize(io, getfield.(v, name))
    end
end

This distinction doesn’t make sense to me - why not just write out the dimensions and then serialize each element, like in the first function? I’m pretty sure you’re never going to be able to dispatch to the second one. It seems like the order of serialization is backwards - by explicitly communicating that whatever you’re writing is an array and its dimensions, followed by the serialization of each element, you could better control what’s read/written. As it’s written now, you basically lose the information that whatever you’ve serialized is an array, which forces you to always write out dimensions. Seems non-ideal. Do you have control over the serialization format?
deserialize isn’t pretty - it again interleaves branching with parsing. Maybe something like the following:

# Read an array: number of dimensions (UInt8), one UInt32 per dimension,
# then one element after another via `deserialize(io, E)`.
function deserialize(io, ::Type{<: AbstractArray})
    # Int, because the count is used as a type parameter below.
    ndms = Int(read(io, UInt8))
    dms = Int.(only(reinterpret(NTuple{ndms,UInt32}, read(io, sizeof(UInt32)*ndms))))

    # the rest of this function could also be a call like `read(io, E, dms)`, but it doesn't feel worth it to add another function for that if you would only access it through deserialization of arrays in the first place
    # NOTE(review): the peeked tag byte is not consumed before the element
    # loop — confirm how the element methods are meant to skip it.
    E = tcode2type[peek(io, UInt8)] # peek doesn't advance io
    res = Array{E, ndms}(undef, dms)
    for i in eachindex(res)
        res[i]  = deserialize(io, E)
    end
    res
end

# Read a tuple: dimension count, element count, then one element per type
# parameter of `T`.
function deserialize(io, ::Type{T}) where {T <:Tuple}
    ndms = read(io, UInt8) # this should always be 1 for tuples, right?
    dms = read(io, UInt32) # tuples should only have one dimension/size anyway - that's the number of elements
    # ntuple expects the element count as an integer, not a range.
    ntuple(i -> deserialize(io, T.parameters[i]), Int(dms))
end

# a more accurate name would be "PRIMITIVE" or something like that - writability sounds more like mutability from structs, which isn't what is meant here
# The members are the types listed under codes 0 to 12 of the type table.
const WRITABLE = Union{Float64, Float32, Float16, Bool, Int8, UInt8, Int16, UInt16,
                       Int32, UInt32, Int64, UInt64, Char}
# Read one value of a directly readable type.
function deserialize(io, ::Type{T}) where {T <: WRITABLE}
    read(io, T)
end

and similarly for all types you want to handle. That is, write one method per thing you want to deserialize, just like with serialize. Then in the main deserialize function you can do this:

# Read values until the stream is exhausted: each one starts with a type
# code byte that selects the `deserialize(io, T)` method for its payload.
# Returns all values in reading order.
function deserialize(io)
    collected = []
    while true
        eof(io) && break
        tag = read(io, UInt8)
        value = deserialize(io, tcode2type[tag])
        push!(collected, value)
    end
    return collected
end

This will disentangle the deserialization code and you’ll be able to add deserialization support for other types by simply adding a new method for that specific type, without having to find out where in the big deserialize you’d have to add it for it to work properly.

Bardo · August 29, 2021, 12:45pm

type2tcode[Any] doesn’t make sense to me - there is no mysterious type left over between tuples, arrays, strings, primitive types and structs.

True, but it is not used for dispatch in serializer.
Any, like Tuple is used here to tell the deserializer that it has to handle an array of mixed type data, pushed together and then reshaped or tupelized. I could have created a type Mixed instead.
Is there another way to rebuild a tuple?
Same question for struct, which gets rebuilt into a named tuple.
Other serializers like CBOR.jl kind of cheat (Julia-only) by using the built-in (de-)serializer for structs. So, while deserializing structs is possible, I cannot understand it by the code.

Sukera · August 29, 2021, 12:52pm

If you iterate over an array of mixed element types, calling serialize on each element will still dispatch to the “runtime” type of the element. That’s thanks to the dynamic nature of julia - each function call selects the most specific method for that object. The array itself may have e.g. type Vector{Union{Int, Float64}}, but that has no bearing on the types of the elements. They still have only Int or only Float64 as their type, not the union of both. That’s why I suggested writing the dimensions for any array type out first (and that it’s an array, but not what kind of elements the array holds), because it decouples the array-ness from the types of each element.

This of course comes at the cost of writing the type tag out with each element, which may not be desirable - in that case, you can limit yourself to homogeneous arrays, which could write the element type tag only once. Sadly, dispatching on whether or not the element type is a Union is not possible (and is also discouraged).

Those two are kind of the same - you need the field/element types for reconstruction, so just saying “this is a tuple of length n” or “this is a struct named X with n fields” followed by the elements (and their respective representations with type tag) is imo good enough.

–

Your struct (de-)serialization looks ok to me, just having all those different deserialization methods in one big if-else inside a single function looked weird to me.

Bardo · August 29, 2021, 5:08pm

Chosen on purpose. I must add information that this code is to be used in the exchange of simulation results, long real arrays, grouped in vectors, matrices, and classic structs. As much information as possible shall be written and read together. My grouping is: Writable, mixed (Any and Tuple, descend to elements), otherwise Struct (descend to fields).
The first one catches a mixed array, the second one catches an array of homogeneous structs.
If I write the dimensions first, I lose the ability to take advantage of assumed homogeneity.

Bardo · August 29, 2021, 5:19pm

Hm, in principle again, yes, agree for the mixing. But doesn’t the branching on integer typecodes actually make sense? I can simply extend by adding another typecode. Does changing the typecode to a type and then using dispatch to individual functions improve any performance?