Union splitting and AbstractArray

How can I efficiently select all types which can be directly processed by write()?
For scalar types I can use

if v isa Real || v isa Char
   write(io, v)
elseif ...

An extension to non-scalar types like

if v isa AbstractArray{Real} || v isa AbstractArray{Char}

does NOT work, but this one does:

if v isa AbstractArray && (eltype(v) <: Real || eltype(v) == Char)

Are there better methods?

Why not just use dispatch in this scenario? Can you expand what you mean by “efficiently” in this context? Would you consider a string inefficient for write because it may have variable length?

1 Like

type parameters are invariant in Julia:

julia> v = [1,2,3];

julia> typeof(v)
Vector{Int64} (alias for Array{Int64, 1})

julia> v isa AbstractArray{Real}
false

julia> v isa AbstractArray{<:Real}
true

https://docs.julialang.org/en/v1/manual/types/#Parametric-Abstract-Types

Why not just use dispatch in this scenario?

Could you supply a corresponding code?

Can you expand what you mean by “efficiently” in this context?

I mean without performance hit like dynamic dispatch. The <: operation looks like a search in a possibly large set. Is this true?

Would you consider a string inefficient for write because it may have variable length?

No, could have included it in the first example. But an array of strings can not be fed to write() and must be treated separately.

calling function with multiple methods != dynamic dispatch happening.

You really should try not to do dispatch inside a function like this[1], which is presumably a manual habit inherited from Python/MATLAB, since they don’t have multiple dispatch

semantically, kind of, I guess? But it doesn’t matter, because the compiler does the work for you and you get fast execution…

[1]: for completeness, I want to point out that, technically, branching on types is basically the same as defining multiple methods from the compiler’s perspective, and it optimizes branches away based on types; but even just for readability you still shouldn’t do this everywhere.

1 Like

You really should try not to do dispatch inside function like this

The one shown is just one of many exercises to dispatch a task. There are some dimensions involved:

  • Types which don’t have length(), nor size(), others have length(), the rest both.
  • Types that can directly be fed to write()
  • Singular/array dichotomy, adding tuple as another type to be dispatched.

Doing all these cases by separate functions leads literally to “multiple” dispatches…
But since I want to use it for serialization, by and large the bulk of the work is in long arrays of floats, not in the limited structure of the data.

I think you’re missing some of the fundamental points here, say you have a mywrite(io, data) function, in your case it looks like:

function mywrite(io, data)
    if typeof(data)...
        write()
    elseif
        dosomestuff
        write()
    ...
end

and you’re calling this function inside other function:

function work()
     generate data
     ...
     mywrite(io, data)

You’re worried that having multiple definition of mywrite() would reduce performance? I’m saying it won’t.


Without knowing your full usage, I would try to suggest this pattern:

mywrite(io, v::T_can_be_directly_written) = write(io, v)

# repeat the following for different types that need different pre-processing
function mywrite(io, v::T_needs_preproceesing)
     mywrite(io, preprocess(v))
end
1 Like

I’m not worried about writing multiple definitions, but beyond a certain number I feel I risk losing the overview.

Here is the full version with dispatch inside the function:

"""
io = serialize(v) reinterprets a Julia object into a series of bytes.
v = deserialize(io) recreates the data from a byte stream

An exercise in dispatch style.

Based on
https://de.mathworks.com/matlabcentral/fileexchange/29457-serialize-deserialize
and "julianized" with the experts on
https://discourse.julialang.org/

"""

# Type encoding: column 1 is the numeric code written to the stream,
# column 2 is the Julia type that code stands for.
# All tables are `const` so that functions reading them do not pay for
# untyped global lookups.
const itype = [
0   Float64;
1   Float32;
2   Float16;
3   Bool;
4   Int8;
5   UInt8;
6   Int16;
7   UInt16;
8   Int32;
9   UInt32;
10  Int64;
11  UInt64;
12  Char;
13  String;
100 Tuple;
200 Any
]
# Codes of the types whose values are handed straight to `write`.
const WRITABLE = 0:12
# Code used for structs and arrays of structs (deliberately not part of `itype`).
const STRUCT = 255
# Lookup tables in both directions, built once from `itype`.
const itype2type = Dict(itype[:,1] .=> itype[:,2])
const type2itype = Dict(itype[:,2] .=> itype[:,1])

# Type and size prefix
"""
    prefix(io, v)

Write the header that precedes every serialized value: one `UInt8` type code,
one `UInt8` dimension count, and then one `UInt32` per dimension.

Scalars and structs use a dimension count of 0. Strings and tuples use a count
of 1 followed by their size. Arrays use `ndims(v)` followed by `size(v)`.
Each branch also prints a debug line.
"""
function prefix(io, v)
    #if haskey(type2itype, typeof(v)) && type2itype[typeof(v)] in WRITABLE
    if v isa Real || v isa Char
        println("type2itype[typeof(v)] in WRITABLE, prefix : $(type2itype[typeof(v)]))")
        write(io, UInt8(type2itype[typeof(v)]))
        write(io, UInt8(0))
    #elseif haskey(type2itype, eltype(v)) && type2itype[eltype(v)] in WRITABLE
    elseif v isa AbstractArray && (eltype(v) <: Real || eltype(v) == Char)
        println("type2itype[eltype(v)] in WRITABLE, prefix : $(type2itype[eltype(v)])")
        write(io, UInt8(type2itype[eltype(v)]))
        write(io, UInt8(ndims(v)))
        write(io, UInt32.(size(v))...)
    elseif v isa String
        println("v isa String, prefix : $(type2itype[String]))")
        write(io, UInt8(type2itype[String]))
        write(io, UInt8(1))
        # The payload is written as code units and read back as a byte count, so
        # the header must hold the number of bytes. `length` counts characters
        # and is too small for non-ASCII text.
        write(io, UInt32(ncodeunits(v)))
    elseif v isa Tuple
        println("v isa Tuple, prefix: $(type2itype[Tuple]))")
        write(io, UInt8(type2itype[Tuple]))
        write(io, UInt8(1))
        write(io, UInt32(length(v)))
    elseif v isa AbstractArray{String}
        # `serialize` writes string arrays element by element and `deserialize`
        # expects them under the String code, so they must not fall through to
        # the STRUCT branch below.
        # NOTE(review): a 1-d string array is still indistinguishable from a
        # single string on the reading side (both have a dimension count of 1).
        println("v isa AbstractArray{String}, prefix: $(type2itype[String])")
        write(io, UInt8(type2itype[String]))
        write(io, UInt8(ndims(v)))
        write(io, UInt32.(size(v))...)
    elseif v isa AbstractArray{Any}
        println("v isa AbstractArray{Any}, prefix: $(type2itype[Any]))")
        write(io, UInt8(type2itype[Any]))
        write(io, UInt8(ndims(v)))
        write(io, UInt32.(size(v))...)
    elseif v isa AbstractArray
        println("v isa AbstractArray, prefix: $STRUCT")
        write(io, UInt8(STRUCT))
        write(io, UInt8(ndims(v)))
        write(io, UInt32.(size(v))...)
    else
        tbyte = UInt8(STRUCT)
        println("else, prefix: $(tbyte)")
        write(io, tbyte)
        write(io, UInt8(0))
    end
end

"""
    serialize(io, v)

Write `v` to `io`: first its header (see `prefix`), then its payload.

- `Real` and `Char` scalars are written as raw values.
- Strings are written as their code units.
- Tuples, arrays of `Any` and arrays of `String` are written element by element,
  each element with its own header.
- Arrays of `Real`/`Char` are written in one `write` call.
- Any other array is treated as an array of structs and stored field by field.
- Any other value is treated as a struct and stored field by field.

Each branch also prints a debug line.
"""
function serialize(io, v)
    println("v=$v")
    prefix(io, v)
    if v isa Real || v isa Char
        println("(v isa Int || v isa Real || v isa Bool || v isa Char)")
        # `write(io, ::Char)` emits a variable-length UTF-8 sequence, but
        # `deserialize` reads `sizeof(Char)` raw bytes, so a Char is written as
        # its 4-byte representation instead.
        write(io, v isa Char ? reinterpret(UInt32, v) : v)
    elseif v isa String
        println("v isa String")
        writestr(io, v)
    elseif v isa Tuple || v isa AbstractArray{Any} || v isa AbstractArray{String}
        println("v isa Tuple || v isa AbstractArray{Any} || v isa AbstractArray{String}")
        serialize.(Ref(io), v)
    elseif v isa AbstractArray
        println("v isa AbstractArray")
        # Decide on the element type, as `prefix` does. Looking at `first(v)`
        # would throw on an empty array.
        T = eltype(v)
        if T <: Real || T == Char
            println("(eltype(v) <: Real || eltype(v) == Char)")
            write(io, v)
        else
            fc = fieldcount(T)
            writenum(io, fc, UInt32)
            println("fc=$fc")
            for name in fieldnames(T)
                sname = String(name)
                len = ncodeunits(sname)
                println("len=$len, sname=$sname")
                writenum(io, len, UInt8)
                writestr(io, sname)
                # one array per field: the array of structs is stored as a struct of arrays
                serialize(io, getfield.(v, name))
            end
        end
    else
        fc = fieldcount(typeof(v))
        write(io, UInt32(fc))
        println("fc=$fc")
        for name in fieldnames(typeof(v))
            sname = String(name)
            len = ncodeunits(sname)
            println("len=$len, sname=$sname")
            writenum(io, len, UInt8)
            writestr(io, sname)
            serialize(io, getfield(v, name))
        end
    end
end

"""
    deserialize(io)

Read one value from `io` in the format produced by `serialize` and rebuild it.

The header (type code, dimension count, dimensions) selects the branch:
writable scalars/arrays are read in one go, structs come back as a `NamedTuple`
(arrays of structs as an array of `NamedTuple`s), strings as `String`, tuples as
`Tuple`, and arrays of `Any` as a reshaped `Vector{Any}`.

Throws an error for an unknown type code. Each branch also prints a debug line.
"""
function deserialize(io)
    ity = Int(readnum(io, UInt8))
    ndms = Int(readnum(io, UInt8))
    dms = ndms == 0 ? 1 : Int.(readnum(io, UInt32, ndms))
    println("ity=$ity, ndms=$ndms, dms=$dms")
    if ity in WRITABLE
        println("ity in WRITABLE")
        cls = itype2type[ity]
        return ndms == 0 ? readnum(io, cls) : reshape(readnum(io, cls, prod(dms)), dms...)

    elseif ity == STRUCT
        println("ity == STRUCT")
        fname = Symbol[]
        nfld = readnum(io, UInt32)
        fdata = []
        for i = 1:nfld
            fn = readstr(io, readnum(io, UInt8))
            println("fn=$fn")
            push!(fname, Symbol(fn))
            push!(fdata, deserialize(io))
        end
        if ndms == 0
            # Single struct: every entry of `fdata` is the field value itself,
            # so names and values are paired directly.
            return NamedTuple(zip(fname, fdata))
        else
            # Array of structs: every entry of `fdata` is an array holding one
            # field for all elements; regroup them element by element.
            return reshape(NamedTuple.(zip.(Ref(fname), zip(fdata...))), dms...)
        end

    elseif ity == type2itype[String]
        println("ity == type2itype[String]")
        if ndms == 1
            return String(read(io, dms[1]))
        else
            se = String[]
            for i = 1:prod(dms)
                push!(se, deserialize(io))
            end
            return reshape(se, dms...)
        end

    elseif ity == type2itype[Any] || ity == type2itype[Tuple]
        println("ity == type2itype[Any] || ity == type2itype[Tuple]")
        istuple = ity == type2itype[Tuple]
        ele = []
        for i = 1:prod(dms)
            push!(ele, deserialize(io))
        end
        if istuple
            return Tuple(ele)
        else
            return reshape(ele, dms...)
        end
    else
        error("unknown type index $ity")
    end
end

############## IO read/write routines ##############
"""
    writenum(io, num, T)

Convert `num` to type `T` and write the converted value to `io`.
Returns what `write` returns.
"""
writenum(io, num, T) = write(io, T(num))
"""
    readnum(io, T, n)

Read `n` values of type `T` from `io` and return them as a reinterpreted view of
the bytes that were read.

Throws `EOFError` when the stream holds fewer than `n * sizeof(T)` bytes, instead
of silently returning zero-padded data.
"""
function readnum(io, T, n)
    buf = Vector{UInt8}(undef, sizeof(T) * n)
    read!(io, buf)  # fills the whole buffer or throws EOFError
    return reinterpret(T, buf)
end
"""
    readnum(io, T)

Read `sizeof(T)` bytes from `io` and return them reinterpreted as one value of
type `T`.

Throws `EOFError` when the stream is too short, instead of silently returning a
value built from zero-padded bytes.
"""
function readnum(io, T)
    buf = Vector{UInt8}(undef, sizeof(T))
    read!(io, buf)  # fills the whole buffer or throws EOFError
    return reinterpret(T, buf)[1]
end
"""
    writestr(io, str)

Write the code units of `str` to `io`, with no length header and no terminator.
Returns what `write` returns.
"""
writestr(io, str) = write(io, codeunits(str))
"""
    readstr(io, n)

Read `n` bytes from `io` (via `read(io, n)`) and return them as a `String`.
"""
readstr(io, n) = String(read(io, n))

############## Tests ##############

"""
    round_trip(data)

Serialize `data` to the file `io.bin` in the current directory, read it back
with `deserialize`, print the result and return it.
"""
function round_trip(data)
    open("io.bin", "w") do io
        serialize(io, data)
    end

    println("..deserialize..")

    # The do-block form closes the file even when `deserialize` throws.
    data2 = open("io.bin", "r") do io
        deserialize(io)
    end
    println("data2=$data2")

    return data2
end

"""
    Coords(x, y, z)
    Coords()

Mutable example struct with three `Float64` fields, used as test data for the
serializer. The zero-argument constructor fills every field with `rand()`.
"""
mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end

function Coords()
    # arguments are evaluated left to right: x, then y, then z
    return Coords(rand(), rand(), rand())
end

# Sample inputs: scalars, strings, tuples, structs, and arrays of each.
Array_of_Int = [1, 2]
Array_of_Tuple = [(1, 2), (2, 3)]
Array_of_Any = ["Ab", (1, 2)]
Single_Num = pi
Array_of_Num = randn(3,3)
Single_Struct = Coords()
Array_of_Struct = [Coords() for i in 1:5]
Single_Tuple = ("Ab", [pi, 2.0])
Single_String = "toto"
Array_of_String = ["Ab" "toto"; "titi" "ok"]

# Only the array-of-struct sample is run through the round trip here.
round_trip(Array_of_Struct)

And here the version with regular/external dispatch:

"""
io = serialize(v) reinterprets a Julia object into a series of bytes.

An exercise in multiple dispatch style.

Based on
https://de.mathworks.com/matlabcentral/fileexchange/29457-serialize-deserialize
and "julianized" with the experts on
https://discourse.julialang.org/

"""

# Type codes
# NOTE: `Union{Any, AbstractArray}` collapses to `Any`, so `Struct === Any`.
# It serves as the catch-all tag, and `type2byte[Any]` therefore resolves to 255.
const Struct = Union{Any, AbstractArray}
# Column 1 is the byte written to the stream, column 2 the Julia type it stands for.
const it = [
0   Float64;
1   Float32;
2   Float16;
3   Bool;
4   Char;
5   String;
6   Int8;
7   UInt8;
8   Int16;
9   UInt16;
10  Int32;
11  UInt32;
12  Int64;
13  UInt64;
100 Tuple;
255 Struct
]
# Lookup tables in both directions. They are `const` so that the functions
# reading them do not pay for untyped global lookups.
const byte2type = Dict(UInt8.(it[:,1]) .=> it[:,2])
const type2byte = Dict(it[:,2] .=> UInt8.(it[:,1]))

# write-able types: values that are handed straight to `write`
const Writable = Union{<:Real, Char, AbstractArray{<:Real}, AbstractArray{Char}}

# Type and size prefix for ndims 0, 1, >=1
"""
    prefix(io, x)

Fallback header for values with zero dimensions: write the type byte of
`typeof(x)` (or the `Struct` byte when the type is not in `type2byte`),
followed by a dimension count of 0.
"""
function prefix(io, x)
    code = get(type2byte, typeof(x), type2byte[Struct])
    write(io, code)
    return write(io, UInt8(0))
end
"""
    prefix(io, x::String)

Write the header for a string: the `String` type byte, a dimension count of 1,
and the size of the string in bytes as a `UInt32`.
"""
function prefix(io, x::String)
    write(io, type2byte[String])
    write(io, UInt8(1))
    # The payload is written as code units and read back with `read(io, n)`, so
    # the header must hold the byte count. `length` counts characters and is too
    # small for non-ASCII text.
    write(io, UInt32(ncodeunits(x)))
end
"""
    prefix(io, x::Tuple)

Write the header for a tuple: the `Tuple` type byte, a dimension count of 1,
and the number of elements as a `UInt32`.
"""
function prefix(io, x::Tuple)
    code = type2byte[Tuple]
    write(io, code)
    write(io, UInt8(1))
    return write(io, UInt32(length(x)))
end
"""
    prefix(io, x::AbstractArray)

Write the header for an array: the type byte of its element type, the number
of dimensions, and each dimension as a `UInt32`.

Element types that are not listed in `type2byte` (for example user structs) get
the `Struct` byte, matching the fallback of the generic `prefix(io, x)` method.
"""
function prefix(io, x::AbstractArray)
    write(io, get(type2byte, eltype(x), type2byte[Struct]))
    write(io, UInt8(ndims(x)))
    # Written as one array so that a 0-dimensional input (empty `size`) writes
    # nothing, rather than calling `write(io)` without a value.
    write(io, UInt32[size(x)...])
end

# Entry function
"""
    serialize(v)

Serialize `v` into a fresh in-memory buffer and return the bytes that were
written.
"""
function serialize(v)
    buffer = IOBuffer(UInt8[]; append = true)
    _serialize(buffer, v)
    bytes = take!(buffer)
    return bytes
end
"""
    serialize(io, v)

Serialize `v` directly into the stream `io`; forwards to `_serialize`.
"""
serialize(io, v) = _serialize(io, v)

# Writables
"""
    _serialize(io, v::T) where {T <: Writable}

Write the header for `v`, then hand `v` straight to `write`.
"""
function _serialize(io, v::T) where {T <: Writable}
    prefix(io, v)
    write(io, v)
end
# Scalar Char
"""
    _serialize(io, v::Char)

Write the header for `v`, then its 4-byte representation.

`write(io, ::Char)` emits a variable-length UTF-8 sequence, whereas the reading
side takes `sizeof(Char)` raw bytes and reinterprets them, so the generic
writable method cannot be used for a single character.
"""
function _serialize(io, v::Char)
    prefix(io, v)
    return write(io, reinterpret(UInt32, v))
end
# Strings
"""
    _serialize(io, v::String)

Write the string header for `v`, followed by the code units of `v`.
"""
function _serialize(io, v::String)
    prefix(io, v)
    units = codeunits(v)
    return write(io, units)
end
# Tuples / Array of Any
# Heterogeneous containers: write the container header, then serialize every
# element with its own header. The value of the broadcast (the per-element
# results) is what this method returns.
function _serialize(io, v::T) where {T <: Union{Tuple, AbstractArray{Any}}}
    prefix(io, v)
    _serialize.(Ref(io), v)  # Ref(io) keeps the stream a scalar under broadcasting
end
# Array of Struct (stored as Struct of Array)
"""
    _serialize(io, v::AbstractArray)

Serialize an array of structs field by field: after the array header and the
field count, each field is written as its name length, its name, and the array
of that field's values across all elements.

Field information comes from `eltype(v)` throughout, so an empty array is
handled as well.
"""
function _serialize(io, v::AbstractArray)
    prefix(io, v)
    T = eltype(v)
    write(io, UInt32(fieldcount(T)))
    for name in fieldnames(T)
        sname = String(name)
        write(io, UInt8(ncodeunits(sname)))
        write(io, sname)
        _serialize(io, getfield.(v, name))
    end
end
# Struct
"""
    _serialize(io, v)

Fallback for any other value, which is treated as a struct: after the header
and the field count, each field is written as its name length in bytes, its
name, and its serialized value.
"""
function _serialize(io, v)
    prefix(io, v)
    fc = fieldcount(typeof(v))
    write(io, UInt32(fc))
    for name in fieldnames(typeof(v))
        sname = String(name)
        # The name is written as bytes, so its length must be counted in bytes.
        # `length` counts characters and undercounts non-ASCII field names.
        write(io, UInt8(ncodeunits(sname)))
        write(io, sname)
        _serialize(io, getfield(v, name))
    end
end

# Entry function
"""
    deserialize(io)

Read the leading type byte from `io`, look up the corresponding type in
`byte2type`, and dispatch to the matching `_deserialize` method.
"""
function deserialize(io)
    code = readnum(io, UInt8)
    return _deserialize(io, byte2type[code])
end
# Read dimensions
"""
    readdim(io)

Read the dimension count (one byte) and then that many `UInt32` dimensions.
Returns `(ndms, dms)`; when the count is 0 no dimensions are read and `dms` is
the integer 1.
"""
function readdim(io)
    ndms = Int(readnum(io, UInt8))
    if ndms == 0
        return (ndms, 1)
    end
    return (ndms, Int.(readnum(io, UInt32, ndms)))
end
# Writables
"""
    _deserialize(io, cls::Type{T}) where {T <: Writable}

Read a scalar of type `cls` when the stored dimension count is 0; otherwise
read `prod(dms)` values and reshape them to the stored dimensions.
"""
function _deserialize(io, cls::Type{T}) where {T <: Writable}
    ndms, dms = readdim(io)
    if ndms == 0
        return readnum(io, cls)
    end
    flat = readnum(io, cls, prod(dms))
    return reshape(flat, dms...)
end
"""
    _deserialize(io, ::Type{String})

Read the dimensions, then read as many bytes as the first dimension says and
return them as a `String`.
"""
function _deserialize(io, ::Type{String})
    _, dms = readdim(io)
    nbytes = dms[1]
    return String(read(io, nbytes))
end

# Tuples
"""
    _deserialize(io, cls::Type{Tuple})

Read the element count from the dimensions, then read that many values, each
preceded by its own type byte, and return them as a `Tuple`.
"""
function _deserialize(io, cls::Type{Tuple})
    _, dms = readdim(io)
    items = []
    for _ in 1:dms[1]
        code = readnum(io, UInt8)
        push!(items, _deserialize(io, byte2type[code]))
    end
    return Tuple(items)
end
# (Array of) struct -> (array of) named tuple
"""
    _deserialize(io, cls::Type{Struct})

Rebuild a struct, or an array of structs, that was stored field by field.

A single struct (dimension count 0) is returned as one `NamedTuple`. An array
of structs is returned as an array of `NamedTuple`s reshaped to the stored
dimensions.
"""
function _deserialize(io, cls::Type{Struct})
    (ndms, dms) = readdim(io)
    fname = Symbol[]
    fdata = []
    nfld = readnum(io, UInt32)
    for i = 1:nfld
        fn = readstr(io, readnum(io, UInt8))
        push!(fname, Symbol(fn))
        fcls = byte2type[readnum(io, UInt8)]
        push!(fdata, _deserialize(io, fcls))
    end
    if ndms == 0
        # Single struct: every entry of `fdata` is the field value itself, so
        # names and values are paired directly.
        return NamedTuple(zip(fname, fdata))
    else
        # Array of structs: every entry of `fdata` holds one field for all
        # elements; regroup them element by element.
        return reshape(NamedTuple.(zip.(Ref(fname), zip(fdata...))), dms...)
    end
end

############## IO read/write routines ##############
"""
    writenum(io, num, T)

Convert `num` to type `T` and write the converted value to `io`.
Returns what `write` returns.
"""
writenum(io, num, T) = write(io, T(num))
"""
    readnum(io, T, n)

Read `n` values of type `T` from `io` and return them as a reinterpreted view of
the bytes that were read.

Throws `EOFError` when the stream holds fewer than `n * sizeof(T)` bytes, instead
of silently returning zero-padded data.
"""
function readnum(io, T, n)
    buf = Vector{UInt8}(undef, sizeof(T) * n)
    read!(io, buf)  # fills the whole buffer or throws EOFError
    return reinterpret(T, buf)
end
"""
    readnum(io, T)

Read `sizeof(T)` bytes from `io` and return them reinterpreted as one value of
type `T`.

Throws `EOFError` when the stream is too short, instead of silently returning a
value built from zero-padded bytes.
"""
function readnum(io, T)
    buf = Vector{UInt8}(undef, sizeof(T))
    read!(io, buf)  # fills the whole buffer or throws EOFError
    return reinterpret(T, buf)[1]
end
"""
    writestr(io, str)

Write the code units of `str` to `io`, with no length header and no terminator.
Returns what `write` returns.
"""
writestr(io, str) = write(io, codeunits(str))
"""
    readstr(io, n)

Read `n` bytes from `io` (via `read(io, n)`) and return them as a `String`.
"""
readstr(io, n) = String(read(io, n))

############## Tests ##############

# Sample arrays: plain integers, tuples, and mixed (`Any`) elements.
Array_of_Int = [1, 2]
Array_of_Tuple = [(1, 2), (2, 3)]
Array_of_Any = ["Ab", (1, 2)]

"""
    Coords(x, y, z)
    Coords()

Mutable example struct with three `Float64` fields, used as test data for the
serializer. The zero-argument constructor fills every field with `rand()`.
"""
mutable struct Coords
    x::Float64
    y::Float64
    z::Float64
end

function Coords()
    # arguments are evaluated left to right: x, then y, then z
    return Coords(rand(), rand(), rand())
end
# Struct samples: a 5-element array of random `Coords` and one single `Coords`.
Array_of_Struct = [Coords() for i in 1:5]
Single_Struct = Coords()


    #write(io, serialize(1))
    #write(io, serialize([1,2.0]))
    #write(io, serialize(1.0))
    #write(io, serialize([1.0]))
    #write(io, serialize(randn(3,3))) 
    #write(io, serialize((1.0, 2))) 
    #write(io, serialize(["a" "b"; "c" "d"])) 
    #write(io, serialize("a")) 
    #write(io, serialize(Array_of_Any)) # fixme: hanging
    #write(io, serialize(Array_of_Struct)) 
    #write(io, serialize(Single_Struct))


"""
    round_trip(data)

Serialize `data` to the file `io.bin` in the current directory, then read it
back twice with `deserialize`, printing each result.
"""
function round_trip(data)
    open("io.bin", "w") do io
        serialize(io, data)
    end

    # The do-block form closes the file even when `deserialize` throws. The
    # handle opened here was previously never closed.
    open("io.bin", "r") do io
        data2 = deserialize(io)
        println("data2=$data2")
    end

    open("io.bin", "r") do io
        data3 = deserialize(io)
        println("data3=$data3")
    end
end

# Driver: round-trip a tuple holding a string and a two-element array.
data = ("Ab", [pi, 2.0])
round_trip(data)

1 Like

so yeah, what you have is fine; at this number of branches, it’s almost just a matter of personal preference. (I’d note that you want to make the look-up tables constant, btw)

OK, thx.