It is possible to use HDF5 for in-memory serialization of arrays. One advantage is that HDF5 is widely supported. Unfortunately, the required functions don't seem to be wrapped by HDF5.jl, so it is a bit of work:
using HDF5
using HDF5_jll
# File-image flags from HDF5's H5LTpublic.h (not wrapped by HDF5.jl);
# passed to H5LTopen_file_image to control how the image buffer is handled.
const H5LT_FILE_IMAGE_OPEN_RW = 0x0001      # open the image read-write
const H5LT_FILE_IMAGE_DONT_COPY = 0x0002    # use the caller's buffer directly, no copy
const H5LT_FILE_IMAGE_DONT_RELEASE = 0x0004 # the library must not free the caller's buffer
"""
    H5Fget_file_image(file_id, buf, buf_len)

Wrapper for the C function `H5Fget_file_image`. With `buf == C_NULL` it
returns the buffer size (in bytes) needed to hold the file image; otherwise
it copies the image of the open file `file_id` into `buf` and returns the
number of bytes copied. Throws on a negative status.
"""
function H5Fget_file_image(file_id, buf, buf_len)
    # H5Fget_file_image is part of the core HDF5 library (H5F API), not the
    # high-level library, so resolve it in libhdf5 rather than libhdf5_hl
    # (the original relied on transitive symbol lookup through libhdf5_hl).
    nbytes = ccall(
        (:H5Fget_file_image, HDF5_jll.libhdf5),
        Cssize_t,
        (HDF5.hid_t, Ptr{Cvoid}, Csize_t),
        file_id, buf, buf_len,
    )
    nbytes < 0 && error("Error getting file image")
    return nbytes
end
"""
    h5p_set_fapl_core(fapl_id, block_size=1024, backing_store=false)

Configure the file-access property list `fapl_id` to use HDF5's in-memory
"core" driver. `block_size` is the allocation increment in bytes; with
`backing_store = false` the file is never written to disk. Throws on a
negative status.
"""
function h5p_set_fapl_core(fapl_id, block_size=1024, backing_store=false)
    status = ccall(
        (:H5Pset_fapl_core, libhdf5),
        HDF5.herr_t,
        (HDF5.hid_t, Csize_t, HDF5.hbool_t),
        fapl_id, block_size, backing_store,
    )
    status < 0 && error("Error setting core fapl")
    return nothing
end
"""
    get_file_image(file)

Return the raw bytes of the open HDF5 `file` as a `Vector{UInt8}`: first
query the required size with a NULL buffer, then copy the image into a
freshly allocated buffer.
"""
function get_file_image(file)
    nbytes = H5Fget_file_image(file.id, C_NULL, 0)
    image = Vector{UInt8}(undef, nbytes)
    H5Fget_file_image(file.id, image, nbytes)
    return image
end
"""
    serialize(array)

Serialize `array` into an in-memory HDF5 file image (core driver, no
backing store) and return the image as a `Vector{UInt8}`. Written against
the HDF5.jl v0.14.x API (`HDF5File`).
"""
function serialize(array)
    fileprop = create_property(HDF5.H5P_FILE_ACCESS)
    h5p_set_fapl_core(fileprop)
    f_hid = HDF5.h5f_create("name", HDF5.H5F_ACC_TRUNC, HDF5.H5P_DEFAULT, fileprop)
    # The property list is no longer needed once the file is created;
    # close it so the HDF5 id is not leaked (the original never released it).
    close(fileprop)
    fout = HDF5File(f_hid, "name")
    fout["data"] = array
    HDF5.flush(fout)
    image = get_file_image(fout)
    close(fout)
    return image
end
# Deserialize an HDF5 file image (as produced by `serialize`) back into an
# array WITHOUT copying the payload: the returned array is a reinterpreted
# view into `data`, so it is only valid while `data` stays alive and
# unmodified. Written against the HDF5.jl v0.14.x API (`HDF5File`, `HDF5.Hid`).
function deserialize(data)
# DONT_COPY: open the image directly in `data`'s memory (no duplicate);
# DONT_RELEASE: the buffer is owned by Julia, HDF5 must not free it.
flags = H5LT_FILE_IMAGE_DONT_COPY | H5LT_FILE_IMAGE_DONT_RELEASE
f_hid = @ccall libhdf5_hl.H5LTopen_file_image(data::Ptr{UInt8}, length(data)::Csize_t, flags::Cuint)::HDF5.Hid
file = HDF5File(f_hid, "name")
dset = file["data"]
# Byte offset of the dataset's raw storage within the file image;
# assumes contiguous (non-chunked, uncompressed) layout — TODO confirm.
offset = HDF5.h5d_get_offset(dset.id)
shape, _ = HDF5.get_dims(dset)
dtype = HDF5.eltype(dset)
close(dset)
close(file)
# Reinterpret the tail of the buffer in place and shape it; presumably the
# dataset's raw bytes run to the end of the image — verify, since reshape
# would throw a DimensionMismatch if trailing metadata followed the data.
return reshape(reinterpret(dtype, @view(data[offset+1:end])), shape)
end
Usage:
julia> a = rand(1000, 500);
julia> data = serialize(a);
julia> a == deserialize(data)
true
julia> @benchmark serialize(a)
BenchmarkTools.Trial:
memory estimate: 3.82 MiB
allocs estimate: 9
--------------
minimum time: 1.040 ms (0.00% GC)
median time: 1.110 ms (0.00% GC)
mean time: 1.163 ms (2.86% GC)
maximum time: 2.831 ms (14.16% GC)
--------------
samples: 4295
evals/sample: 1
julia> @benchmark deserialize(data)
BenchmarkTools.Trial:
memory estimate: 672 bytes
allocs estimate: 15
--------------
minimum time: 122.549 μs (0.00% GC)
median time: 124.564 μs (0.00% GC)
mean time: 126.256 μs (0.00% GC)
maximum time: 8.493 ms (0.00% GC)
--------------
samples: 10000
evals/sample: 1
This is completely untested and definitely needs better error handling. It might be possible to avoid the copy in `serialize`. One could also easily use compression for faster transfer over a slow connection.
Edit: The above worked on HDF5.jl version v0.14.2 but doesn't any more in version v0.15.5. An updated version:
"""
    serialize(array)

Write `array` into an in-memory HDF5 file (core driver, no backing store)
and return the resulting file image as a `Vector{UInt8}`. Updated for the
HDF5.jl v0.15+ API (`HDF5.File`).
"""
function serialize(array)
    fapl = create_property(HDF5.H5P_FILE_ACCESS)
    h5p_set_fapl_core(fapl)
    fid = HDF5.h5f_create("name", HDF5.H5F_ACC_TRUNC, HDF5.H5P_DEFAULT, fapl)
    # The property list can be released as soon as the file exists.
    close(fapl)
    # Third constructor argument per the v0.15 `HDF5.File` API — presumably
    # an ownership/close-behavior toggle; TODO confirm against HDF5.jl docs.
    fout = HDF5.File(fid, "name", false)
    fout["data"] = array
    HDF5.flush(fout)
    image = get_file_image(fout)
    close(fout)
    return image
end
"""
    deserialize(data)

Reconstruct the array stored under `"data"` in the HDF5 file image `data`
(as produced by `serialize`). The result is a zero-copy reinterpretation of
`data`, so it is only valid while `data` stays alive and unmodified.
Updated for the HDF5.jl v0.15+ API (`HDF5.File`, `HDF5.hid_t`).
"""
function deserialize(data)
    # DONT_COPY: operate on `data`'s memory in place; DONT_RELEASE: the
    # buffer is owned by Julia and must not be freed by the HDF5 library.
    flags = H5LT_FILE_IMAGE_DONT_COPY | H5LT_FILE_IMAGE_DONT_RELEASE
    fid = @ccall libhdf5_hl.H5LTopen_file_image(
        data::Ptr{UInt8}, length(data)::Csize_t, flags::Cuint
    )::HDF5.hid_t
    fh = HDF5.File(fid, "name")
    ds = fh["data"]
    # Byte offset of the dataset's raw storage within the image (assumes a
    # contiguous layout — TODO confirm for chunked/compressed datasets).
    off = HDF5.h5d_get_offset(ds.id)
    dims, _ = HDF5.get_dims(ds)
    T = HDF5.eltype(ds)
    close(ds)
    close(fh)
    payload = @view data[(off + 1):end]
    return reshape(reinterpret(T, payload), dims)
end