Writing compound structs with variable length strings to HDF5

Variable-length strings work with C string types; see the code below:

#!/usr/bin/env julia
# -*- coding: utf-8 -*-

using HDF5, Distributions, Random

"""
    get_test_str(min::Integer, max::Integer)

Generate one random string whose length is drawn uniformly from `min:max`.

Returns `(nbytes, ptr, str)`:
- `nbytes`: the string length plus one, so the terminating NUL byte is counted,
- `ptr`: `pointer(str)`, the address of the string's bytes,
- `str`: the string that owns those bytes.

`ptr` is only valid while `str` is reachable. Callers that hand `ptr` to C must keep
`str` alive (for example with `GC.@preserve`); destructuring just the first two values
still works, but leaves a pointer that may dangle after the next garbage collection.

Throws `ArgumentError` unless `0 <= min <= max`.
"""
function get_test_str(min::Integer, max::Integer)
    0 <= min <= max ||
        throw(ArgumentError("need 0 <= min <= max, got min=$min, max=$max"))
    count = rand(min:max)
    str = randstring(count)
    return count + 1, pointer(str), str
end

"""
    get_test_data(count::Integer, min::Integer, max::Integer)

Generate `count` random strings with lengths in `min:max`.

Returns `(nbytes, payload, strings)`:
- `nbytes`: `Vector{Int64}` with the byte size of every string (including the NUL),
- `payload`: `Vector{Ptr{UInt8}}` with the address of every string,
- `strings`: `Vector{String}` owning the memory that `payload` points into.

`payload[i]` is valid only while `strings[i]` is reachable, so keep `strings` alive for
as long as `payload` is in use.
"""
function get_test_data(count::Integer, min::Integer, max::Integer)
    nbytes = zeros(Int64, count)
    payload = Vector{Ptr{UInt8}}(undef, count)
    strings = Vector{String}(undef, count)
    for i in eachindex(strings)
        nbytes[i], payload[i], strings[i] = get_test_str(min, max)
    end
    # we know the byte sizes and the memory locations; `strings` roots that memory
    return nbytes, payload, strings
end


h5open("julia.h5", "w") do f
    count = 1024
    batch = 1000
    min = 5
    max = 30
    throughput = zeros(Float32, batch)
    # file type: variable-length, UTF-8 encoded C string
    data_type = HDF5Datatype(HDF5.h5t_copy(HDF5.H5T_C_S1))
    HDF5.h5t_set_cset(data_type, HDF5.H5T_CSET_UTF8)
    HDF5.h5t_set_size(data_type, HDF5.H5T_VARIABLE)
    # NOTE(review): the dataset is sized for batch+1 chunks although only `batch` chunks
    # are written below -- confirm the spare chunk is intentional.
    dset = d_create(f, "vlen_dataset", data_type, ((batch+1)*count,), "chunk", (count,))
    dxpl = p_create(HDF5.H5P_DATASET_XFER)
    for i = 0:batch-1
        nbytes, dataset, strings = get_test_data(count, min, max)
        start = time()
        mem_space = dataspace(dataset)
        file_space = HDF5.hyperslab(dset, i*count+1:(i+1)*count)
        # `dataset` holds raw pointers into `strings`; keep the strings rooted so the
        # GC cannot reclaim them while the C library reads through those pointers.
        GC.@preserve strings begin
            HDF5.h5d_write(dset, data_type, mem_space, file_space, dxpl, dataset)
        end
        stop = time()
        # bytes / (1e6 * seconds) is megabytes (10^6 bytes) per second
        throughput[i+1] = sum(nbytes) / (1000000*(stop - start))
    end
    print("avg throughput: ", sum(throughput) / length(throughput), " MB/sec")
end

best wishes: steven
h5cpp.org

1 Like