Variable-length strings work with C string types; see the code below:
#!/usr/bin/env julia
# -*- coding: utf-8 -*-
using HDF5, Distributions, Random
"""
    get_test_str(min::Integer, max::Integer) -> (nbytes, ptr)

Create one random string whose length is drawn uniformly from `min:max` and
hand it back as a NUL-terminated C string.

# Returns
- `nbytes`: size of the buffer in bytes, including the trailing NUL.
- `ptr::Ptr{UInt8}`: start of a buffer obtained from `Libc.malloc`. The buffer is
  not tracked by Julia's garbage collector, so the pointer stays valid until the
  caller releases it with `Libc.free(ptr)`.

# Throws
- `OutOfMemoryError` if the buffer cannot be allocated.
"""
function get_test_str(min::Integer, max::Integer)
    count = rand(min:max)
    str = randstring(count)
    nbytes = ncodeunits(str) + 1  # payload plus the terminating NUL

    # Taking `pointer(str)` and returning it would leave the caller with a
    # pointer into a String that nothing references any more; the GC is free to
    # reclaim it. Copy the bytes into memory the GC does not own instead.
    buf = Ptr{UInt8}(Libc.malloc(nbytes))
    buf == C_NULL && throw(OutOfMemoryError())
    GC.@preserve str unsafe_copyto!(buf, pointer(str), nbytes - 1)
    unsafe_store!(buf, 0x00, nbytes)

    return nbytes, buf
end
"""
    get_test_data(count::Int64, min::Int64, max::Int64)

Produce `count` test strings by calling `get_test_str(min, max)` once per slot.

Returns a tuple `(sizes, pointers)`:
- `sizes::Vector{Int64}`: the first value returned by each `get_test_str` call.
- `pointers::Vector{Ptr{UInt8}}`: the second value returned by each call.

The validity and lifetime of the returned pointers are whatever `get_test_str`
provides; this function only collects them.
"""
function get_test_data(count::Int64, min::Int64, max::Int64)
    lengths = zeros(Int64, count)
    pointers = Vector{Ptr{UInt8}}(undef, count)
    for idx in eachindex(lengths)
        len, ptr = get_test_str(min, max)
        lengths[idx] = len
        pointers[idx] = ptr
    end
    # byte sizes and memory locations, in matching order
    return lengths, pointers
end
# Benchmark: write `batch` batches of `count` random variable-length strings into a
# chunked 1-D dataset of "julia.h5" and report the average write throughput.
h5open("julia.h5", "w") do f
    count = 1024   # strings per batch; also used as the chunk size
    batch = 1000   # number of batches written
    minlen = 5     # shortest generated string
    maxlen = 30    # longest generated string
    throughput = zeros(Float32, batch)

    # file type: variable-length C string with UTF-8 character set
    data_type = HDF5Datatype(HDF5.h5t_copy(HDF5.H5T_C_S1))
    HDF5.h5t_set_cset(data_type, HDF5.H5T_CSET_UTF8)
    HDF5.h5t_set_size(data_type, HDF5.H5T_VARIABLE)

    # NOTE(review): the dataset is sized for batch+1 chunks but the loop below only
    # writes `batch` of them -- confirm whether the spare chunk is intentional.
    dset = d_create(f, "vlen_dataset", data_type, ((batch+1)*count,), "chunk", (count,))
    dxpl = p_create(HDF5.H5P_DATASET_XFER)

    for i = 0:batch-1
        # Data generation happens before the clock starts, so it is not measured.
        # NOTE(review): `dataset` holds raw pointers; they must remain valid until
        # h5d_write returns -- confirm against get_test_str.
        sizes, dataset = get_test_data(count, minlen, maxlen)

        start = time()
        mem_space = dataspace(dataset)
        # 1-based element range covered by batch `i`
        file_space = HDF5.hyperslab(dset, i*count+1:(i+1)*count)
        HDF5.h5d_write(dset, data_type, mem_space, file_space, dxpl, dataset)
        stop = time()

        # bytes / 10^6 per second, i.e. megabytes (not mebibytes) per second
        throughput[i+1] = sum(sizes) / (1000000*(stop - start))
    end

    # The divisor above is 10^6, so the unit is MB/sec rather than MiB/sec.
    print("avg throughput: ", mean(throughput), "MB/sec")
end
Best wishes, Steven
h5cpp.org