Writing compound structs with variable length strings to HDF5

I am trying to write a compound type that contains a variable length string as a field to HDF5. Following is the approach that I took but the strings written to the H5 file are garbled as shown below:

Creating compound types with non-string types works without any issues. Any pointers on what might be the issue? Thanks.

using HDF5

struct Foo
    x::String
end

function HDF5.datatype(::Type{Foo})
    # compound dtype
    dtype = HDF5.h5t_create(HDF5.H5T_COMPOUND, sizeof(Foo))
    # var len string
    varlenstr_dtype = HDF5Datatype(HDF5.h5t_copy(HDF5.H5T_C_S1))
    HDF5.h5t_set_size(varlenstr_dtype.id, HDF5.H5T_VARIABLE)

    HDF5.h5t_insert(dtype, "x", 0, varlenstr_dtype)
    HDF5.HDF5Datatype(dtype)
end

data = map(Foo, ["hi", "there"])
h5open("/tmp/test.h5", "w") do file
    g = g_create(file, "testGroup")
    dtype = datatype(Foo)
    size = length(data)
    dset = d_create(g, "testDataset", HDF5.HDF5Datatype(dtype), dataspace(data))
    HDF5.h5d_write(dset, dtype,  HDF5.H5P_DEFAULT, HDF5.H5P_DEFAULT, HDF5.H5P_DEFAULT, data)
end

Output of the H5 file:

HDF5 "/tmp/test.h5" {
GROUP "/" {
   GROUP "testGroup" {
      DATASET "testDataset" {
         DATATYPE  H5T_COMPOUND {
            H5T_STRING {
               STRSIZE H5T_VARIABLE;
               STRPAD H5T_STR_NULLTERM;
               CSET H5T_CSET_ASCII;
               CTYPE H5T_C_S1;
            } "x";
         }
         DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
         DATA {
         (0): {
               "0Mc\006\37777777764\177"
            },
         (1): {
               "PMc\006\37777777764\177"
            }
         }
      }
   }
}
}

Variable length string work with C string types, see the code below:

#!/usr/bin/env julia
# -*- coding: utf-8 -*-

using HDF5, Distributions, Random

function get_test_str(min::Int64, max::Int64)
    alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    count = rand(DiscreteUniform(min, max))
    # we are returning a pointer to a string, which is GC-d later
    return count+1, pointer(randstring(count))
end

function get_test_data(count::Int64, min::Int64, max::Int64)
    size,payload = zeros(Int64, count), Vector{Ptr{UInt8}}(undef, count)
    for i = 1:count
        size[i],payload[i] = get_test_str(min, max)
    end
    # we know the byte size, and the memory locations
    return size,payload
end


h5open("julia.h5", "w") do f
    count = 1024
    batch = 1000
    min = 5
    max = 30
    throughput=zeros(Float32, batch)
    # file type
    data_type = HDF5Datatype(HDF5.h5t_copy(HDF5.H5T_C_S1))
    HDF5.h5t_set_cset(data_type, HDF5.H5T_CSET_UTF8)
    HDF5.h5t_set_size(data_type, HDF5.HDF5.H5T_VARIABLE)
    dset = d_create(f, "vlen_dataset", data_type, ((batch+1)*count,), "chunk", (count,) )
    dxpl = p_create(HDF5.H5P_DATASET_XFER)
    for i = 0:batch-1
        size,dataset = get_test_data(count, min, max)
        start = time()
        mem_space = dataspace(dataset)
        file_space = HDF5.hyperslab(dset,i*count+1:(i+1)*count) 
        HDF5.h5d_write(dset, data_type, mem_space, file_space, dxpl, dataset)
        stop = time()
        throughput[i+1] = sum(size) / (1000000*(stop - start))
    end
    print("avg throughput: ", mean(throughput), "MiB/sec")
end

best wishes: steven
h5cpp.org

1 Like