How can I recursively search within a directory to extract HDF5 file headers

How can I get glob to search a complex directory structure recursively and extract headers from every HDF5 file it finds? So far, glob only finds files in the top-level directory.

using HDF5
using CSV
using DataFrames
using Glob
 
# Define the directory containing the HDF5 files and the output CSV file path
#directory_path = raw"/Users/silva/Desktop/H5_data_header_test"
 
directory_path = raw"/Volumes/data/Backup_data/Z0422-17_VNC_1/raw_h5/Merlin-6049/2022/06/22/09/"
 
csv_file_path = raw"/Users/silvam2/Desktop/H5_data_header_test/test.csv"
 
# Specify the HDF5 object names whose attributes you want to export
attributes_to_print = ["0-0-0", "0-0-1", "0-0-2", "0-1-0", "0-1-1", "0-1-2", "0-2-0", "0-2-1", "0-2-2"]
 
# Initialize a DataFrame to store the output with FileName and Attribute columns
df = DataFrame(FileName=String[], Attribute=String[])
 
# Collect all possible keys to create DataFrame columns
all_keys = Set{String}()
 
# Collect all attribute data
attribute_data = []
 
# Get the list of all HDF5 files in the directory
h5_files = glob("*.h5", directory_path)
 
if length(h5_files) == 0
    println("No HDF5 files found in the directory.")
end
 
for file_path in h5_files
    # Get the file name from the file path
    file_name = basename(file_path)
    println("Processing file: ", file_name)
   
    h5open(file_path, "r") do h5f
        for attr in attributes_to_print
            if haskey(h5f, attr)
                attr_keys = keys(attrs(h5f[attr]))
                attr_values = values(attrs(h5f[attr]))
                attr_dict = Dict("Attribute" => attr, "FileName" => file_name)
                for (key, value) in zip(attr_keys, attr_values)
                    attr_dict[key] = string(value)
                    push!(all_keys, key)
                end
                push!(attribute_data, attr_dict)
                println("Collected attribute: ", attr)
            else
                println("Attribute not found: ", attr)
                attr_dict = Dict("Attribute" => attr, "FileName" => file_name)
                push!(attribute_data, attr_dict)
            end
        end
    end
end
 
# Add all possible keys as columns to the DataFrame
for key in all_keys
    df[!, key] = String[]
end
 
# Fill the DataFrame with the collected attribute data
for data in attribute_data
    row = [get(data, "FileName", ""), get(data, "Attribute", "")]
    for key in all_keys
        push!(row, get(data, key, ""))
    end
    push!(df, row)
end
 
# Write the DataFrame to a CSV file
CSV.write(csv_file_path, df)
println("CSV file written to: ", csv_file_path)


Try

h5_files = glob("**/*.h5", directory_path)

Thank you, Florian! It works now 🙂

This has come up before. glob does not search recursively through the entire directory tree; even with the ** pattern it only matches one level below the starting directory. See this post for a true recursive solution using the walkdir function.
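
For reference, here is a minimal sketch of that walkdir approach, reusing the directory_path variable from the script above. It walks the entire tree and collects every file ending in .h5, so the result can be used in place of the glob call:

# Recursively collect all HDF5 files under directory_path using Base.walkdir
h5_files = String[]
for (root, dirs, files) in walkdir(directory_path)
    for file in files
        if endswith(file, ".h5")
            push!(h5_files, joinpath(root, file))  # store the full path
        end
    end
end
println("Found ", length(h5_files), " HDF5 files.")

Since walkdir is part of Base, this also removes the dependency on Glob for this script.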