Here’s what I ended up using:
using Gumbo
using Cascadia
using StringEncodings
using DataFrames
"""
    html2df(file; encoding=enc"UTF-8")

Parse the HTML document stored in `file` and return a `Vector{DataFrame}` holding one
data frame per `<table>` element found in it.

Column names are taken from the `thead tr th` cells of each table, and every column has
element type `String`. Each `tbody tr` row contributes one data-frame row built from the
text of its `td` cells. Repeated header texts are made unique instead of raising an
error.

# Keywords
- `encoding`: character encoding of `file`, given either as an `Encoding` object or as
  an encoding name in a string (converted with `Encoding`).

# Throws
`push!` raises an error when a body row has a number of `td` cells different from the
number of header cells of its table.
"""
function html2df(file; encoding=enc"UTF-8")
    # Use a separate local so the keyword argument never changes type mid-function.
    charset = encoding isa AbstractString ? Encoding(encoding) : encoding
    doc = parsehtml(read(file, String, charset))

    dfs = DataFrame[]
    for table in eachmatch(sel"table", doc.root)
        # Typed comprehension: stays a Vector{String} even when there are no headers.
        headers = String[nodeText(th) for th in eachmatch(sel"thead tr th", table)]

        # Build the empty frame from explicit String columns. The
        # `DataFrame(eltypes, names)` constructor was removed in DataFrames 1.0.
        columns = [String[] for _ in headers]
        df = DataFrame(columns, headers; makeunique=true)

        # Append one data-frame row per table body row.
        for row in eachmatch(sel"tbody tr", table)
            cells = String[nodeText(td) for td in eachmatch(sel"td", row)]
            push!(df, cells)
        end
        push!(dfs, df)
    end
    return dfs
end
It takes a file name (and, optionally, the file's encoding) and returns a vector of DataFrames, one for each `<table>` in the file; every column is stored as strings.