You need to flatten the structure manually, something like this:
import HTTP, JSON3, DataFrames.DataFrame
function load_data()
r = HTTP.get("https://raw.githubusercontent.com/hodcroftlab/covariants/master/cluster_tables/EUClusters_data.json")
json = JSON3.read(r.body) # convert HTTP response to a nested dict
variant_names = [ # extracted from the first record
Symbol("20I (Alpha, V1)")
Symbol("20H (Beta, V2)")
Symbol("20J (Gamma, V3)")
Symbol("21A (Delta)")
Symbol("21I (Delta)")
Symbol("21J (Delta)")
Symbol("21K (Omicron)")
Symbol("21L (Omicron)")
Symbol("21B (Kappa)")
Symbol("21D (Eta)")
Symbol("21F (Iota)")
Symbol("21G (Lambda)")
Symbol("21H (Mu)")
Symbol("20B/S:732A")
Symbol("20A/S:126A")
Symbol("20E (EU1)")
Symbol("21C (Epsilon)")
Symbol("20A/S:439K")
Symbol("20A.EU2")
Symbol("20A/S:98F")
Symbol("20C/S:80Y")
Symbol("20B/S:626S")
]
df = DataFrame(
:country => [],
:week => [],
:total_sequences => [],
[name => [] for name in variant_names]... # put each variant as a separate column
# alternatively, create a single column for the dominant variant
)
for (country_name, country) in json["countries"]
for i in 1:length(country[:week])
row = (
country_name,
country[:week][i],
country[:total_sequences][i],
# if you want only the dominant variant, determine it here instead of recording all of them
[haskey(country, variant) ? country[variant][i] : 0 for variant in variant_names]...
)
push!(df, row)
end
end
return df
end
To summarize, DataFrames won’t magically understand the structure of your data, you have to tell it how to interpret it.