If you want to do some tests, I submit a script that recursively reads a json file that contains multi-level nested tables.
The result is a dataframe containing in some cells a dataframe as value.
I add a function for selective unpacking of the columns of interest.
PS. I didnβt test on files other than the attached one that I found on a web page.
julia> using JSON3
julia> using DataFrames
the json string
julia> jdata = """[
{
"name":"bob",
"salary":13000,
"friends":[
{
"name": "sarah",
"salary":10000
},
{
"name": "bill",
"salary":5000
}
]
},
{
"name":"marge",
"salary":10000,
"friends":[
{
"name": "rhonda",
"salary":10000
},
{
"name": "mike",
"salary":5000,
"hobbies":[
{
"name":"surfing",
"frequency":10
},
{
"name":"surfing",
"frequency":15
}
]
}
]
},
{
"name":"joe",
"salary":10000,
"friends":[
{
"name": "harry",
"salary":10000
},
{
"name": "sally",
"salary":5000
}
]
}
]"""
"[\n {\n \"name\":\"bob\",\n \"salary\":13000,\n \"friends\":[\n {\n \"name\": \"sarah\",\n \"salary\":10000\n },\n {\n \"name\": \"bill\",\n " β― 691 bytes β― " \"friends\":[\n {\n \"name\": \"harry\",\n \"salary\":10000\n },\
julia> jsobj = JSON3.read(jdata);
the function to read nested json files to dataframe
julia> function allflatnt(vnt)
for i in eachindex(vnt)
for k in keys(vnt[i])
v=vnt[i][k]
if v isa Vector{<:Dict}
return false
end
end
end
true
end
allflatnt (generic function with 1 method)
julia> function nestdf(ant)
ANT=copy(ant)
for i in eachindex(ANT)
for (k,v) in ANT[i]
if v isa Vector{<:Dict}
if allflatnt(v)
ANT[i]=merge(ANT[i], Dict(k=>DataFrame(Tables.dictrowtable(v))))
else
ANT[i]=merge(ANT[i], Dict(k=>nestdf(v)))
end
end
end
end
DataFrame(Tables.dictrowtable(ANT))
end
nestdf (generic function with 1 method)
julia> ndf=nestdf(jsobj)
3Γ3 DataFrame
Row β name salary friends
β String Int64 DataFrame
ββββββΌβββββββββββββββββββββββββββββββ
1 β bob 13000 2Γ2 DataFrame
2 β marge 10000 2Γ3 DataFrame
3 β joe 10000 2Γ2 DataFrame
the expansion function
function expand(ndf, col)
df2nt(df)=(;zip(Symbol.(names(df)),eachcol(df))...)
rexp=filter(er-> er[col] isa DataFrame, ndf)
foreach(subdf->rename!(n->string(col,'.')*n ,subdf), rexp[:,col])
rt=Tables.dictrowtable(vcat(df2nt.(rexp[:,col])...))
edf=hcat(rexp[:,Not(col)],DataFrame(rt))
iexp=findall.(>(1),map(r-> [er isa Vector ? length(er) : 1 for er in r],values.(eachrow(edf))))
edfp=mapreduce(r->flatten(edf[r:r,:],iexp[r]), vcat,1: nrow(edf))
nedfp=select(filter(er-> !isa(er[col] , DataFrame), ndf),Not(col))
vcat(nedfp,edfp,cols=:union)
end
julia> endf=expand(ndf,:friends)
6Γ5 DataFrame
Row β name salary friends.name friends.salary friends.hobbies
β String Int64 String? Int64? Any
ββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
1 β bob 13000 sarah 10000 missing
2 β bob 13000 bill 5000 missing
3 β marge 10000 rhonda 10000 missing
4 β marge 10000 mike 5000 2Γ2 DataFrame
5 β joe 10000 harry 10000 missing
6 β joe 10000 sally 5000 missing
julia> expand(endf, Symbol("friends.hobbies"))
7Γ6 DataFrame
Row β name salary friends.name friends.salary friends.hobbies.frequency friends.hobbies.name
β String Int64 String? Int64? Int64? String?
ββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
1 β bob 13000 sarah 10000 missing missing
2 β bob 13000 bill 5000 missing missing
3 β marge 10000 rhonda 10000 missing missing
4 β joe 10000 harry 10000 missing missing
5 β joe 10000 sally 5000 missing missing
6 β marge 10000 mike 5000 10 surfing
7 β marge 10000 mike 5000 15 surfing