Flattening YFinance.jl JSON result into a DataFrame

I couldn’t install the package so I decided to try to write something I’ve been thinking about for some time.
I tested it very little. I don’t know how general this is, but it should work for all “table” forms

Edit: this performs better by avoiding some allocations

function expand1(ndf, col)
    df2nt(df)=(;zip(Symbol.(names(df)),eachcol(df))...)
    rexp=findall(er-> er[col] isa DataFrame, eachrow(ndf))
    foreach(subdf->rename!(n->string(col,'.')*n  ,subdf), ndf[rexp,col])
    rt=Tables.dictrowtable(vcat(df2nt.(ndf[rexp,col])...))
    edf=hcat(ndf[rexp,Not(col)],DataFrame(rt))
    iexp=findall.(>(1),map(r-> [er isa Vector ? length(er) : 1 for er in r],values.(eachrow(edf))))
    edfp=mapreduce(r->flatten(edf[r:r,:],iexp[r]), vcat,1: nrow(edf))
    nedfp=select(filter(er-> !isa(er[col] , DataFrame), ndf),Not(col))
    vcat(nedfp,edfp,cols=:union)
end

this should avoid mutations to the original dataframe (sub-dataframe names)

function expand2(ndf, col)
    df2nt(sdf)=(;zip(Symbol.(string.(col,'.',names(sdf))),eachcol(sdf))...)
    rexp=findall(er-> er[col] isa DataFrame, eachrow(ndf))
    rt=Tables.dictrowtable(vcat(df2nt.(ndf[rexp,col])...))
    edf=hcat(ndf[rexp,Not(col)],DataFrame(rt))
    iexp=findall.(>(1),map(r-> [er isa Vector ? length(er) : 1 for er in r],values.(eachrow(edf))))
    edfp=mapreduce(r->flatten(edf[r:r,:],iexp[r]), vcat,1: nrow(edf))
    nedfp=select(filter(er-> !isa(er[col] , DataFrame), ndf),Not(col))
    vcat(nedfp,edfp,cols=:union)
end

nilshg,

I’m refering to YFinance, which is a relatively new package

function's methods to read json file in nested DataFrame

 function allflatnt(vnt::Vector{Dict{Symbol, Any}})
           for i in eachindex(vnt)
               for k in keys(vnt[i])
                   v=vnt[i][k]
                   if v isa Vector{<:Dict} || v isa Dict
                       return false
                   end
               end
           end
           true
       end


       function allflatnt(d::Dict{Symbol, Any})
            for k in keys(d)
                if d[k] isa Vector{<:Dict} || d[k] isa Dict
                    return false
                end
            end

        true
    end




function nestdf(vd::Vector{Dict{Symbol, Any}})
 for i in eachindex(vd)
     for (k,v) in vd[i]
         if v isa Vector{<:Dict} || v isa Dict
             if allflatnt(v)
                tdf=DataFrame(Tables.dictrowtable(v))
                if names(tdf)==["first","second"]
                    rename!(tdf, [:first,:second].=>[:key,:value])
                end
                 vd[i]=merge(vd[i], Dict(k=>tdf))
             else
                 vd[i]=merge(vd[i], Dict(k=>nestdf(v)))
             end
         end
     end
 end
 DataFrame(Tables.dictrowtable(vd))
end


function nestdf(d::Dict{Symbol, Any})
    for (k,v) in d
        if v isa Vector{<:Dict} || v isa Dict
            if allflatnt(v)
                tdf=DataFrame(Tables.dictrowtable(v))
                if names(tdf)==["first","second"]
                    rename!(tdf, [:first,:second].=>[:key,:value])
                end
                d=merge(d, Dict(k=>tdf))
            else
                d=merge(d, Dict(k=>nestdf(v)))
            end
        end
    end
    rename!(DataFrame(Tables.dictrowtable(d)), [:first,:second].=>[:key,:value])
end




nestdf(jsobj::JSON3.Object)=nestdf(copy(jsobj))
nestdf(jsobj::JSON3.Array)=nestdf(copy(jsobj))


I adapted the previous scripts to be able to handle nested structures of JSON.Array and JSON.Object.
I only did some tests on the data corresponding to the “AAPL” ticker

using YFinance, JSON3, DataFrames

aapl_json=get_quoteSummary("AAPL")

Below are some possible “views” with the structure obtained by applying the nestdf() function

julia> ndf=nestdf(aapl_json)
31×2 DataFrame
 Row │ key                                value
     │ Symbol                             DataFrame      
─────┼───────────────────────────────────────────────────
   1 │ cashflowStatementHistoryQuarterly  2×2 DataFrame  
   2 │ industryTrend                      3×2 DataFrame  
   3 │ earningsTrend                      2×2 DataFrame  
   4 │ incomeStatementHistory             2×2 DataFrame  
   5 │ price                              31×2 DataFrame 
   6 │ upgradeDowngradeHistory            2×2 DataFrame  
   7 │ institutionOwnership               2×2 DataFrame  
   8 │ fundOwnership                      2×2 DataFrame
   9 │ summaryDetail                      42×2 DataFrame
  10 │ netSharePurchaseActivity           12×2 DataFrame
  11 │ earnings                           4×2 DataFrame
  12 │ insiderHolders                     2×2 DataFrame
  13 │ calendarEvents                     4×2 DataFrame
  14 │ summaryProfile                     13×2 DataFrame
  15 │ majorDirectHolders                 2×2 DataFrame
  16 │ assetProfile                       20×2 DataFrame
  17 │ recommendationTrend                2×2 DataFrame
  18 │ balanceSheetHistory                2×2 DataFrame
  19 │ financialData                      30×2 DataFrame
  20 │ indexTrend                         5×2 DataFrame
  21 │ balanceSheetHistoryQuarterly       2×2 DataFrame
  22 │ insiderTransactions                2×2 DataFrame
  23 │ cashflowStatementHistory           2×2 DataFrame
  24 │ earningsHistory                    2×2 DataFrame
  25 │ quoteType                          13×2 DataFrame
  26 │ sectorTrend                        3×2 DataFrame
  27 │ esgScores                          36×2 DataFrame
  28 │ secFilings                         2×2 DataFrame
  29 │ defaultKeyStatistics               39×2 DataFrame
  30 │ majorHoldersBreakdown              5×2 DataFrame
  31 │ incomeStatementHistoryQuarterly    2×2 DataFrame

julia> ndf.value[16]
20×2 DataFrame
 Row │ key                        value
     │ Symbol                     Any
─────┼──────────────────────────────────────────────────────────────
   1 │ auditRisk                  4
   2 │ country                    United States
   3 │ overallRisk                1
   4 │ state                      CA
   5 │ phone                      408 996 1010
   6 │ longBusinessSummary        Apple Inc. designs, manufactures…
   7 │ city                       Cupertino
   8 │ zip                        95014
   9 │ industry                   Consumer Electronics
  10 │ address1                   One Apple Park Way
  11 │ shareHolderRightsRisk      1
  12 │ governanceEpochDate        1680307200
  13 │ maxAge                     86400
  14 │ compensationRisk           5
  15 │ compensationAsOfEpochDate  1672444800
  16 │ boardRisk                  1
  17 │ sector                     Technology
  18 │ fullTimeEmployees          164000
  19 │ website                    https://www.apple.com
  20 │ companyOfficers            10×9 DataFrame

julia> ndf.value[16].value[20]
10×9 DataFrame
 Row │ fiscalYear  unexercisedValue  age      name                     title               ⋯
     │ Int64?      DataFrame         Int64?   String                   String              ⋯
─────┼──────────────────────────────────────────────────────────────────────────────────────
   1 │       2022  3×2 DataFrame          61  Mr. Timothy D. Cook      CEO & Director      ⋯
   2 │       2022  3×2 DataFrame          59  Mr. Luca  Maestri        CFO & Sr. VP
   3 │       2022  3×2 DataFrame          58  Mr. Jeffrey E. Williams  Chief Operating Off  
   4 │       2022  3×2 DataFrame          58  Ms. Katherine L. Adams   Sr. VP, Gen. Counse  
   5 │       2022  3×2 DataFrame          55  Ms. Deirdre  O'Brien     Sr. VP of Retail    ⋯
   6 │    missing  3×2 DataFrame     missing  Mr. Chris  Kondo         Sr. Director of Cor  
   7 │    missing  3×2 DataFrame     missing  Mr. James  Wilson        Chief Technology Of  
   8 │    missing  3×2 DataFrame     missing  Ms. Mary  Demby          Chief Information O  
   9 │    missing  3×2 DataFrame     missing  Ms. Nancy  Paxton        Sr. Director of Inv ⋯
  10 │    missing  3×2 DataFrame     missing  Mr. Greg  Joswiak        Sr. VP of Worldwide  
                                                                           5 columns omitted


julia> ndf.value[16].value[20].totalPay[1:3]
3-element Vector{Union{Missing, DataFrame}}:
 3×2 DataFrame
 Row │ key      value      
     │ Symbol   Union…     
─────┼─────────────────────
   1 │ fmt      16.43M
   2 │ longFmt  16,425,933
   3 │ raw      16425933
 3×2 DataFrame
 Row │ key      value     
     │ Symbol   Union…    
─────┼────────────────────
   1 │ fmt      5.02M
   2 │ longFmt  5,019,783
   3 │ raw      5019783
 3×2 DataFrame
 Row │ key      value     
     │ Symbol   Union…    
─────┼────────────────────
   1 │ fmt      5.02M
   2 │ longFmt  5,018,337
   3 │ raw      5018337



some refactoring
allflatnt(vd::Vector{Dict{Symbol, Any}})= all(allflatnt, vd)


allflatnt(d::Dict{Symbol, Any})=all(e->!(last(e) isa Vector{<:Dict} || last(e) isa Dict),d)


function nestdf(vd::Vector{Dict{Symbol, Any}})
  DataFrame(Tables.dictrowtable(nestdict.(vd)))
end

function nestdf(d::Dict{Symbol, Any})
    tdf=DataFrame(Tables.dictrowtable(nestdict(d)))
    if names(tdf)==["first","second"]
        rename!(tdf, [:first,:second].=>[:key,:value])
    end
    #tdf
end

function nestdict(d::Dict{Symbol, Any})
    for (k,v) in d
       if v isa Vector{<:Dict} || v isa Dict
            if allflatnt(v)
                tdf=DataFrame(Tables.dictrowtable(v))
                if names(tdf)==["first","second"]
                    rename!(tdf, [:first,:second].=>[:key,:value])
                end
                d=merge(d, Dict(k=>tdf))
            else
                d=merge(d, Dict(k=>nestdf(v)))
            end
        end
    end
    d 
end



nestdf(jsobj::JSON3.Object)=nestdf(copy(jsobj))
nestdf(jsobj::JSON3.Array)=nestdf(copy(jsobj))



using YFinance, JSON3, DataFrames

aapl_json=get_quoteSummary("AAPL")

ndf=nestdf(aapl_json)
1 Like