Speed up data extraction from large json file

here a discussion about a similar problem. You could try some of the suggested ideas.

here one of the proposals

allflatnt(vd::Vector{Dict{Symbol, Any}})= all(allflatnt, vd)


allflatnt(d::Dict{Symbol, Any})=all(e->!(last(e) isa Vector{<:Dict} || last(e) isa Dict),d)

function nestdf(vd::Vector{Dict{Symbol, Any}})
  DataFrame(Tables.dictrowtable(nestdict.(vd)))
end

function nestdf(d::Dict{Symbol, Any})
    tdf=DataFrame(Tables.dictrowtable(nestdict(d)))
    if names(tdf)==["first","second"]
        rename!(tdf, [:first,:second].=>[:key,:value])
    end
    #tdf
end

function nestdict(d::Dict{Symbol, Any})
    for (k,v) in d
       if v isa Vector{<:Dict} || v isa Dict
            if allflatnt(v)
                tdf=DataFrame(Tables.dictrowtable(v))
                if names(tdf)==["first","second"]
                    rename!(tdf, [:first,:second].=>[:key,:value])
                end
                d=merge(d, Dict(k=>tdf))
            else
                d=merge(d, Dict(k=>nestdf(v)))
            end
        end
    end
    d 
end



nestdf(jsobj::JSON3.Object)=nestdf(copy(jsobj))
nestdf(jsobj::JSON3.Array)=nestdf(copy(jsobj))


using JSON3, DataFrames

filtering the chunk £0-£500, after few seconds you get


julia> jsobj=JSON3.read("grantnav-20231111145216.json")
JSON3.Object{Vector{UInt8}, Vector{UInt64}} with 2 entries:
  :license => "See dataset/license within each grant. This file also contains…  :grants  => Object[{…"

julia> df=nestdf(jsobj)
2×2 DataFrame
 Row │ key      value
     │ Symbol   Union…
─────┼────────────────────────────────────────────
   1 │ grants   6243×29 DataFrame
   2 │ license  See dataset/license within each …

julia> df2=df.value[1]
6243×29 DataFrame
  Row │ plannedDates   filename                 currency  description                        tit ⋯
      │ DataFrame?     String                   String    String                             Str ⋯
──────┼───────────────────────────────────────────────────────────────────────────────────────────
    1 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  1 x ⋯
    2 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  1x   
    3 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que  
    4 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que  
    5 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  1x  ⋯
    6 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  1 x  
    7 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que  
    8 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que  
    9 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que ⋯
   10 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que  
   11 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que  
   12 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  1 x  
   13 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  Que ⋯
   14 │ 1×5 DataFrame  a001p00000tV2XpAAK.json  GBP       Funding under Sport England's NB…  1 x  
  ⋮   │       ⋮                   ⋮                ⋮                      ⋮                      ⋱
 6231 │ missing        a003W000007E4EqQAK.json  GBP       A Project To Commemorate Ww2 Thr…  Gra  
 6232 │ missing        a003W000007E4EqQAK.json  GBP       The 7th Branch Of The Normandy V…  Gra  
 6233 │ missing        a003W000007E4EqQAK.json  GBP       This Group Will Mark The Anniver…  Gra ⋯
 6234 │ missing        a003W000007E4EqQAK.json  GBP       St Theresa'S Over 50s In St Hele…  Gra  
 6235 │ missing        a003W000007E4EqQAK.json  GBP       This Formally Constituted Theatr…  Gra  
 6236 │ missing        a003W000007E4EqQAK.json  GBP       This Group, Part Of The Territor…  Gra  
 6237 │ missing        a003W000007E4EqQAK.json  GBP       This Registered Charity Is Organ…  Gra ⋯
 6238 │ missing        a003W000007E4EqQAK.json  GBP       This Informal Group Will Use The…  Gra  
 6239 │ missing        a003W000007E4EqQAK.json  GBP       This Informal Group Will Use An …  Gra  
 6240 │ missing        a003W000007E4EqQAK.json  GBP       This School Would Use An Award T…  Gra  
 6241 │ missing        a003W000007E4EqQAK.json  GBP       This Formal Group Would Use An A…  Gra ⋯
 6242 │ missing        a003W000007E4EqQAK.json  GBP       The Project Is To Fund A Reunion…  Gra  
 6243 │ missing        a003W000007E4EqQAK.json  GBP       An Award Will Be Used To Commemo…  Gra  
                                                                  25 columns and 6216 rows omitted

julia> describe(df2)[:,[begin,end]]
29×2 DataFrame
 Row │ variable                           eltype
     │ Symbol                             Type
─────┼───────────────────────────────────────────────────────────────────
   1 │ plannedDates                       Union{Missing, DataFrame}
   2 │ filename                           String
   3 │ currency                           String
   4 │ description                        String
   5 │ title_and_description              String
   6 │ fromOpenCall                       Union{Missing, String}
   7 │ awardDateDateOnly                  String
   8 │ dataset                            DataFrame
   9 │ grantProgramme                     DataFrame
  10 │ beneficiaryLocation                Union{Missing, DataFrame}
  11 │ fundingOrganization                DataFrame
  12 │ title                              String
  13 │ additional_data                    DataFrame
  14 │ dateModified                       Union{Missing, String}
  15 │ recipientOrganization              DataFrame
  16 │ Award Type                         Union{Missing, String}
  17 │ simple_grant_type                  String
  18 │ id                                 String
  19 │ dataType                           String
  20 │ amountAwarded                      Union{Float64, Int64}
  21 │ Recipient Org Location             Union{Missing, DataFrame}
  22 │ dataSource                         Union{Missing, String}
  23 │ awardDate                          String
  24 │ amountAppliedFor                   Union{Missing, Float64, Int64}
  25 │ amountDisbursed                    Union{Missing, Int64}
  26 │ Planned Date                       Union{Missing, DataFrame}
  27 │ National Lottery Heritage Fund A…  Union{Missing, String}
  28 │ Project reference                  Union{Missing, String}
  29 │ classifications                    Union{Missing, DataFrame}

julia> df2.dataset[1,1].value[1]
7×2 DataFrame
 Row │ key                  value
     │ Symbol               Union…
─────┼────────────────────────────────────────────────────────
   1 │ file_size            11072508
   2 │ valid                true
   3 │ json                 /home/datastore/latest_datagette…
   4 │ acceptable_license   true
   5 │ datetime_downloaded  2023-11-11T00:02:10+00:00
   6 │ downloads            true
   7 │ file_type            xlsx