Reading a large JSON file makes Julia crash

Hello everyone,

I’m trying to read and select a subset of a large JSON file from here. The result is a relatively small DataFrame (1600+ rows) depending on the game selected. However, I cannot reuse my process: after a first run, too much memory is still in use and Julia crashes.

I think there are two main problems here:

  1. I can’t find any way to parse only the subset of the JSON I need.
  2. I can’t figure out how to free the memory after each run. I tried the GC.gc() function, but it doesn’t seem to help.

More generally, is there a better way to do this?

Here is the minimal working example:

using DataFrames, JSON3, JSONTables, ZipFile
Base.convert(::Type{JSON3.Array}, x::Vector{JSON3.Object}) = x

# just a utility function to get the zarchive file for a particular event file
function get_events(events::String)
    # assuming that the events.zip file is in the /tmp directory
    zarchive = ZipFile.Reader("/tmp/events.zip")
    dictio = Dict(zarchive.files[i].name => i for i in eachindex(zarchive.files))
    file_num = dictio[events]
    str = read(zarchive.files[file_num])
    
    return str
end

# function to parse the json file
function createDict(str)

    inDict = JSON3.read(str)
    return inDict
end

# here I take only the subset I need (ie. the data for the game specified)
function create_subset_json(json_data, game_id::Int)
    subset_json_indexes = [i for i in eachindex(json_data) if json_data[i][:matchId] == game_id]
    subset_json = json_data[subset_json_indexes]
    json_data = nothing
    Base.GC.gc()
    subset_json = DataFrame(convert(JSON3.Array,subset_json))
    return subset_json
end

# run everything
function all_process(events::String, game_id::Int)
    json_data = createDict(get_events(events))
    data = create_subset_json(json_data, game_id)
    json_data = nothing
    Base.GC.gc()
    return data 
end

all_process("events_Italy.json", 2576335)

Best regards,

Thomas

Unless you are working on a machine with very little RAM, your example code works very well.
After reading in (Italy) and processing without any cleanup (gc), the Julia process only consumes a little over 1 GB. After GC.gc(), a little under 1 GB. And no problems working with the data.
I think there is something else wrong.
What Julia version and hardware are you using? (Well, if it is something reasonable I don’t have any idea what it could be, except for severe failures of RAM or disk.)

Maybe the OP’s RAM is being taken up by other programs.

Well, on my side I got the following results with the first implementation:

@time all_process("events_Italy.json", 2576335)
13.266353 seconds (14.24 M allocations: 2.486 GiB, 8.24% gc time, 3.13% compilation time)

Which is quite a lot. I’m concerned about the memory usage because I basically need to run the same process on a large number of files, and ultimately it would need to run on a server, so the less memory I use the better.

I’ve tried a much more specific process with LazyJSON.jl:

using LazyJSON, ZipFile, DataFrames

# same utility function to access the zipped file
function get_events(events::String)
    zarchive = ZipFile.Reader("/tmp/events.zip")
    dictio = Dict(zarchive.files[i].name => i for i in eachindex(zarchive.files))
    file_num = dictio[events]
    str = read(zarchive.files[file_num])
    
    return str
end

# take only the subset I need
function subset_indexes_lazy(v, game_id::Int)
    # materialise the lazy values so we can filter them
    json_data = convert(Vector{Any}, v)
    return [i for i in json_data if i["matchId"] == game_id]
end


# A really specific function to populate the dataframe (at this point I know the size and the type)
function transform_to_df(json_subset)
    subset_dic = convert.(Dict, json_subset)

    test_df = DataFrame(:playerId => zeros(length(subset_dic)),
        :matchId => zeros(length(subset_dic)),
        :eventName => "",
        :positions => [Vector{Dict}[] for i in eachindex(subset_dic)],
        :eventId => zeros(length(subset_dic)),
        :subEventName => "",
        :teamId => zeros(length(subset_dic)),
        :id => zeros(length(subset_dic)),
        :matchPeriod => "",
        :subEventId => "",
        :eventSec => zeros(length(subset_dic)),
        :tags => [Vector{Dict}[] for i in eachindex(subset_dic)])

    for i in eachindex(test_df.playerId)
        test_df[i, :playerId] = subset_dic[i]["playerId"]
        test_df[i, :positions] = [convert.(Dict,subset_dic[i]["positions"])]
        test_df[i, :matchId] = subset_dic[i]["matchId"]
        test_df[i, :eventName] = string(subset_dic[i]["eventName"])
        test_df[i, :eventId] = subset_dic[i]["eventId"]
        test_df[i, :subEventName] = string(subset_dic[i]["subEventName"])
        test_df[i, :teamId] = subset_dic[i]["teamId"]
        test_df[i, :id] = subset_dic[i]["id"]
        test_df[i, :matchPeriod] = string(subset_dic[i]["matchPeriod"])
        test_df[i, :subEventId] = string(subset_dic[i]["subEventId"])
        test_df[i, :eventSec] = subset_dic[i]["eventSec"]
        test_df[i, :tags] = [convert.(Dict,subset_dic[i]["tags"])]
    end

    return test_df
end

# Run all the process
function process_lazy(events::String, game_id::Int)
    str = get_events(events)
    v = LazyJSON.parse(str)
    subset_test = subset_indexes_lazy(v, game_id)
    result = transform_to_df(subset_test)
end

Which gives a much more efficient process:

@time test = process_lazy("events_Italy.json", 2576335)
5.757752 seconds (8.22 M allocations: 793.463 MiB, 1.33% gc time, 3.17% compilation time)

It is 2 times faster and uses more than 3 times less memory.
@quinnj, do you have an idea of what I should change to get the same results with JSON3.jl? I’m concerned because LazyJSON.jl seems to be abandoned.

I got it down to 738 MiB by using narrower-width numeric types:

 julia> describe(df)
12×7 DataFrame
 Row │ variable      mean       min        median     max        nmissing  eltype               
     │ Symbol        Union…     Any        Union…     Any        Int64     DataType             
─────┼──────────────────────────────────────────────────────────────────────────────────────────
   1 │ playerId      91889.5    0          37745.0    376362            0  Int32
   2 │ matchId       2.57634e6  2576335    2.57634e6  2576335           0  UInt32
   3 │ eventName                "Duel"                "Shot"            0  String
   4 │ positions                                                        0  Vector{Vector{Dict}}
   5 │ eventId       5.44691    1          8.0        10                0  UInt8
   6 │ subEventName             ""                    "Whistle"         0  String
   7 │ teamId        3161.42    3161       3161.0     3162              0  Int16
   8 │ id            2.53669e8  253668302  2.53669e8  253670406         0  Int64
   9 │ matchPeriod   2.0        2          2.0        2                 0  Int8
  10 │ subEventId    57.3074    0          80.0       100               0  Int8
  11 │ eventSec      Inf        1.441      1221.5     2.936e3           0  Float16
  12 │ tags                                                             0  Vector{Vector{Dict}}

julia> @time df = process_lazy("events_Italy.json", 2576335) 
  4.342388 seconds (8.18 M allocations: 738.207 MiB, 11.12% gc time, 4.92% compilation time)
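
In case it is useful, here is roughly what the DataFrame constructor in transform_to_df becomes (a sketch only, not my exact code): the widths follow the describe(df) output above, and it assumes matchPeriod and subEventId get parsed into small integers instead of being kept as strings.

using DataFrames

# Sketch: same columns as transform_to_df, with element types narrowed to the
# value ranges shown by describe(df); adjust the widths if your data exceeds them.
function make_narrow_df(n::Int)   # hypothetical helper name
    DataFrame(:playerId => zeros(Int32, n),
        :matchId => zeros(UInt32, n),
        :eventName => fill("", n),
        :positions => [Vector{Dict}[] for _ in 1:n],
        :eventId => zeros(UInt8, n),
        :subEventName => fill("", n),
        :teamId => zeros(Int16, n),
        :id => zeros(Int64, n),
        :matchPeriod => zeros(Int8, n),
        :subEventId => zeros(Int8, n),
        :eventSec => zeros(Float16, n),
        :tags => [Vector{Dict}[] for _ in 1:n])
end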

With some more insight you can reduce it further:

const eventNames = Dict("\"Offside\"" => 9, "\"Save attempt\"" => 8, "\"Pass\"" => 1, "\"Duel\"" => 2, "\"Shot\"" => 5, "\"Others on the ball\"" => 3, "\"Interruption\"" => 4, "\"Foul\"" => 7, "\"Goalkeeper leaving line\"" => 10, "\"Free Kick\"" => 6)

test_df[i, :eventName] = get(eventNames, string(subset_dic[i]["eventName"]), 0)

That saved another 20 MiB:

julia> @time df = process_lazy("events_Italy.json", 2576335) 
4.134571 seconds (7.80 M allocations: 718.144 MiB, 11.24% gc time, 0.55% compilation time)

A similar technique can be used for :subEventName; I leave that as an exercise.
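
As a hint, one way it might look (a rough, untested sketch; not the exact code I ran): build the code table from the data itself instead of hard-coding it, and store a small integer per row instead of a String.

# Hypothetical sketch: assign each distinct sub-event name an integer code on
# first sight, so the column can hold small integers instead of Strings.
const subEventNames = Dict{String,Int}()

subevent_code(name) = get!(subEventNames, string(name), length(subEventNames) + 1)

# In transform_to_df the column would then be declared as, e.g.,
#     :subEventName => zeros(Int8, length(subset_dic))
# and filled with
#     test_df[i, :subEventName] = subevent_code(subset_dic[i]["subEventName"])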

Thank you for the tips regarding narrower-width numbers, I didn’t know about that!

JSON3.jl isn’t really designed with large file => DataFrame in mind, so I wouldn’t personally be surprised by high memory usage while doing conversions. When you call JSON3.read, it creates an in-memory structure of the entire json. JSONTables.jl just provides a bit of glue between the json representation and the “table” representation.
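
For anyone following along, that glue looks roughly like this (a toy illustration, not code from this thread; the JSON string is made up):

using DataFrames, JSON3, JSONTables

# jsontable exposes a JSON array-of-objects as a Tables.jl table; the whole
# string is still parsed into memory first.
str = """[{"matchId": 1, "playerId": 10}, {"matchId": 2, "playerId": 20}]"""
df = DataFrame(jsontable(str))   # 2×2 DataFrame with columns matchId and playerId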

There have been a few requests around this before, and I made this issue, but I just haven’t gotten around to putting serious thought into how this should look in JSON3.jl. I’d be happy to brainstorm some ideas if people have them; we could probably port the ideas from LazyJSON.jl to JSON3.jl without too much trouble.

Possibly not helpful, but I’ve had good luck with the binary-JSON BSON.jl package for large binary data dumps from FEM runs…
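
Something along these lines (a minimal, untested sketch; the path and columns are made up): pay the JSON parsing cost once, dump the extracted table to BSON, and reload it quickly later.

using BSON, DataFrames

# hypothetical small table standing in for one game's extracted events
df = DataFrame(matchId = [2576335, 2576335], eventSec = [1.4, 3.9])

BSON.bson("/tmp/events_2576335.bson", Dict(:events => df))   # write once
df2 = BSON.load("/tmp/events_2576335.bson")[:events]         # reload quickly later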

Thank you for your answer!
Yes, I understand that currently the memory footprints are still different between JSON3.jl and LazyJSON.jl.
I will stay with the highly specific LazyJSON.jl process for now then, as this is a really particular use case: transforming a huge JSON file into quite a small DataFrame.

I think you have a really good point here:

  • I have multiple large JSON files to be processed;
  • I need to process the data per subset due to the large size of these files;
  • Processing the JSON files directly, 90% of the processing time goes either to the JSON reading + DataFrame transformation (JSON3.jl) or to filtering the parsed JSON (LazyJSON.jl).

So indeed, why not either turn the last step into a query against a MongoDB database, or convert the initial JSON files to BSON files so they can be processed faster.
I will investigate this.

In the end I didn’t implement any solution involving an external database to process the data, but made a kind of “internal” database thanks to the low memory footprint of LazyJSON.jl.

I separate out the costly part of the process (parsing and converting to a Vector{Any}) so that I can then run the query for a particular game.

using DataFrames, JSON3, JSONTables, ZipFile, LazyJSON


function get_events(events::String)
    zarchive = ZipFile.Reader("/tmp/events.zip")
    dictio = Dict(zarchive.files[i].name => i for i in eachindex(zarchive.files))
    file_num = dictio[events]
    str = read(zarchive.files[file_num])
    
    return str
end

events_test = "events_Italy.json"
str = get_events(events_test)
v = LazyJSON.parse(str);
# Here is our "internal" database: we store the lazily parsed JSON as a vector
# so that we can run our per-game "query" against it.
v = convert(Vector{Any},v)


"""
    function convert_events()
"""
function convert_events(data::DataFrame)

    rename!(data, :id => :event_id,
                    :matchId => :game_id,
                    :matchPeriod => :period_id,
                    :eventSec => :milliseconds,
                    :teamId => :team_id,
                    :playerId => :player_id,
                    :eventId => :type_id,
                    :eventName => :type_name,
                    :subEventId => :subtype_id,
                    :subEventName => :subtype_name)


    data[:, :milliseconds] = data[:,:milliseconds] * 1000
    return data
end



# this is our "request" function
function subset_indexes_lazy(v, game_id::Int)
    json_data = []
    @inbounds for i in eachindex(v)
        if v[i]["matchId"] == game_id 
            push!(json_data, v[i])
        end
    end    
    return json_data
end



function transform_to_df(json_subset)
    subset_dic = convert.(Dict, json_subset)

    test_df = DataFrame(:playerId => zeros(length(subset_dic)),
        :matchId => zeros(length(subset_dic)),
        :eventName => "",
        :positions => [Vector{Dict}[] for i in eachindex(subset_dic)],
        :eventId => zeros(length(subset_dic)),
        :subEventName => "",
        :teamId => zeros(length(subset_dic)),
        :id => zeros(length(subset_dic)),
        :matchPeriod => "",
        :subEventId => "",
        :eventSec => zeros(length(subset_dic)),
        :tags => [Vector{Dict}[] for i in eachindex(subset_dic)])

    for i in eachindex(test_df.playerId)
        test_df[i, :playerId] = subset_dic[i]["playerId"]
        test_df[i, :positions] = [convert.(Dict,subset_dic[i]["positions"])]
        test_df[i, :matchId] = subset_dic[i]["matchId"]
        test_df[i, :eventName] = string(subset_dic[i]["eventName"])
        test_df[i, :eventId] = subset_dic[i]["eventId"]
        test_df[i, :subEventName] = string(subset_dic[i]["subEventName"])
        test_df[i, :teamId] = subset_dic[i]["teamId"]
        test_df[i, :id] = subset_dic[i]["id"]
        test_df[i, :matchPeriod] = string(subset_dic[i]["matchPeriod"])
        test_df[i, :subEventId] = string(subset_dic[i]["subEventId"])
        test_df[i, :eventSec] = subset_dic[i]["eventSec"]
        test_df[i, :tags] = [convert.(Dict,subset_dic[i]["tags"])]
    end

    return test_df
end

# """
#     function events()
# Return a Dataframe with all events in the game
# """
function events(events_Italy, game_id::Int)
    subset_test = subset_indexes_lazy(events_Italy, game_id)
    result = transform_to_df(subset_test)
    data = convert_events(result)
    return data
end



With the following result in terms of processing time:

@time events(v, 2576336)
2.015401 seconds (4.70 M allocations: 112.659 MiB)
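
Keeping v in memory, any other game from the same file can then be requested directly, for example:

# with v already parsed and converted, each additional game only pays the
# filtering + DataFrame cost
game_a = events(v, 2576335)
game_b = events(v, 2576336)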