Finally I didn’t implemented any solution involving directly an external database in order to process the data, but made a king of “internal” database thanks to the low memory footprint of LazyJSON.jl.
I separate the costly part of the process (parsing and converting to a Vector{Any} in order to be able to run the request for a particular game.
using DataFrames, JSON3, JSONTables, ZipFile, LazyJSON
function get_events(events::String)
zarchive = ZipFile.Reader("/tmp/events.zip")
dictio = Dict(zarchive.files[i].name => i for i in eachindex(zarchive.files))
file_num = dictio[events]
str = read(zarchive.files[file_num])
return str
end
events_test = "events_Italy.json"
str = get_events(events_test)
v = LazyJSON.parse(str);
# Here is our "internal" database: we store the JSON lazily parsed as a vector in order to make our "request" within it.
v = convert(Vector{Any},v)
"""
function convert_events()
"""
function convert_events(data::DataFrame)
rename!(data, :id => :event_id,
:matchId => :game_id,
:matchPeriod => :period_id,
:eventSec => :milliseconds,
:teamId => :team_id,
:playerId => :player_id,
:eventId => :type_id,
:eventName => :type_name,
:subEventId => :subtype_id,
:subEventName => :subtype_name)
data[:, :milliseconds] = data[:,:milliseconds] * 1000
return data
end
# this is our "request" function
function subset_indexes_lazy(v, game_id::Int)
json_data = []
@inbounds for i in eachindex(v)
if v[i]["matchId"] == game_id
push!(json_data, v[i])
end
end
return json_data
end
function transform_to_df(json_subset)
subset_dic = convert.(Dict, json_subset)
test_df = DataFrame(:playerId => zeros(length(subset_dic)),
:matchId => zeros(length(subset_dic)),
:eventName => "",
:positions => [Vector{Dict}[] for i in eachindex(subset_dic)],
:eventId => zeros(length(subset_dic)),
:subEventName => "",
:teamId => zeros(length(subset_dic)),
:id => zeros(length(subset_dic)),
:matchPeriod => "",
:subEventId => "",
:eventSec => zeros(length(subset_dic)),
:tags => [Vector{Dict}[] for i in eachindex(subset_dic)])
for i in eachindex(test_df.playerId)
test_df[i, :playerId] = subset_dic[i]["playerId"]
test_df[i, :positions] = [convert.(Dict,subset_dic[i]["positions"])]
test_df[i, :matchId] = subset_dic[i]["matchId"]
test_df[i, :eventName] = string(subset_dic[i]["eventName"])
test_df[i, :eventId] = subset_dic[i]["eventId"]
test_df[i, :subEventName] = string(subset_dic[i]["subEventName"])
test_df[i, :teamId] = subset_dic[i]["teamId"]
test_df[i, :id] = subset_dic[i]["id"]
test_df[i, :matchPeriod] = string(subset_dic[i]["matchPeriod"])
test_df[i, :subEventId] = string(subset_dic[i]["subEventId"])
test_df[i, :eventSec] = subset_dic[i]["eventSec"]
test_df[i, :tags] = [convert.(Dict,subset_dic[i]["tags"])]
end
return test_df
end
# """
# function events()
# Return a Dataframe with all events in the game
# """
function events(events_Italy, game_id::Int)
subset_test = subset_indexes_lazy(events_Italy, game_id)
result = transform_to_df(subset_test)
data = convert_events(result)
return data
end
With the result in terms of processing time:
@time events(v, 2576336)
2.015401 seconds (4.70 M allocations: 112.659 MiB)