I have a simulation model that iteratively produces NamedTuples, which I want to aggregate (by summing) into a DataFrame. Only a portion of the columns is updated in each step, as I keep separate statistics for the different simulation periods (years).
I read that I may want to avoid creating intermediate DataFrames, thus I work with a NamedTuple.
The MWE below is a bit lengthy, but my question is simple: how can I improve `aggregate_yearly_metrics!`?
It does not seem to be type-stable right now…
using DataFrames
# Template of the metrics produced by one simulation step. Its field names and value
# types are reused below to name and type the columns of `aggDf`.
aggregationTuple = (p_volume=0, s_volume=0, floatvar=0.0, intvar=0)
aggDf = DataFrames.DataFrame()
yearlist = 1:2  # only iterated over, so the range does not need to be materialised

# Column-name lookup: year => (metric name => column name),
# e.g. colNamesOfAggDf[1][:p_volume] == :p_volume_y1.
colNamesOfAggDf = Dict{Int,Dict{Symbol,Symbol}}()
for yr in yearlist
    # A fresh Dict is built for every year, so it can be stored directly without a copy.
    colNamesOfAggDf[yr] = Dict{Symbol,Symbol}(
        symb => Symbol(symb, "_y", yr) for symb in keys(aggregationTuple)
    )
end
n = 100  # number of rows of `aggDf`; the `:iteration` column numbers them 1:n
aggDf[!, :iteration] = collect(1:n)

# Add one zero-initialised accumulator column per (year, metric) pair. Each column takes
# its element type from the matching template value, so Int metrics get Int columns and
# Float64 metrics get Float64 columns. Iterating the NamedTuple (rather than the Dict)
# creates the columns in a deterministic order.
for yr in yearlist
    for (tplsymb, val) in pairs(aggregationTuple)
        aggDf[!, colNamesOfAggDf[yr][tplsymb]] = zeros(typeof(val), n)
    end
end
"""
    aggregate_yearly_metrics!(aggDfRowNumber, colNamesOfAggDf, yr, aggDf, aggregationTuple)

Add every value of `aggregationTuple` to row `aggDfRowNumber` of the matching column of
`aggDf` for year `yr`.

# Arguments
- `aggDfRowNumber::Int`: row of `aggDf` that is updated.
- `colNamesOfAggDf`: lookup such that `colNamesOfAggDf[yr][k]` is the name of the column
  of `aggDf` that accumulates field `k` of `aggregationTuple`.
- `yr::Int`: key into `colNamesOfAggDf` selecting the set of columns to update.
- `aggDf::DataFrames.DataFrame`: table that is modified in place.
- `aggregationTuple::NamedTuple`: values to add.

# Returns
`nothing`.
"""
function aggregate_yearly_metrics!(
    aggDfRowNumber::Int,
    colNamesOfAggDf,
    yr::Int,
    aggDf::DataFrames.DataFrame,
    aggregationTuple::NamedTuple,
)
    # The per-year lookup does not depend on the field, so fetch it once.
    colnames = colNamesOfAggDf[yr]
    for (k, v) in pairs(aggregationTuple)
        # Function barrier: the column's element type is not known to the compiler here,
        # so the update is done in a helper that is specialised on the concrete column
        # and value types.
        aggregate_yearly_metrics_internal!(aggDf[!, colnames[k]], aggDfRowNumber, v)
    end
    return nothing
end
"""
    aggregate_yearly_metrics_internal!(dfcol, aggDfRowNumber, v)

Add `v` to the element of `dfcol` at index `aggDfRowNumber`, in place. Returns `nothing`.
"""
function aggregate_yearly_metrics_internal!(dfcol, aggDfRowNumber, v)
    dfcol[aggDfRowNumber] += v
    return nothing
end
"""
    sim(n_sims, iterations, colNamesOfAggDf, aggDf)

For each row index in `1:iterations`, generate `n_sims` random metric tuples, each
assigned to year 1 or 2, and accumulate them into that row of `aggDf` via
`aggregate_yearly_metrics!`. Returns `aggDf`.
"""
function sim(n_sims, iterations, colNamesOfAggDf, aggDf)
    for row in 1:iterations, _ in 1:n_sims
        # Year 1 if the uniform draw is below 0.5, otherwise year 2.
        year = rand() < 0.5 ? 1 : 2
        # Generate some data (originally stemming from a simulation model).
        metrics = (p_volume=1, s_volume=rand(Int), floatvar=rand(), intvar=0)
        # Accumulate the data into the row of the current iteration.
        aggregate_yearly_metrics!(row, colNamesOfAggDf, year, aggDf, metrics)
    end
    return aggDf
end
aggDf  # bare expression: displays the table when evaluated at the REPL
# Note: `sim` adds into `aggDf` in place, so every call (including the benchmark samples
# below) keeps accumulating on top of the previous totals.
sim(20_000, n, colNamesOfAggDf, aggDf)
@code_warntype aggregate_yearly_metrics!(3, colNamesOfAggDf, 1, aggDf, aggregationTuple)
using BenchmarkTools
# Interpolate the non-constant globals with `$` so that the benchmark measures `sim`
# itself rather than the overhead of accessing untyped global variables.
@benchmark sim(20_000, $n, $colNamesOfAggDf, $aggDf)