Given a list of observations (simulation, trial, result), find the list of “best” results, one for each simulation.
With DataFrames.jl and its `source => function` minilanguage, the problem is solved by the `combine(groupby(...))` call at the end of the following snippet.
using DataFrames
# Benchmark data: 10^6 simulations, each with exactly 4 trials.
s = repeat(1:10^6; inner=4)    # simulation id, every id repeated 4 times in a row
t = repeat(1:4; outer=10^6)    # trial number, cycling 1, 2, 3, 4
r = rand(4 * 10^6)             # one random result per (simulation, trial) pair
df = DataFrame(s=s, t=t, r=r)
# Reference solution: the largest `r` within each group of `s`.
combine(groupby(df, :s), :r => maximum => :r_maximum)
Alternatively, use this to generate a test data frame with a variable number of trials (3 to 8) per simulation:
# Alternative test data: every simulation gets a random number (3 to 8) of trials.
df = DataFrame(s=Int[], t=Int[], r=Float64[])
for i in 1:10^6, t in 1:rand(3:8)
    # One row per (simulation, trial) with a random result.
    push!(df, (i, t, rand()))
end
I have tried several different approaches to match the performance of the DataFrames solution, but none of them has come close.
What solution could be a competitor that can beat or match the performance of DataFrames?
Below is a list of my attempts, none of which is even remotely competitive:
using DataFrames
# Input columns for the attempts below: 10^6 simulations with 4 results each and a
# random trial label in 1:100.
s = repeat(1:10^6, inner=4)
t = rand(1:100, 4 * 10^6)
r = rand(4 * 10^6)
# Rebuild `df` from exactly these columns so that the `mgr*(df)` attempts, `Matrix(df)`
# and the column-based attempts (`mgr4(s, r)`, `mgr5(s, r, t)`) all see the same data.
df = DataFrame(; s, t, r)
"""
    mgr(df)

Return the rows of `df` that hold the largest `r` within each group of `s`.

The groups are iterated one by one; for each group the position of its maximum `r`
(first occurrence on ties) is translated back to a row number of `df`.
"""
function mgr(df)
    groups = groupby(df, :s)
    winners = Vector{Int}(undef, length(groups))
    for (k, sub) in enumerate(groups)
        # The first entry of `parentindices` lists the rows of `df` backing this group.
        rowmap = first(parentindices(sub))
        winners[k] = rowmap[argmax(sub.r)]
    end
    return df[winners, :]
end
"""
    mgr000(df)

Comprehension-based variant of the per-group search: return the rows of `df` with the
largest `r` within each group of `s`.
"""
function mgr000(df)
    groups = groupby(df, :s)
    # For every group, map the local position of its maximum `r` back to a row of `df`.
    winners = [first(parentindices(sub))[argmax(sub.r)] for sub in groups]
    return df[winners, :]
end
"""
    mgr00(df)

Return the rows with the largest `r` within each group of `s`, using an explicit `idx`
column of row numbers to map each group's maximum back to a row.

The returned data frame contains the helper column `idx`. The helper column is added to
a shallow copy (columns are shared, not copied), so the caller's `df` is left untouched.
"""
function mgr00(df)
    # Shallow copy: adding `idx` here must not add (or overwrite) a column in `df`.
    tagged = DataFrame(df; copycols=false)
    tagged.idx = 1:nrow(tagged)
    groups = groupby(tagged, :s)
    winners = Vector{Int}(undef, length(groups))
    for (k, sub) in enumerate(groups)
        # `sub.idx` holds the original row numbers of the rows in this group.
        winners[k] = sub.idx[argmax(sub.r)]
    end
    return tagged[winners, :]
end
"""
    mgr0(df)

Return a `Dict{Int,Float64}` mapping every value of `s` to the largest `r` observed with
it, iterating over the rows of `df` as named tuples.

The running maximum of a group is seeded with the first `r` seen for that group, so
groups whose results are all negative are handled correctly.
"""
function mgr0(df)
    best = Dict{Int,Float64}()
    for row in Tables.namedtupleiterator(df)
        key, val = row.s, row.r
        # `get` (not `get!`) avoids inserting a placeholder that is overwritten right away.
        best[key] = max(get(best, key, val), val)
    end
    return best
end
"""
    mgr1(df)

Return a `Dict{Int,Float64}` mapping every value of `s` to the largest `r` observed with
it, iterating over `eachrow(df)`.

The running maximum of a group is seeded with the first `r` seen for that group, so
groups whose results are all negative are handled correctly.
"""
function mgr1(df)
    best = Dict{Int,Float64}()
    # NOTE(review): the fields of the rows produced by `eachrow` are presumably not
    # inferrable by the compiler, which would explain why this variant is slow — confirm.
    for row in eachrow(df)
        key, val = row.s, row.r
        best[key] = max(get(best, key, val), val)
    end
    return best
end
"""
    mgr2(df)

Return a `Dict{Int,Float64}` mapping every value of `df.s` to the largest `df.r`
observed with it, using an index loop over the two columns.
"""
function mgr2(df)
    # Look the columns up once and pass them through a function barrier: the kernel is
    # then compiled for the concrete column types instead of re-resolving `df.s` and
    # `df.r` on every iteration.
    return _mgr2_kernel(df.s, df.r)
end

# Index-loop kernel of `mgr2`: running maximum of `r` per key in `s`.
function _mgr2_kernel(s, r)
    best = Dict{Int,Float64}()
    for i in eachindex(s, r)
        key = s[i]
        val = r[i]
        # Seeding with `val` itself makes the first occurrence of a key its own maximum.
        best[key] = max(get(best, key, val), val)
    end
    return best
end
"""
    mgr3(df)

Return a `Dict{Int,Float64}` mapping every value of `df.s` to the largest `df.r`
observed with it, walking the two columns in lockstep.

The running maximum of a group is seeded with the first `r` seen for that group, so
groups whose results are all negative are handled correctly.
"""
function mgr3(df)
    # Function barrier: resolve the columns once, then run the loop in a kernel that is
    # specialized on the concrete column types.
    return _mgr3_kernel(df.s, df.r)
end

# Lockstep kernel of `mgr3`: running maximum of `r` per key in `s`.
function _mgr3_kernel(s, r)
    best = Dict{Int,Float64}()
    for (key, val) in zip(s, r)
        best[key] = max(get(best, key, val), val)
    end
    return best
end
"""
    mgr4(s, r)

Return a `Dict{Int,Float64}` mapping every key in `s` to the largest value of `r` found
at the positions where that key occurs. `r` is indexed with the indices of `s`.
"""
function mgr4(s, r)
    best = Dict{Int,Float64}()
    for idx in eachindex(s)
        key = s[idx]
        val = r[idx]
        # A key seen for the first time falls back to its own value.
        best[key] = max(get(best, key, val), val)
    end
    return best
end
# Matrix copy of `df` for `mgrM`, which reads column 1 as the group key and column 3 as
# the result.
# NOTE(review): presumably the integer columns are promoted to Float64 by this
# conversion, which is why `mgrM` uses Float64 keys — confirm.
mdf=Matrix(df)
"""
    mgrM(mdf)

Return a `Dict{Float64,Float64}` mapping every key in column 1 of the matrix `mdf` to
the largest value found in column 3 on the rows carrying that key.

The running maximum of a group is seeded with the first value seen for that group, so
groups whose values are all negative are handled correctly.
"""
function mgrM(mdf)
    best = Dict{Float64,Float64}()
    for i in axes(mdf, 1)
        # Index the two needed entries directly instead of building a view of each row.
        key = mdf[i, 1]
        val = mdf[i, 3]
        best[key] = max(get(best, key, val), val)
    end
    return best
end
using SplitApplyCombine
# One tuple per observation, ordered (s, r, t) so that lexicographic sorting ranks the
# observations by simulation first and by result second.
vt = tuple.(s, r, t)
# Group the sorted positions by simulation; the last tuple of every group then carries
# that simulation's largest `r`. `vt` is referenced without a `$` prefix: `$`
# interpolation is only valid inside a quoted expression or a macro call.
map(last, group(x -> vt[x][1], x -> vt[x], sortperm(vt)))
using StructArrays
"""
    mgr5(s, r, t)

Sort the observations `(s, r, t)` and return, for every distinct value of `s`, the last
observation of its run in the sorted order, i.e. the one with the largest `r`.

The result is a `StructArray` with fields `s`, `r` and `t`; it is empty when the inputs
are empty.
"""
function mgr5(s, r, t)
    sorted = sort(StructArray((; s, r, t)))
    sims = sorted.s
    # A row closes its group when it is the final row or the next row has another key.
    # Comparing neighbours directly needs no subtraction and yields no index for empty
    # input.
    lastrows = [i for i in eachindex(sims) if i == lastindex(sims) || sims[i] != sims[i + 1]]
    return sorted[lastrows]
end
Besides proposals that are faster than combine(groupby(…)…), I would also appreciate comments on the attempts above: which of them, while valid in principle, could achieve better results with a different implementation?