Depending on the context, a lighter-weight table type seems to outperform DataFrames once the cost of constructing the table is included:
Setup
# Benchmark columns, each of length 4 * 10^6.
# `s`: group key; every value in 1:10^6 appears four times in a row.
const s = repeat(1:10^6; inner=4)
# `t`: the sequence 1, 2, 3, 4 repeated 10^6 times.
const t = repeat(1:4, 10^6)
# `r`: random Float64 values drawn by `rand` (unseeded, so they differ between sessions).
const r = rand(4 * 10^6)
using DataFrames
# Pre-built DataFrame with columns :s, :t and :r, used by the "group only" benchmark.
const df = DataFrame(s=s, t=t, r=r)
# Build a DataFrame from the columns `s`, `t`, `r` and return the maximum of `r`
# within each group of `s`.
#
# This is the DataFrames counterpart of `tbl_trial`: both include the cost of
# constructing the table and both group on `s` and reduce `r`, which also matches
# the "group only" call `combine(groupby(df, :s), :r=>maximum)`.
#
# Arguments:
# - s: group key per row.
# - t: extra column, stored in the table but not used by the aggregation.
# - r: value per row whose per-group maximum is computed.
#
# Returns the DataFrame produced by `combine`.
function df_trial(s,t,r)
    frame = DataFrame(; s, t, r)
    # Group on :s and reduce :r. Grouping on :r would give each distinct random
    # value its own group and would measure a different computation.
    return combine(groupby(frame, :s), :r => maximum)
end
using TypedTables
# Pre-built TypedTables Table with columns s, t and r, used by the "group only" benchmark.
const tbl = Table(; s, t, r)
# Fold `r` into `group_maxima`, keeping for every key `s[i]` the largest `r[i]` seen.
#
# Arguments:
# - group_maxima: dictionary updated in place; maps a key taken from `s` to its
#   running maximum.
# - s: group key per row.
# - r: value per row; must have the same indices as `s`.
#
# Returns `nothing`. Throws `DimensionMismatch` when `s` and `r` do not share the
# same indices.
function inner_loop!(group_maxima, s, r)
    # Iterate over the indices common to both arrays so that `@inbounds` is valid
    # for `r[i]` as well as `s[i]`; mismatched inputs raise instead of being read
    # out of bounds.
    for i in eachindex(s, r)
        @inbounds k = s[i]
        # A key that has not been seen yet starts from -Inf, the identity for `max`.
        @inbounds group_maxima[k] = max(get(group_maxima, k, -Inf), r[i])
    end
    return nothing
end
# This is the same approach as @johnmyleswhite's version, but it uses TypedTables instead
# Compute the maximum of column `r` for each distinct value of column `s`.
#
# `df` is any object exposing `s` and `r` as properties. The running maxima are
# accumulated in a Dict by `inner_loop!`, and the result is returned as a Table
# with columns `s` (the group keys) and `r_maximum` (the per-group maxima). Rows
# follow the Dict's iteration order.
function custom_maximum(df)
    key_column = df.s
    value_column = df.r
    maxima_by_key = Dict{eltype(key_column), Float64}()
    inner_loop!(maxima_by_key, key_column, value_column)
    group_keys = collect(keys(maxima_by_key))
    group_values = collect(values(maxima_by_key))
    return Table(s = group_keys, r_maximum = group_values)
end
# Build a Table from the columns `s`, `t`, `r` and pass it straight to
# `custom_maximum`, so the measured work includes constructing the table.
function tbl_trial(s,t,r)
    return custom_maximum(Table(; s, t, r))
end
# Call each of the four benchmarked expressions once before timing them
# (presumably so that they are already compiled when benchmarked).
tbl_trial(s, t, r)
df_trial(s, t, r)
combine(groupby(df, :s), :r => maximum)
custom_maximum(tbl)
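# create and group a DataFrame
# (NOTE: this run was recorded with df_trial as originally posted, which groups by :r and
# takes the maximum of :s, unlike the other three runs, which group by :s and take the
# maximum of :r; its timings are therefore not directly comparable with them)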
julia> @benchmark df_trial($s,$t,$r)
BenchmarkTools.Trial: 24 samples with 1 evaluation.
Range (min … max): 201.259 ms … 234.272 ms ┊ GC (min … max): 4.96% … 9.57%
Time (median): 217.930 ms ┊ GC (median): 8.59%
Time (mean ± σ): 215.847 ms ± 10.645 ms ┊ GC (mean ± σ): 7.98% ± 2.94%
▃ ▃ ▃ █ ▃
█▇▇▁▁▁▇▇▁▇█▁▁▁▁▁▁▇▁▁▁▇▁▁▁▁▁▁▁▇▁▇▁▁█▁▁▁▁▁█▁▇▁▇▁▁▁▁▁█▁▁▁▁▁▁▁▇▁▇ ▁
201 ms Histogram: frequency by time 234 ms <
Memory estimate: 311.97 MiB, allocs estimate: 308.
# create and group a TypedTables Table
julia> @benchmark tbl_trial($s,$t,$r)
BenchmarkTools.Trial: 34 samples with 1 evaluation.
Range (min … max): 140.362 ms … 165.967 ms ┊ GC (min … max): 0.00% … 5.27%
Time (median): 151.275 ms ┊ GC (median): 3.38%
Time (mean ± σ): 151.464 ms ± 5.304 ms ┊ GC (mean ± σ): 2.77% ± 1.79%
█ ▃ ▃ █▃ ▃▃ ▃
▇▁▁▁▁▇▁▁▁▁▇▇▁▁▁▁▇▇▇█▇▁█▇█▇██▇▇▁██▇▁▇▁▁▁▇▁▁▁▁▁▁▇▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
140 ms Histogram: frequency by time 166 ms <
Memory estimate: 80.43 MiB, allocs estimate: 58.
# just group an existing DataFrame
julia> @benchmark combine(groupby($df, :s), :r=>maximum)
BenchmarkTools.Trial: 157 samples with 1 evaluation.
Range (min … max): 27.072 ms … 41.158 ms ┊ GC (min … max): 0.00% … 22.72%
Time (median): 30.126 ms ┊ GC (median): 0.00%
Time (mean ± σ): 32.019 ms ± 3.704 ms ┊ GC (mean ± σ): 8.71% ± 9.68%
▃▂▄█▇▂ ▄▂ ▂ ▂
▆▁▅███████▇██▇▁▃▆▃▃▆▃▅▃▃▃▆▃▃▁▃▁▅▃▅█▇▇█▆▅▁▅█▇▃▆▃█▁▁▁▁▁▁▃▃▁▁▃ ▃
27.1 ms Histogram: frequency by time 40.6 ms <
Memory estimate: 55.33 MiB, allocs estimate: 327.
# Just group an existing Table
julia> @benchmark custom_maximum($tbl)
BenchmarkTools.Trial: 34 samples with 1 evaluation.
Range (min … max): 142.760 ms … 172.040 ms ┊ GC (min … max): 0.00% … 2.32%
Time (median): 150.780 ms ┊ GC (median): 3.56%
Time (mean ± σ): 151.268 ms ± 5.719 ms ┊ GC (mean ± σ): 2.94% ± 1.97%
▂ █
▅█▁▅▁▅▅▅▅▅▁▅▅▅█▅▅█▅▅█▁▁▁▁▅▅▁█▁▅▁▁▅▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅ ▁
143 ms Histogram: frequency by time 172 ms <
Memory estimate: 80.43 MiB, allocs estimate: 58.