Fastest way to create new column in DataFrames.jl

I found 3 ways to create new columns in DataFrames.jl (two of which are equivalent)

# Build a table with two columns, `a` and `b`, of one million random values each.
df = DataFrame(a = rand(1_000_000), b= rand(1_000_000))

# Way 1: transform! with an anonymous function. Each statement is timed twice
# because the first @time of a call also includes compilation.
# NOTE(review): each `(a,b)->a + b` literal presumably defines a new anonymous
# function, so the second line may trigger compilation again -- confirm.
@time transform!(df, [:a, :b] => (a,b)->a + b)
@time transform!(df, [:a, :b] => (a,b)->a + b)

# Way 2: select! asking for all existing columns (`:`) plus the new one.
@time select!(df, :, [:a, :b] => (a,b)->a + b)
@time select!(df, :, [:a, :b] => (a,b)->a + b) # 0.1s

# Way 3: assign the broadcasted sum of the two columns directly to column `:c`.
@time df[!, :c] = df[!, :a] .+ df[!, :b] #0.001s
@time df[!, :c] = df[!, :a] .+ df[!, :b]

The last method is much faster when measured with @time, but the difference disappears when measured with @benchmark.

Why are the @time results so much worse for the first two methods? When doing data manipulation I don’t tend to wrap operations in small functions like this:

# Wrap the transform! call in a function: adds the sum of columns `:a` and `:b`
# to `df` in place and returns the result of transform!.
function sumab_df(df)
    return transform!(df, [:a, :b] => (x, y) -> x + y)
end

# Timed twice: the first call also includes compilation.
@time sumab_df(df)
@time sumab_df(df)

So I wrote a function `meh` to see why the first two approaches are slower, and arrived at the following performance advice:

  1. Use df[!, :c] = fn(df[!, :a], df[!, :b]) where possible, so you get good performance without having to put operations in functions.
  2. Use a named function rather than an anonymous function where possible.

Is this advice on point?

Benchmarking code

# Benchmark 1: transform! with an anonymous function (`$df` interpolates the global).
@benchmark transform!($df, [:a, :b] => (a,b)->a + b)
# BenchmarkTools.Trial:
#   memory estimate:  7.63 MiB
#   allocs estimate:  97
#   --------------
#   minimum time:     1.638 ms (0.00% GC)
#   median time:      1.835 ms (0.00% GC)
#   mean time:        2.429 ms (16.38% GC)
#   maximum time:     11.841 ms (48.65% GC)
#   --------------
#   samples:          2058
#   evals/sample:     1
# Benchmark 2: select! with all existing columns plus the same anonymous function.
@benchmark select!($df, :, [:a, :b] => (a,b)->a + b)
# BenchmarkTools.Trial: 
#   memory estimate:  7.63 MiB
#   allocs estimate:  98
#   --------------
#   minimum time:     1.619 ms (0.00% GC)
#   median time:      1.759 ms (0.00% GC)
#   mean time:        2.316 ms (16.70% GC)
#   maximum time:     11.561 ms (48.30% GC)
#   --------------
#   samples:          2160
#   evals/sample:     1
# Benchmark 3: same as benchmark 2, but the anonymous function uses broadcasted `.+`.
@benchmark select!($df, :, [:a, :b] => (a,b)->a .+ b)
# BenchmarkTools.Trial: 
#   memory estimate:  7.63 MiB
#   allocs estimate:  98
#   --------------
#   minimum time:     1.642 ms (0.00% GC)
#   median time:      1.760 ms (0.00% GC)
#   mean time:        2.321 ms (16.66% GC)
#   maximum time:     11.834 ms (40.55% GC)
#   --------------
#   samples:          2154
#   evals/sample:     1

# Direct column assignment. `$df` interpolates the global into the benchmark so
# that access to an untyped global variable is not part of the measurement,
# matching the other benchmarks in this post.
# The figures below were recorded from the original call written without `$`.
@benchmark $df[!, :c] = $df[!, :a] .+ $df[!, :b]
# BenchmarkTools.Trial: 
#   memory estimate:  7.63 MiB
#   allocs estimate:  5
#   --------------
#   minimum time:     1.605 ms (0.00% GC)
#   median time:      1.731 ms (0.00% GC)
#   mean time:        2.291 ms (16.50% GC)
#   maximum time:     11.518 ms (48.88% GC)
#   --------------
#   samples:          2180
#   evals/sample:     1
# Another way
"""
    meh(df, (cols, fn), outcol)

Apply `fn` element-wise across the columns of `df` selected by `cols` and store
the result in column `outcol` of `df`. The second argument is destructured into
`cols` and `fn`, so a `cols => fn` pair can be passed. Returns `df`.
"""
function meh(df, (cols, fn), outcol)
    # `Ref(df)` keeps the table itself from being broadcast over; only `cols` is iterated.
    inputs = broadcast(getindex, Ref(df), !, cols)
    df[!, outcol] = broadcast(fn, inputs...)
    return df
end

# passing in an anonymous function is slower
# (`$df` interpolates the global so that access to an untyped global variable
# is not part of the measurement)
@benchmark meh($df, ([:a, :b] => (a, b)-> a+b), :c)

# Named two-argument addition; the named counterpart of `(a, b) -> a + b`.
function sumab(a, b)
    return a + b
end
# passing in a named function is faster
# (`$df` interpolates the global so that access to an untyped global variable
# is not part of the measurement)
@benchmark meh($df, ([:a, :b] => sumab), :c)

# Direct assignment with the named function broadcast over the two columns.
@benchmark $df[!, :c] = sumab.($df[!, :a], $df[!, :b])

1 Like