Compiler optimizations around DataFrames

I understand that to get type-stable code with DataFrames.jl, you need to use Tables.columntable to convert the DataFrame into a NamedTuple of columns. I was working on a Pluto notebook that tried to show the memory allocation behavior, but eventually I got pretty lost, as I can't build a consistent understanding of how Julia's compilation works.
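
For reference, here is the kind of function-barrier pattern I mean. A minimal sketch (sum_all and f_columntable are just names I made up), assuming a DataFrame whose columns can all be summed:

using DataFrames, Tables

# the hot loop only ever sees a concretely typed NamedTuple of column vectors
function sum_all(tbl)
	res = zero(eltype(first(tbl)))
	for col in tbl
		res += sum(col)
	end
	res
end

# function barrier: convert once, then dispatch on the concrete column types
f_columntable(df::DataFrame) = sum_all(Tables.columntable(df))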

The test script I used and its output under Julia 1.10.4 are shown below:

using DataFrames, InteractiveUtils

function f_nested(df::DataFrame)
	res = 0
	for i in 1:nrow(df)
		for j in 1:ncol(df)
			res += df[i, j]
		end
	end
	res
end
function f_bycol(df::DataFrame)
	res = 0
	for col in eachcol(df)
		res += sum(col)
	end
	res
end
function f_byrow(df::DataFrame)
	res = 0
	for row in eachrow(df)
		res += sum(row)
	end
	res
end
function run()
	dfi_small = DataFrame(:a => Int.(collect(1:30)))
	dff_small = DataFrame(:a => Float32.(collect(1:30)))

	dfi = DataFrame(:a => Int.(collect(1:100)))
	dff = DataFrame(:a => Float32.(collect(1:100)))
	@code_warntype f_nested(dff)

	f_nested(dfi)
	f_bycol(dfi)
	f_byrow(dfi)

	f_nested(dff)	# warm up the Float32 versions separately too, or the @allocated numbers below will include memory allocated during compilation
	f_bycol(dff)
	f_byrow(dff)
	
	"""
f_nested(dfi_small) allocated $(@allocated f_nested(dfi_small)) 
f_bycol(dfi_small) allocated $(@allocated f_bycol(dfi_small)) 
f_byrow(dfi_small) allocated $(@allocated f_byrow(dfi_small)) 

f_nested(dff_small) allocated $(@allocated f_nested(dff_small)) 
f_bycol(dff_small) allocated $(@allocated f_bycol(dff_small)) 
f_byrow(dff_small) allocated $(@allocated f_byrow(dff_small)) 

f_nested(dfi) allocated $(@allocated f_nested(dfi)) 
f_bycol(dfi) allocated $(@allocated f_bycol(dfi)) 
f_byrow(dfi) allocated $(@allocated f_byrow(dfi)) 

f_nested(dff) allocated $(@allocated f_nested(dff)) 
f_bycol(dff) allocated $(@allocated f_bycol(dff)) 
f_byrow(dff) allocated $(@allocated f_byrow(dff))
	"""
end
print(run())

MethodInstance for f_nested(::DataFrame)
  from f_nested(df::DataFrame) @ Main C:\Users\kirby\MyDrive\Documents\fourthwave\scripts\misc\test_allocated.jl:3
Arguments
  #self#::Core.Const(f_nested)
  df::DataFrame
Locals
  @_3::Union{Nothing, Tuple{Int64, Int64}}
  res::Any
  @_5::Union{Nothing, Tuple{Int64, Int64}}
  i::Int64
  j::Int64
Body::Any
1 ─       (res = 0)
β”‚   %2  = Main.nrow(df)::Int64
β”‚   %3  = (1:%2)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
β”‚         (@_3 = Base.iterate(%3))
β”‚   %5  = (@_3 === nothing)::Bool
β”‚   %6  = Base.not_int(%5)::Bool
└──       goto #7 if not %6
2 β”„ %8  = @_3::Tuple{Int64, Int64}
β”‚         (i = Core.getfield(%8, 1))
β”‚   %10 = Core.getfield(%8, 2)::Int64
β”‚   %11 = Main.ncol(df)::Int64
β”‚   %12 = (1:%11)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
β”‚         (@_5 = Base.iterate(%12))
β”‚   %14 = (@_5 === nothing)::Bool
β”‚   %15 = Base.not_int(%14)::Bool
└──       goto #5 if not %15
3 β”„ %17 = @_5::Tuple{Int64, Int64}
β”‚         (j = Core.getfield(%17, 1))
β”‚   %19 = Core.getfield(%17, 2)::Int64
β”‚   %20 = res::Any
β”‚   %21 = Base.getindex(df, i, j)::Any
β”‚         (res = %20 + %21)
β”‚         (@_5 = Base.iterate(%12, %19))
β”‚   %24 = (@_5 === nothing)::Bool
β”‚   %25 = Base.not_int(%24)::Bool
└──       goto #5 if not %25
4 ─       goto #3
5 β”„       (@_3 = Base.iterate(%3, %10))
β”‚   %29 = (@_3 === nothing)::Bool
β”‚   %30 = Base.not_int(%29)::Bool
└──       goto #7 if not %30
6 ─       goto #2
7 β”„       return res

f_nested(dfi_small) allocated 0 
f_bycol(dfi_small) allocated 0
f_byrow(dfi_small) allocated 960

f_nested(dff_small) allocated 960
f_bycol(dff_small) allocated 32
f_byrow(dff_small) allocated 2400

f_nested(dfi) allocated 1104
f_bycol(dfi) allocated 32
f_byrow(dfi) allocated 4304

f_nested(dff) allocated 3200
f_bycol(dff) allocated 32
f_byrow(dff) allocated 8000

There are two observations I don’t understand:

  • Even though @code_warntype shows type-unstable code, Julia 1.10 is able to eliminate the memory allocations for the small integer column (but not for the small Float32 column).
  • Even though only one method instance is created for each of the three functions, calling it first with an integer and then with a Float32 DataFrame results in two separate compilations, as suggested by the amount of memory allocated. You can verify this by commenting out the second set of warm-up calls (see also the @time sketch after this list).
    • EDIT: I believe this could be caused by separate compilation of the Int and Float32 specializations of sum.
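
One way to check that hypothesis is @time, which since Julia 1.6 reports how much of a call's time went into compilation. A minimal sketch, assuming f_bycol from the script above is already defined:

using DataFrames

dfi = DataFrame(:a => collect(1:100))
dff = DataFrame(:a => Float32.(collect(1:100)))

@time f_bycol(dfi)	# first call compiles f_bycol itself; @time prints a "% compilation time" note
@time f_bycol(dfi)	# second call: the compilation note should be gone
@time f_bycol(dff)	# same method instance, but if the dynamically dispatched
			# sum(::Vector{Float32}) still needs compiling, @time reports it here
@time f_bycol(dff)	# everything warm now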

Try profiling the allocations with the allocation profiler. That should tell you where exactly the allocations are happening. If they don’t happen in your code, write it off as measurement error, I guess.
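
Something along these lines (a sketch using the stdlib Profile.Allocs API, available since Julia 1.8; f_byrow and dff refer to the script above):

using Profile

f_byrow(dff)	# warm up first so compilation allocations don't show up
Profile.Allocs.clear()
Profile.Allocs.@profile sample_rate=1 f_byrow(dff)	# sample_rate=1 records every allocation
results = Profile.Allocs.fetch()

# print the size and type of each recorded allocation; each entry also carries a stack trace
for a in results.allocs
	println(a.size, "\t", a.type)
end

If you want to browse the stack traces, PProf.jl can render the fetched results as a flame graph.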

Yeah, this sounds vaguely similar to a type of measurement error I already observed:

So it seems that profiling any function that returns a Float64 value will report at least a single 16-byte allocation. But if the function returns Int (instead of Float64), no allocation is reported!
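
If you want to check whether that is what’s happening here, a minimal test along these lines (the function names are mine) reproduces it with the same allocation profiler:

using Profile

ret_int() = 1
ret_float() = 1.0
ret_int(); ret_float()	# warm up so compilation allocations are excluded

for f in (ret_int, ret_float)
	Profile.Allocs.clear()
	Profile.Allocs.@profile sample_rate=1 f()
	println(f, ": ", length(Profile.Allocs.fetch().allocs), " allocation(s) recorded")
end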
