Help with performance tuning this dataframe aggregation

I’ve been working on an application and have narrowed down the piece that appears to be taking the most time.

In the following code, it's the part that generates the DataFrames d, e, and f:

using DataFrames, Statistics, BenchmarkTools

l = 10000
data = DataFrame([Int64, Int64, Int64, Int64, Int64],[:x,:y,:z,:m,:n],l)
data.x = rand(1:100,l)
data.y = rand(1:100,l)
data.z = rand(0:1, l)

function looptest(data,loopsize)
	summary = DataFrame([Int64, Int64, Float64],[:cfg, :count, :ave],0)
	for i in 1:loopsize
		for j in 1:loopsize
			cfg = i * j
			data.m = convert.(Int64, floor.(data.x ./ i))
			data.n = convert.(Int64, floor.(data.y ./ j))
			d = by(data, [:m, :n], df -> minimum(df[:z]))
			e = by(data, [:m, :n], nrow)
			f = join(d,e[e.x1 .== cfg,[:m,:n]],on = [:m,:n])
			push!(summary,[cfg size(f,1) mean(f.x1)])
		end
	end 
	return summary
end

julia> @btime looptest(data,4)
  562.855 ms (7902541 allocations: 362.75 MiB)

The rest of the code takes about 5 ms, but this loop accounts for the majority of the runtime. Is there something here I can do to improve its performance?

I was thinking there should at least be a way to generate f without the interim e table, but I couldn't figure that out.

Thanks.

You can use a DataFrame constructor inside the function you pass to by. That way you compute both aggregations in a single grouping pass instead of two.

p = by(data, [:m, :n], d -> DataFrame(a = minimum(d[:z]), nrow = nrow(d)))

Ultimately, it's the join that's costing you. I would suggest computing both aggregations at once with the constructor above, then subsetting based on the :nrow variable. That should help performance, though a DataFramesMeta approach might be easier and faster.
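
For example, with the p from above, something like this (a sketch, not tested):

sub = p[p.nrow .== cfg, :]   # keep only the groups whose size equals cfg
push!(summary, [cfg size(sub, 1) mean(sub.a)])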

I would also suggest pre-allocating your summary DataFrame, since you know the loopsize. That way the underlying arrays have to resize less frequently. However, the way DataFrame's setindex! works is a bit odd at the moment, especially with regard to rows. See this issue.
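
Something along these lines (a sketch using the constructor syntax from your post; the row-index arithmetic is my own):

# Sketch: pre-allocate all loopsize^2 rows up front so the columns never grow.
function prealloc_summary(loopsize)
	summary = DataFrame([Int64, Int64, Float64], [:cfg, :count, :ave], loopsize^2)
	for i in 1:loopsize, j in 1:loopsize
		row = (i - 1) * loopsize + j   # linear row index for the (i, j) pair
		summary[row, :cfg] = i * j
		summary[row, :count] = 0       # placeholder, fill with the real count
		summary[row, :ave] = 0.0       # placeholder, fill with the real mean
	end
	return summary
end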

Thanks for the help, I couldn’t figure out how to simultaneously get the minimum and the nrow count.

But even so, it’s only slightly better.

l = 10000
data = DataFrame([Int64, Int64, Int64, Int64, Int64],[:x,:y,:z,:m,:n],l)
data.x = rand(1:100,l)
data.y = rand(1:100,l)
data.z = rand(0:1, l)

function looptest(data,loopsize)
	summary = DataFrame([Int64, Int64, Float64],[:cfg, :count, :ave],0)
	for i in 1:loopsize
		for j in 1:loopsize
			cfg = i * j
			data.m = convert.(Int64, floor.(data.x ./ i))
			data.n = convert.(Int64, floor.(data.y ./ j))
			d = by(data, [:m, :n], df -> minimum(df[:z]))
			e = by(data, [:m, :n], nrow)
			f = join(d,e[e.x1 .== cfg,[:m,:n]],on = [:m,:n])
			push!(summary,[cfg size(f,1) mean(f.x1)])
		end
	end 
	return summary
end

function looptest2(data,loopsize)
	summary = DataFrame([Int64, Int64, Float64],[:cfg, :count, :ave],0)
	for i in 1:loopsize
		for j in 1:loopsize
			cfg = i * j
			data.m = convert.(Int64, floor.(data.x ./ i))
			data.n = convert.(Int64, floor.(data.y ./ j))
			p = by(data, [:m, :n], d -> DataFrame(a = minimum(d[:z]), nrow = nrow(d)))
			push!(summary,[cfg sum(p.nrow .== cfg) mean(p.a[p.nrow .== cfg])])
		end
	end 
	return summary
end

julia> @btime looptest(data,4)
  611.216 ms (7906012 allocations: 362.80 MiB)

julia> @btime looptest2(data,4)
  483.848 ms (4476416 allocations: 208.60 MiB)

DataFramesMeta's grouped operations are more specialized than those in DataFrames. See this PR. So that might be a better solution.

I would try pre-allocating the summary DataFrame and then doing

for k in 1:ncol(summary)
    summary[(i - 1) * loopsize + j, k] = ...
end

instead of push!

The push! has almost no effect. I tested this before I posted.

Here is the loop without the push!. Almost no difference:

function looptest3(data,loopsize)
	for i in 1:loopsize
		for j in 1:loopsize
			cfg = i * j
			data.m = convert.(Int64, floor.(data.x ./ i))
			data.n = convert.(Int64, floor.(data.y ./ j))
			p = by(data, [:m, :n], d -> DataFrame(a = minimum(d[:z]), nrow = nrow(d)))
		end
	end 
end

julia> @btime looptest3(data,4)
  781.232 ms (4476093 allocations: 208.43 MiB)

And here are the two operations by themselves

julia> @btime let
           push!(summary,[cfg size(f,1) mean(f.x1)])
       end
  611.815 ns (9 allocations: 288 bytes)

julia> @btime let
           p = by(data, [:m, :n], d -> DataFrame(a = minimum(d[:z]), nrow = nrow(d)))
       end
  11.769 ms (137683 allocations: 5.42 MiB)


I’m getting around the same benchmarks for the DataFramesMeta approach.

julia> using DataFramesMeta

julia> function foo(data)
       @linq data |>
       groupby([:m, :n])  |>
       based_on(a = minimum(:z), nrow = length(:z));
       end

It's worth noting that the grouping on its own is about half the time:


julia> function  goo(df)
       groupby(df, [:m, :n])
       end

julia> @btime goo($data);
  3.612 ms (87501 allocations: 2.46 MiB)

julia> @btime foo($data);
  7.896 ms (114904 allocations: 4.35 MiB)

julia> @btime by($data, [:m, :n], d -> DataFrame(a = minimum(d[:z]), nrow = nrow(d)));
  8.541 ms (115784 allocations: 4.37 MiB)

You could probably improve performance by leaving DataFrames, but at the cost of more complicated code. Sorry I couldn't be of more help.
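
For what it's worth, "leaving DataFrames" could look something like this: group the raw vectors with a Dict in a single pass (a sketch; grouped_min_count is a name I made up):

# Sketch: one pass over the raw columns, tracking (min, count) per (m, n) group.
function grouped_min_count(m::Vector{Int}, n::Vector{Int}, z::Vector{Int})
	acc = Dict{Tuple{Int,Int},Tuple{Int,Int}}()   # (m, n) => (min, count)
	for k in eachindex(z)
		key = (m[k], n[k])
		mn, cnt = get(acc, key, (typemax(Int), 0))
		acc[key] = (min(mn, z[k]), cnt + 1)
	end
	return acc
end

You would then filter the accumulator for entries whose count equals cfg and average their minima.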

It's worth noting that grouped DataFrame operations are difficult everywhere. In Stata they are very slow given what you'd expect from the hand-optimized Mata they use for everything.

Thanks for the help. I tried a quick test using a simple array but it was actually worse for some reason. Anyways, it’s very helpful to know I wasn’t doing something fundamentally wrong. That’s important information for a newbie like me. It means I’m making progress!

function looptest5(data,loopsize)
	summary = DataFrame([Int64, Int64, Float64],[:cfg, :count, :ave], loopsize^2)
	summarycounter = 0
	for i in 1:loopsize
		for j in 1:loopsize
			cfg = i * j
			data.m = convert.(Int64, floor.(data.x ./ i))
			data.n = convert.(Int64, floor.(data.y ./ j))
			temp = Array{Int64}(undef,size(data,1))
			local counter = 0
			for a in unique(data.m)
				for b in unique(data[data.m .== a,:n])
					results = data[(data.m .== a) .& (data.n .== b),:z]
					if size(results,1) == cfg
						counter += 1
						temp[counter] = minimum(results)
					end
				end 
			end
			summarycounter += 1
			summary[summarycounter,:cfg] = cfg
			summary[summarycounter,:count] = counter
			summary[summarycounter,:ave] = mean(temp[1:counter])
		end
	end 
	return summary
end

julia> @btime looptest5(data,4)
  1.127 s (529235 allocations: 235.43 MiB)

Hi,

I think you are making this a little bit too complicated. In particular, the inner loops over the unique values of m and n aren't necessary.

Using Query.jl, the following solution is much faster:

using Query

function looptest7(data,loopsize)
	summary = DataFrame([Int64, Int64, Float64],[:cfg, :count, :ave],0)
	for i in 1:loopsize
		data.m .= fld.(data.x, i)
		for j in 1:loopsize
			cfg = i * j
			data.n .= fld.(data.y, j)

			x = @from i in data begin
				@group i.z by {i.m, i.n} into g
				@where length(g) == cfg
				@select minimum(g)
				@collect
			end

			push!(summary, [cfg length(x) mean(x)])
		end
	end
	return summary
end

julia> @btime looptest7(data,4)
  42.480 ms (184858 allocations: 13.03 MiB)

The looptest6 I didn't share was similar to looptest5, but without using Query.jl and only using groupby. looptest7 is twice as fast because groupby on DataFrames isn't type-stable (see: Type of groupby(df,id) elements are Any).
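
A common workaround for that kind of instability is a function barrier: hand each group to a small kernel so the hot work runs on concrete types (a sketch; process_group is a made-up name):

# Sketch: the loop over groups may be type-unstable, but each kernel call is
# dispatched once per group and then runs type-stable code inside.
process_group(z::AbstractVector) = (minimum(z), length(z))

function summarize(gd)
	out = Tuple{Int,Int}[]
	for g in gd
		push!(out, process_group(g[:z]))
	end
	return out
end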

You could try this PR and have your function return a named tuple instead of a data frame. Not sure whether it will be merged for the next release.
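
If it lands, the call might look something like this (a sketch, assuming by keeps its current signature):

# Sketch: return a NamedTuple per group instead of building a mini-DataFrame,
# avoiding one DataFrame construction per group.
p = by(data, [:m, :n], d -> (a = minimum(d[:z]), nrow = nrow(d)))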

Thanks @laborg . That made a huge difference. I guess I’ll have to investigate the Query syntax more. So much to learn!