I’ve been working on an application and have narrowed down the piece that appears to be taking the most time.
in this following code, it’s generating dataframes d,e,f
l = 10000
data = DataFrame([Int64, Int64, Int64, Int64, Int64],[:x,:y,:z,:m,:n],l)
data.x = rand(1:100,l)
data.y = rand(1:100,l)
data.z = rand(0:1, l)
function looptest(data,loopsize)
summary = DataFrame([Int64, Int64, Float64],[:cfg, :count, :ave],0)
for i in 1:loopsize
for j in 1:loopsize
cfg = i * j
data.m = convert.(Int64, floor.(data.x ./ i))
data.n = convert.(Int64, floor.(data.y ./ j))
d = by(data, [:m, :n], df -> minimum(df[:z]))
e = by(data, [:m, :n], nrow)
f = join(d,e[e.x1 .== cfg,[:m,:n]],on = [:m,:n])
push!(summary,[cfg size(f,1) mean(f.x1)])
end
end
return summary
end
julia> @btime looptest(data,4)
562.855 ms (7902541 allocations: 362.75 MiB)
The rest of the code takes about 5ms, but this loop accunts for the majority. Is there something here I can do to improve this performance?
I was thinking at least there should be a way to generate f without the interim e table, but couldn’t figure that out.
Thanks.