cd $(mktemp -d)
git clone https://github.com/abelsiqueira/TulipaEnergyModel.jl .
git checkout mwe-discourse
julia --project
pkg> instantiate
julia> include("mwe.jl")
This will print instructions and the timings on the "Tiny" data:
Current best:
0.001040 seconds (13.03 k allocations: 1005.875 KiB)
Also decent:
0.001191 seconds (16.63 k allocations: 1.159 MiB)
Older strategy
0.003551 seconds (282.17 k allocations: 12.629 MiB)
Leftjoin strategy
0.023693 seconds (99.80 k allocations: 8.522 MiB, 84.62% gc time)
To change the data, search for input_dir in the file and comment out the line with the "EU" path. The output for me is:
Current best:
99.465536 seconds (4.59 M allocations: 2.132 GiB, 0.17% gc time)
Also decent:
100.750630 seconds (5.93 M allocations: 2.181 GiB, 0.44% gc time)
Older strategy
# Gave up after maybe 15 minutes
The leftjoin strategy simply kills my VSCode or my terminal after ~1 minute.
function add_incoming_term!(df_cons, df_flows)
    # time horizon spanned by both tables
    Tmin = min(minimum(first.(df_flows.time_block)), minimum(first.(df_cons.time_block)))
    Tmax = max(maximum(last.(df_flows.time_block)), maximum(last.(df_cons.time_block)))
    Tspan = Tmin:Tmax
    nT = length(Tspan)
    pre_incoming = Vector{AffExpr}(undef, nT)  # reused buffer: incoming flow per time step
    df_cons.incoming_term .= AffExpr(0.0)
    g_flows = groupby(df_flows, [:rp, :to])
    g_cons = groupby(df_cons, [:rp, :asset])
    for ((rp, to), sdf) in pairs(g_cons)
        haskey(g_flows, (; rp, to)) || continue
        pre_incoming .= AffExpr(0.0)
        # accumulate the flows arriving at this (rp, to) group, per time step
        for row in eachrow(g_flows[(; rp, to)]), t in row.time_block
            pre_incoming[t-Tmin+1] += row.flow * 3.14 * rp
        end
        # sum the accumulated terms over each constraint's time block
        for row in eachrow(sdf)
            row.incoming_term = sum(pre_incoming[t-Tmin+1] for t in row.time_block; init = AffExpr(0.0))
        end
    end
end
After playing around with the data a little, I think this zip(gd1, gd2) method will be more effective than a leftjoin.
One thing I didn't get about OP's data before looking at it is that the leftjoin is many-to-many, and therefore the output of leftjoin is huge. So zip-ing the two grouped data frames together is a very good idea; a toy illustration follows.
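A toy illustration of the blow-up (made-up sizes, not the OP's data): with m matching rows on one side and n on the other for a single key, the join materializes m*n rows.

using DataFrames
a = DataFrame(k = fill(1, 1_000), x = 1:1_000)
b = DataFrame(k = fill(1, 1_000), y = 1:1_000)
nrow(leftjoin(a, b, on = :k))  # 1_000_000 rows from two 1_000-row tables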
As a small piece of advice,
for row in eachrow(sdf)
is going to be slow (due to the type instability of data frames mentioned above), and it's better to replace this with a function taking in vectors directly.
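For illustration, a minimal sketch of such a function barrier (fill_incoming! is a made-up name): the columns are extracted once, so the loop body works on concretely typed vectors.

function fill_incoming!(incoming, time_blocks, pre_incoming, Tmin)
    for i in eachindex(incoming, time_blocks)
        incoming[i] = sum(pre_incoming[t-Tmin+1] for t in time_blocks[i]; init = AffExpr(0.0))
    end
end
# inside the loop over groups, instead of the eachrow loop:
# fill_incoming!(sdf.incoming_term, sdf.time_block, pre_incoming, Tmin)
# (sdf.incoming_term is a view into df_cons, so writes propagate back)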
Note that groupby(df, ...) is very optimized. You can improve this by making df_flows.rp etc. a PooledArray from PooledArrays.jl. This should make the groupby faster, but I don't think that's the bottleneck.
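A sketch of that conversion, assuming the key columns are plain vectors:

using PooledArrays
df_flows.rp = PooledArray(df_flows.rp)
df_flows.to = PooledArray(df_flows.to)
df_cons.rp = PooledArray(df_cons.rp)
df_cons.asset = PooledArray(df_cons.asset)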
df_cons.incoming_term .= AffExpr(0)
cons = sort(df_cons, [:rp, :asset])
flows = sort(df_flows, [:rp, :to])

function ss(frp, fto, crp, cass, ftb, ctb, flow, it)
    idr = [searchsorted(collect(zip(frp, fto)), (a, r)) for (a, r) in zip(crp, cass)]
    id = findall(!isempty, idr)
    for i in id
        len = [length(∩(ctb[i], fl)) * π for fl in ftb[idr[i]]]
        it[i] = len' * flow[idr[i]]
    end
end

ss(flows.rp, flows.to, cons.rp, cons.asset, flows.time_block, cons.time_block, flows.flow, cons.incoming_term)
@Dan, the function has an error because it overwrites the pre_incoming entries for different values of from. I changed it to the code below, and it appears to be right; I will have to check later. Overall this was blazingly fast: it went down from the previous best of 90 s to 0.9 s, a 100x speedup. Thanks a lot for the help.
Here is the modified version:
function add_incoming_term!(df_cons, df_flows)
    Tmin = min(minimum(first.(df_flows.time_block)), minimum(first.(df_cons.time_block)))
    Tmax = max(maximum(last.(df_flows.time_block)), maximum(last.(df_cons.time_block)))
    Tspan = Tmin:Tmax
    nT = length(Tspan)
    from_list = unique(df_flows.from)
    from_lookup = Dict(a => i for (i, a) in enumerate(from_list))
    nA = length(from_list)
    pre_incoming = Matrix{AffExpr}(undef, nT, nA)
    df_cons.incoming_term .= AffExpr(0.0)
    g_flows = groupby(df_flows, [:rp, :to])
    g_cons = groupby(df_cons, [:rp, :asset])
    for ((rp, to), sdf) in pairs(g_cons)
        haskey(g_flows, (; rp, to)) || continue
        pre_incoming .= AffExpr(0.0)
        for row in eachrow(g_flows[(; rp, to)]), t in row.time_block
            j = from_lookup[row.from]
            pre_incoming[t-Tmin+1, j] = row.flow * 3.14 * rp
        end
        for row in eachrow(sdf)
            row.incoming_term =
                sum(sum(pre_incoming[t-Tmin+1, :]) for t in row.time_block; init = AffExpr(0.0))
        end
    end
end
@rocco_sprmnt21, thanks for the suggestion. Very interesting, and very fast for the smaller cases. But for the really big case it slows down to 9 s, vs. 0.9 s for the solution supplied by @Dan. It also allocates 35 GiB according to @time.
I made a minor modification to the call itself. Here is the code:
function rocco_strategy(df_cons, df_flows)
    df_cons.incoming_term_rocco .= AffExpr(0)
    # sort!(df_cons, [:rp, :asset])
    # sort!(df_flows, [:rp, :to])
    function cons_inc_term!(frp, fto, crp, cass, ftb, ctb, flow, it)
        rp_to = collect(zip(frp, fto))
        idr = [searchsorted(rp_to, (a, r)) for (a, r) in zip(crp, cass)]
        id = findall(!isempty, idr)
        for i in id
            len = [length(∩(ctb[i], fl)) * 3.14 * crp[i] for fl in ftb[idr[i]]]
            it[i] = len' * flow[idr[i]]
        end
    end
    cons_inc_term!(
        df_flows.rp,
        df_flows.to,
        df_cons.rp,
        df_cons.asset,
        df_flows.time_block,
        df_cons.time_block,
        df_flows.flow,
        df_cons.incoming_term_rocco,
    )
end
The comparison:
Dan strategy
0.386765 seconds (6.13 M allocations: 379.483 MiB, 24.98% gc time)
Rocco strategy
7.545171 seconds (551.91 k allocations: 35.770 GiB, 3.86% gc time)
Since the performance evaluation depends on the size of the data, could you provide dummy data of the appropriate size?
I'll add a couple of observations on my proposal:
It would be useful to have a searchsorted() method that works on an iterator like zip() without having to do collect(). Does anyone have an idea how the collect could be avoided? (One possibility is sketched below.)
As regards the sorting of the two dataframes, only one sort is necessary for searchsorted: the one on the table in which the searches are carried out.
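One possible way to avoid the collect, sketched here with a hypothetical PairVec type (not an existing library API): a lazy AbstractVector of tuples that searchsorted can binary-search without materializing anything.

struct PairVec{A,B} <: AbstractVector{Tuple{A,B}}
    a::Vector{A}
    b::Vector{B}
end
Base.size(v::PairVec) = size(v.a)
Base.getindex(v::PairVec, i::Int) = (v.a[i], v.b[i])

# assuming flows is already sorted by [:rp, :to] and the columns are plain Vectors:
rp_to = PairVec(flows.rp, flows.to)              # no tuples allocated up front
searchsorted(rp_to, (cons.rp[1], cons.asset[1]))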
This version requires and exploits (I don't know how usefully, in the end) the ordering of both dataframes:
function sorted_cons_inc_term!(frp, fto, crp, cass, ftb, ctb, flow, it)
    rp_to = tuple.(frp, fto)
    base = 0  # flows before this index can no longer match (both tables sorted)
    for i in eachindex(crp)
        key = (crp[i], cass[i])
        # searchsorted* return indices relative to the view, so shift by `base`
        f = base + searchsortedfirst(@view(rp_to[base+1:end]), key)
        l = base + searchsortedlast(@view(rp_to[base+1:end]), key)
        if f <= l
            len = [length(∩(ctb[i], fl)) * π * crp[i] for fl in @view ftb[f:l]]
            it[i] = len' * @view flow[f:l]
            base = f - 1  # keys are non-decreasing, so later searches can start here
        end
    end
end
Unfortunately I can't easily use these instructions.
cd $(mktemp -d)
git clone https://github.com/abelsiqueira/TulipaEnergyModel.jl .
git checkout mwe-discourse
julia --project
pkg> instantiate
julia> include("mwe.jl")
Should they be used in the shell or in the REPL?
I have a PC with a Windows 10 (11?) system; is that OK?
Indeed, the function had an error for a while with overwriting in pre_incoming; I then changed pre_incoming = to pre_incoming += in the code, which seems to achieve the same as the change you suggested. Maybe you tested my code before it had the +=. If so, the += method should be a little faster than the matrix change you suggested, so you can check it out.
(The change is in my post; I just edited it in at some point yesterday.)
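For reference, the one-line change in the inner loop:

pre_incoming[t-Tmin+1] = row.flow * 3.14 * rp   # before: overwrote other `from` contributions
pre_incoming[t-Tmin+1] += row.flow * 3.14 * rp  # after: accumulates them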
No, they assume Linux. But you can also clone directly from VSCode: https://github.com/abelsiqueira/TulipaEnergyModel.jl, and then switch to the mwe-discourse branch.
mwe.jl is the relevant file, and input_dir defines which data is being used.
Indeed, I have the old one - I check the e-mail first, so I might miss the edits.
I will test this change after I implement the problem in the bigger picture. Summing these AffExpr terms individually is slow, so the allocation might be warranted. Let's see.
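If the individual sums do become the bottleneck, a sketch of in-place accumulation with JuMP's add_to_expression! (assuming the entries are JuMP AffExprs, as in the MWE):

acc = AffExpr(0.0)
for t in row.time_block
    add_to_expression!(acc, pre_incoming[t-Tmin+1])  # mutates acc; no allocation per `+`
end
row.incoming_term = acc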
This expression allocates because of slicing pre_incoming instead of using a view. Also, the initial value is probably spurious as the whole array is zeroed. So this should do it:
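# slice replaced by a view; init dropped, since the whole matrix is zeroed anyway
row.incoming_term =
    sum(sum(view(pre_incoming, t-Tmin+1, :)) for t in row.time_block)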
And a last optimization to check would be to switch the axes of the matrix, as summing along rows vs. columns can vary in speed (but this might be negligible here).
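In the context of add_incoming_term! above, a sketch of that axis switch (Julia arrays are column-major, so the summed dimension becomes the first axis):

pre_incoming = Matrix{AffExpr}(undef, nA, nT)  # axes swapped
# fill with pre_incoming[j, t-Tmin+1] = ... instead of pre_incoming[t-Tmin+1, j] = ...
row.incoming_term = sum(sum(view(pre_incoming, :, t-Tmin+1)) for t in row.time_block)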
I hadn't read the details of the problem carefully enough and didn't take advantage of the specifics.
I finally managed to create some tables from the repository clone and did some testing on these.
The scripts I used require the tables to be sorted by the :tb column as well as :rp and :asset.
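For concreteness, the sort I mean (a sketch, assuming the key columns used earlier in the thread):

sort!(df_cons, [:rp, :asset, :tb])
sort!(df_flows, [:rp, :to, :tb])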
I'm not 100% sure I built the same tables you used in the benchmark.
This is the measurement I found, anyway:
julia> @btime inc_term!(dfj,gc,gf)
209.512 ms (3394343 allocations: 218.71 MiB)
PS
I saved the tables created in the repository as CSV. When rereading them, using a DataFrame as a sink, the ranges come back as String15, so I had to rebuild the UnitRange type by hand.
I'm wondering if anyone knows a way to do this using built-in functions.
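For what it's worth, the by-hand rebuild can be a one-liner, assuming the ranges round-trip as strings like "5:12" (parse_range is a made-up helper):

parse_range(s::AbstractString) = UnitRange(parse.(Int, split(s, ':'))...)
df.time_block = parse_range.(df.time_block)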
I haven't checked your latest solution, @rocco_sprmnt21, so I don't know how it compares. It is not easy to test it with the current code, so I will have to park it for now.
With the improvements that we made, we decided that it is good enough for now, and moved on to finish some other more pressing objectives. We will come back to performance later, so I might revisit this in the future.