To improve performance and make the code a little cleaner, here is a refactoring of `calc_out!` that avoids the `findfirst` calls:
"""
    calc_out2!(df)

Add (or overwrite) the column `out` in `df` and return `df`.

The rows of `df` are treated as edges of a bipartite graph: the distinct values of
`col1` form one set of nodes and the distinct values of `col2` the other. Each row
receives the value `col1 * col2` taken from the last row (highest row number) that
belongs to the same connected component as the row itself.

`df` must have columns `:col1` and `:col2` whose values support `*` (e.g. strings).
Missing values are not handled.
"""
function calc_out2!(df)
    dfc = select(df, [:col1, :col2] .=> categorical; renamecols=false)
    # Integer level codes double as node ids: col1 levels occupy 1:r and
    # col2 levels are shifted to r+1:r+c.
    src = levelcode.(dfc.col1)
    dst = levelcode.(dfc.col2)
    r, c = length(levels(dfc.col1)), length(levels(dfc.col2))

    # prepare graph, remembering for every node the last row it occurs in
    nodelastrow = zeros(Int, r + c)
    g = SimpleGraph(r + c)
    for (n, (o, d)) in enumerate(zip(src, dst))
        nodelastrow[o] = nodelastrow[r + d] = n
        add_edge!(g, o, r + d)
    end

    # heavy lifting
    cc = connected_components(g)
    # A level that never occurs in the data (possible when the input columns are
    # already categorical) is an isolated node that no row refers to. Its last-row
    # entry is still 0, which is not a valid row index, so such components are dropped.
    filter!(comp -> nodelastrow[first(comp)] != 0, cc)
    complastrow = [maximum(view(nodelastrow, comp)) for comp in cc]
    labels = unwrap.(dfc.col1[complastrow]) .* unwrap.(dfc.col2[complastrow])

    # map every node that occurs in the data to the index of its component
    nodetoc = zeros(Int, r + c)
    for (i, comp) in pairs(cc)
        nodetoc[comp] .= i
    end

    df.out = [labels[nodetoc[o]] for o in src]
    return df
end
This gives the same result as the previous version.