Grouping by values in either of two columns

To improve performance and make the code a little cleaner, here is a refactoring of `calc_out!` that avoids the `findfirst` calls:

"""
    calc_out2!(df)

Add an `out` column to `df` that labels every row with the group it belongs to, and
return `df`.

Rows are grouped through a bipartite graph whose nodes are the distinct values (levels)
of `col1` and of `col2`; each row contributes an edge between its `col1` value and its
`col2` value. Rows whose values end up in the same connected component share a label.
The label of a component is the `col1` value concatenated (with `*`) to the `col2` value
of the last row (highest row index) touching that component, so the values must support
`*` (e.g. strings).

Levels that occur in no row (possible when the input columns are already categorical)
are ignored. Missing values are not supported — NOTE(review): an input containing
`missing` fails when used as an index; confirm whether callers need that case.
"""
function calc_out2!(df)
    col1 = categorical(df.col1)
    col2 = categorical(df.col2)
    # `levelcode` is the public accessor for a value's position in `levels(...)`.
    codes1 = levelcode.(col1)
    codes2 = levelcode.(col2)
    n1, n2 = length(levels(col1)), length(levels(col2))

    # Bipartite graph: nodes 1:n1 are the col1 levels, nodes n1+1:n1+n2 the col2 levels.
    # `lastrow[v]` records the index of the last row in which node `v` appears.
    lastrow = zeros(Int, n1 + n2)
    g = SimpleGraph(n1 + n2)
    for (i, (a, b)) in enumerate(zip(codes1, codes2))
        lastrow[a] = lastrow[n1 + b] = i
        add_edge!(g, a, n1 + b)
    end

    # Heavy lifting. A component that contains an edge has `lastrow > 0` on all of its
    # nodes; a singleton made of an unused level has `lastrow == 0` and is dropped so it
    # cannot produce an out-of-bounds row index below.
    comps = filter!(nodes -> lastrow[first(nodes)] > 0, connected_components(g))

    # One representative row (the last one) and one label per component.
    reprows = Int[maximum(view(lastrow, nodes)) for nodes in comps]
    labels = unwrap.(col1[reprows]) .* unwrap.(col2[reprows])

    # Map every node to the index of its component; unused levels stay at 0 and are
    # never looked up because only codes that occur in a row are queried.
    compof = zeros(Int, n1 + n2)
    for (k, nodes) in enumerate(comps)
        compof[nodes] .= k
    end

    df.out = [labels[compof[a]] for a in codes1]
    return df
end

This gives the same result as the previous version.

1 Like