# Grouping by values in either of two columns

Hi,

I wonder if there is an efficient way of doing the following in Julia DataFrames.
Given a DataFrame

```
df = DataFrame(col1 = ["a", "a", "b", "b", "c", "d"],
               col2 = ["p", "q", "q", "r", "s", "t"])
```

I want to create a new column col3 as follows

```
df_new = DataFrame(col1 = ["a", "a", "b", "b", "c", "d"],
                   col2 = ["p", "q", "q", "r", "s", "t"],
                   col3 = ["br", "br", "br", "br", "cs", "dt"])
```

The values of col3 are computed as follows:

1. Rows are grouped so that each row shares a value in col1 or in col2 with at least one other row of its group (taking the relation transitively).
2. For each such group, the value of col3 is `:col1 * :col2` of the group's last row.

In the above example, if we group the rows w.r.t. col1 only and apply rule 2, col3 is given by

```
col3 = ["aq", "aq", "br", "br", "cs", "dt"]
```

whereas grouping in terms of col2 leads to

```
col3 = ["ap", "bq", "bq", "br", "cs", "dt"]
```

I want to take the union of the two relations above (i.e., grouping w.r.t. col1 or col2). Schematically, we get
(1st ~ 2nd and 3rd ~ 4th) + (2nd ~ 3rd) = (1st ~ 2nd ~ 3rd ~ 4th) ,
so that col3 becomes

```
col3 = ["br", "br", "br", "br", "cs", "dt"]
```

This is a non-standard grouping condition, so you will have to write a function that identifies the groups yourself. That should be doable efficiently and relatively simply.

The condition feels like it boils down to finding connected components in a graph where `col1` and `col2` hold the edge origins and destinations.
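As a minimal Base-only illustration of that reduction (a hand-rolled union-find over the distinct values of both columns; all names here are illustrative, not a fixed API):

```julia
# Rows are edges between a col1-value and a col2-value; the groups are the
# connected components, found here with a tiny union-find.
col1 = ["a", "a", "b", "b", "c", "d"]
col2 = ["p", "q", "q", "r", "s", "t"]

# Give every distinct (column, value) pair an integer node id.
node = Dict{Tuple{Int,String},Int}()
for (k, col) in enumerate((col1, col2)), v in col
    get!(node, (k, v), length(node) + 1)
end

parent = collect(1:length(node))
find(i) = parent[i] == i ? i : (parent[i] = find(parent[i]))
unite(i, j) = (parent[find(i)] = find(j))

# Each row merges the component of its col1 value with that of its col2 value.
for r in eachindex(col1)
    unite(node[(1, col1[r])], node[(2, col2[r])])
end

# Record the last row seen in each component, then build col3 from it.
lastrow = Dict{Int,Int}()
for r in eachindex(col1)
    lastrow[find(node[(1, col1[r])])] = r
end
col3 = [begin
            m = lastrow[find(node[(1, col1[r])])]
            col1[m] * col2[m]
        end for r in eachindex(col1)]
```

This reproduces the expected `col3` of the example without any package dependency; with DataFrames it could be wrapped as a `transform` source function.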

Itβs a bit involved, but gets to the right answer:

```
using LightGraphs, DataFrames, CategoricalArrays

df = DataFrame(col1 = ["a", "a", "b", "b", "c", "d"],
               col2 = ["p", "q", "q", "r", "s", "t"])

function calc_out!(df)
    dfc = select(df, [:col1, :col2] .=> categorical; renamecols=false)
    dfc.n = rownumber.(eachrow(dfc))
    r, c = length.(levels.((dfc.col1, dfc.col2)))
    g = SimpleGraph(r + c)
    for row in eachrow(dfc)
        # col1 levels are vertices 1:r, col2 levels are vertices r+1:r+c
        add_edge!(g, row.col1.ref, r + row.col2.ref)
    end
    cc = connected_components(g)
    lastrow = zeros(Int, length(cc))
    foreach(eachrow(dfc)) do row
        lastrow[findfirst(∈(row.col1.ref), cc)] = row.n
        lastrow[findfirst(∈(r + row.col2.ref), cc)] = row.n
    end
    df.out = map(eachrow(dfc)) do row
        ff = findfirst(∈(row.col1.ref), cc)
        unwrap(dfc.col1[lastrow[ff]]) * unwrap(dfc.col2[lastrow[ff]])
    end
    df
end
```

And with this function:

```
julia> calc_out!(df)
6×3 DataFrame
 Row │ col1    col2    out
     │ String  String  String
─────┼────────────────────────
   1 │ a       p       br
   2 │ a       q       br
   3 │ b       q       br
   4 │ b       r       br
   5 │ c       s       cs
   6 │ d       t       dt
```

The criterion seems to be of dubious associativity.
Limiting ourselves to the proposed example, the `IterTools.groupby` function seems to do the trick (possibly after sorting the table rows):

```
using IterTools

TS = tuple.(df.col1, df.col2)

IterTools.groupby(let s = [TS[1]]; c -> s = !isempty(intersect(c, last(s))) ? push!(s, c) : [c] end, TS) |> collect
```
```
using DataFrames, IterTools

df = DataFrame(col1 = ["a", "a", "b", "b", "c", "d", "t"],
               col2 = ["p", "q", "q", "r", "s", "t", "u"])

julia> function grpgrph(c1, c2)
           c12 = c1 .* c2
           dgrp = IterTools.groupby(let s = [c12[1]]; c -> s = !isempty(intersect(c, last(s))) ? push!(s, c) : [c] end, c12)
           vcat([accumulate(coalesce, reverse(v)) for v in dgrp]...)
       end
grpgrph (generic function with 1 method)

julia> transform(df, [:col1, :col2] => grpgrph => :c1c2)
7×3 DataFrame
 Row │ col1    col2    c1c2
     │ String  String  String
─────┼────────────────────────
   1 │ a       p       br
   2 │ a       q       br
   3 │ b       q       br
   4 │ b       r       br
   5 │ c       s       cs
   6 │ d       t       tu
   7 │ t       u       tu
```

In the case of many subgroups:

```
function grpgrph(c1, c2)
    c12 = c1 .* c2
    dgrp = IterTools.groupby(let s = [c12[1]]; c -> s = !isempty(intersect(c, last(s))) ? push!(s, c) : [c] end, c12)
    reduce(vcat, accumulate(coalesce, reverse(v)) for v in dgrp)
end
```

I thank you all for the feedback. While taking a closer look at all the replies (sorry, I am relatively new to Julia…), I edited my post a bit to explain the example better.

Thanks a lot for your function.
Could you please tell me what `ref` means in `row.col1.ref`? Where should I look to learn more about it?

I am currently checking whether this works in cases where the number of vertices equals or approaches the `typemax` of the underlying graph element type (e.g., a `SimpleGraph{Int8}` with 127 vertices).

Thanks a lot for your work. Indeed, the relation linking the rows looks bizarre.
As far as I understand your function, the result seems to depend on which column the rows are sorted by. I understand that you showed a limiting example.

The `ref` field contains the category number associated with the value in that row's cell (as a categorical value). This number is used as the vertex number in the generated graph.
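For intuition, the same integer coding can be emulated in plain Julia (the names below are illustrative only; in CategoricalArrays itself the documented accessor for this number is `levelcode`, while `.ref` reaches into the internal representation):

```julia
vals = ["b", "a", "b"]
lv = sort(unique(vals))                          # plays the role of levels(categorical(vals))
code = Dict(v => i for (i, v) in enumerate(lv))  # value -> category number
refs = [code[v] for v in vals]                   # analogous to categorical(vals).refs
```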


The relationship between the rows is now clear. What is still missing, in order to make a specifically suitable proposal, is the input "domain".
Try this version.
It should "work" for cases like the following:

```
julia> df
9×2 DataFrame
 Row │ col1    col2
     │ String  String
─────┼────────────────
   1 │ a       p
   2 │ a       q
   3 │ b       q
   4 │ b       r
   5 │ c       s
   6 │ d       t
   7 │ t       u
   8 │ c       q
   9 │ d       v

julia> transform(df, [:col1, :col2] => grpgrph1 => :c1c2)
9×3 DataFrame
 Row │ col1    col2    c1c2
     │ String  String  String
─────┼────────────────────────
   1 │ a       p       cq
   2 │ a       q       cq
   3 │ b       q       cq
   4 │ b       r       cq
   5 │ c       s       cq
   6 │ d       t       dv
   7 │ t       u       dv
   8 │ c       q       cq
   9 │ d       v       dv
```
```
function grpgrph1(c1, c2)
    v = Vector{Set{String}}()
    rows = Vector{Set{Int}}()
    push!(v, Set([c1[1], c2[1]]))   # seed with the first row's values, not the global df
    push!(rows, Set([1]))
    c3 = similar(c1)
    for (i, t) in enumerate(zip(c1, c2))
        fl = true
        for j in eachindex(v)
            if !isempty(intersect(v[j], t))
                push!(v[j], t...)
                push!(rows[j], i)
                fl = false
            end
        end
        if fl
            push!(v, Set(t))
            push!(rows, Set([i]))
        end
    end
    for i in eachindex(rows)
        m = maximum(rows[i])
        for r in rows[i]
            c3[r] = c1[m] * c2[m]
        end
    end
    c3
end
```

If itβs right for you and of interest, I can illustrate the idea.
You can probably choose a more suitable data structure, but you can try to think about this later if itβs worth it.


In order to improve performance and make the code a little cleaner, below is a refactoring of `calc_out!` with no `findfirst` calls:

```
function calc_out2!(df)
    dfc = select(df, [:col1, :col2] .=> categorical; renamecols=false)
    dfc.n = rownumber.(eachrow(dfc))

    # prepare graph
    r, c = length.(levels.((dfc.col1, dfc.col2)))
    nodelastrow = zeros(Int, r + c)
    g = SimpleGraph(r + c)
    for row in eachrow(dfc)
        o, d = row.col1.ref, row.col2.ref
        add_edge!(g, o, r + d)          # link the row's col1 level to its col2 level
        nodelastrow[o] = nodelastrow[r+d] = row.n
    end
    # heavy lifting
    cc = connected_components(g)

    edgelastrow = maximum.(getindex.(Ref(nodelastrow), cc))
    edgestrings = unwrap.(dfc.col1[edgelastrow]) .* unwrap.(dfc.col2[edgelastrow])
    nodetoc = foldl((r, i) -> (r[cc[i]] .= i; r), eachindex(cc); init=Vector{Int}(undef, r + c))
    df.out = [edgestrings[nodetoc[t.ref]] for t in dfc.col1]
    df
end
```
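The `foldl` that builds `nodetoc` is just an inverted index from vertex number to component number; written as a plain loop over toy components it would read:

```julia
cc = [[1, 3], [2]]                  # toy connected components over vertices 1:3
nodetoc = Vector{Int}(undef, 3)
for (i, comp) in enumerate(cc)
    nodetoc[comp] .= i              # every vertex of component i maps to i
end
```

Afterwards `nodetoc[v]` gives the component of vertex `v` in O(1), which is what replaces the `findfirst` calls.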

The same result as the previous version.


A refactoring of the function and a question:

Refact
```
function grpgrph1(c1, c2)
    v = Vector{Set{String}}()
    rows = Vector{Vector{Int}}()
    push!(v, Set((c1[1], c2[1])))   # seed with the first row's values, not the global df
    push!(rows, [1])
    c3 = similar(c1)
    for (r, (t1, t2)) in enumerate(zip(c1, c2))
        fl = true
        for j in eachindex(v)
            if t1 in v[j] || t2 in v[j]
                push!(v[j], t1, t2)
                push!(rows[j], r)
                fl = false
            end
        end
        if fl
            push!(v, Set((t1, t2)))
            push!(rows, [r])
        end
    end
    for i in eachindex(rows)
        m = maximum(rows[i])
        for r in rows[i]
            c3[r] = c1[m] * c2[m]
        end
    end
    df.c3 = c3
    df
end
```

To determine the rows to group, is it enough to verify that either of the two elements of a pair occurs anywhere in a pair of the group, or should first elements be compared only with first elements and second elements only with second elements?

In the second case the function would have to be changed like this:

```
function grpgrph2(c1, c2)
    o = Vector{Set{String}}()
    d = Vector{Set{String}}()
    rows = Vector{Set{Int}}()
    push!(o, Set(Ref(c1[1])))   # seed with the first row's values, not the global df
    push!(d, Set(Ref(c2[1])))
    push!(rows, Set([1]))
    c3 = similar(c1)
    function isconnected(o, d, rows, or, dr, r)
        fl = false
        for j in eachindex(o)
            # if or in o[j] || dr in d[j]
            #     push!(o[j], or)
            #     push!(d[j], dr)
            #     push!(rows[j], r)
            #     fl = true
            # end
            if or in o[j]
                push!(d[j], dr)
                push!(rows[j], r)
                fl = true
            elseif dr in d[j]
                push!(o[j], or)
                push!(rows[j], r)
                fl = true
            end
        end
        fl
    end
    for (r, (or, dr)) in enumerate(zip(c1, c2))
        if !isconnected(o, d, rows, or, dr, r)
            push!(o, Set(Ref(or)))
            push!(d, Set(Ref(dr)))
            push!(rows, Set([r]))
        end
    end
    for i in eachindex(rows)
        m = maximum(rows[i])
        for r in rows[i]
            c3[r] = c1[m] * c2[m]
        end
    end
    df.c3 = c3
    df
end
```

But if, in the data to be processed, no value ever appears in both columns, then the first version, which is more concise and efficient, is also fine.
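To see the difference between the two criteria, consider the ambiguous pair from the extended example, where `"t"` occurs in col2 of one row and col1 of another (plain tuples, no DataFrame needed):

```julia
row1, row2 = ("d", "t"), ("t", "u")
# either-element criterion: any shared value links the rows
either = !isempty(intersect(Set(row1), Set(row2)))
# column-wise criterion: only same-column matches link the rows
colwise = row1[1] == row2[1] || row1[2] == row2[2]
```

Here `either` is `true` while `colwise` is `false`, so the two versions group such rows differently.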


Thank you very much for the update. It is interesting to see that the function works even when the number of vertices is larger than 127 (= `typemax(Int8)`). I just tried applying `calc_out2!` to a `df` given by, for example,

```
using Random
df = DataFrame(col1 = [randstring(2) for i in 1:256],
               col2 = [randstring(2) for j in 1:256])
```

where the total number of elements in `unique(df.col1)` and `unique(df.col2)` is larger than 127.
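A quick way to confirm that such a draw exceeds 127 vertices (`randstring(2)` draws from 62² possible strings, so most of the 512 values are distinct; `nvertices` is just an illustrative name):

```julia
using Random

c1 = [randstring(2) for _ in 1:256]
c2 = [randstring(2) for _ in 1:256]
nvertices = length(unique(c1)) + length(unique(c2))
nvertices > typemax(Int8)            # true with overwhelming probability
```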

Thank you very much for your update. I wrote down the (intermediate/final) output from the function step by step. I see; it is nice to build vectors containing the sets of group elements and the row numbers, respectively.

I expect that the two columns are independent of each other and that the comparison is to be performed column-wise.

Here is a script that does not depend on any package beyond DataFrames.
I tried to measure its efficiency on the following df, and after many attempts it seems to be more efficient (it allocates about half as much) than `calc_out2!`:

```
df = DataFrame(col1 = [randstring(2) for i in 1:256],
               col2 = [randstring(2) for j in 1:256], c3 = "")
```
Contains a sneaky bug
```
function rowsconnected(df)
    function connecten(n, lk1, notyetused, conn)
        # group indices adjacent to group n (col2 groups are offset by lk1)
        adj = n > lk1 ? g2[n-lk1].gi1 : g1[n].gi2
        append!(conn, n > lk1 ? getfield(g2[n-lk1], :rows) : getfield(g1[n], :rows))
        deleteat!(notyetused, searchsortedfirst(notyetused, n))
        for ns in adj
            (searchsortedfirst(notyetused, ns) < searchsortedlast(notyetused, ns)) && connecten(ns, lk1, notyetused, conn)
            # ns ∈ notyetused && connecten(ns, lk1, notyetused, conn)
        end
    end
    g1 = groupby(df, :col1)
    lk1 = length(keys(g1))
    g2 = groupby(df, :col2)
    df.gi1 = groupindices(g1)
    df.gi2 = groupindices(g2) .+ lk1
    notyetused = Int[df.gi1; df.gi2]
    unique!(notyetused)
    for n in notyetused
        conn = Int[]
        connecten(n, lk1, notyetused, conn)
        unique!(conn)
        m = maximum(conn)
        for r in conn
            df.c3[r] = df.col1[m] * df.col2[m]
        end
    end
    df
end
```

I havenβt been able to find a better solution for this point yet.
That is, find a function ssf(sorted_arr,x) that returns true if x is in sortedarr or false otherwise.

``````(searchsortedfirst(notyetused,ns) < searchsortedlast(notyetused,ns))
``````
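Since Julia 1.6, Base provides exactly this as `insorted(x, v)`, which performs a binary search and assumes `v` is already sorted:

```julia
v = [1, 3, 5, 7]
insorted(3, v)   # true
insorted(4, v)   # false
```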

I add a corrected version of the script without dependencies.

independent
```
function rowsconnected2(df)

    function connecten(n, lk1, notyetused, conn)
        if n > lk1
            g2n = g2[n-lk1]
            rn = getfield(g2n, :rows)
            adj = g2n.gi1
        else
            g1n = g1[n]
            rn = getfield(g1n, :rows)
            adj = g1n.gi2
        end
        # adj = n > lk1 ? g2[n-lk1].gi1 : g1[n].gi2
        # rn  = n > lk1 ? getfield(g2[n-lk1], :rows) : getfield(g1[n], :rows)
        append!(conn, rn)
        deleteat!(notyetused, searchsortedfirst(notyetused, n))
        for ns in adj
            (searchsortedfirst(notyetused, ns) <= searchsortedlast(notyetused, ns)) && connecten(ns, lk1, notyetused, conn)
            # ns ∈ notyetused && connecten(ns, lk1, notyetused, conn)
        end
    end
    g1 = groupby(df, :col1)
    lk1 = length(keys(g1))
    g2 = groupby(df, :col2)
    df.gi1 = groupindices(g1)
    df.gi2 = groupindices(g2) .+ lk1
    notyetused = Int[df.gi1; df.gi2]
    unique!(notyetused)

    while !isempty(notyetused)
        n = first(notyetused)
        conn = Int[]
        connecten(n, lk1, notyetused, conn)
        unique!(conn)
        m = maximum(conn)
        for r in conn
            df.c3[r] = df.col1[m] * df.col2[m]
        end
    end
    df
end
```

and a modified version of `calc_out`:

calc_out3!
```
function calc_out3!(df)
    g1 = groupby(df, :col1)
    g2 = groupby(df, :col2)
    df.c1 = groupindices(g1)
    df.c2 = groupindices(g2) .+ length(keys(g1))
    # prepare graph
    r, c = length.((keys(g1), keys(g2)))
    g = SimpleGraph(r + c)
    for i in 1:nrow(df)
        add_edge!(g, df.c1[i], df.c2[i])   # link the row's col1 group to its col2 group
    end
    cc = connected_components(g)

    for i in eachindex(cc)
        # nv = mapreduce(ic -> ic > r ? getfield(g2[ic-r], :rows) : getfield(g1[ic], :rows), vcat, cc[i])

        nv = Int[]
        foreach(e -> e > r ? append!(nv, getfield(g2[e-r], :rows)) : append!(nv, getfield(g1[e], :rows)), cc[i])

        # for e in cc[i]
        #     e > r ? append!(nv, getfield(g2[e-r], :rows)) : append!(nv, getfield(g1[e], :rows))
        # end

        unique!(nv)
        m = maximum(nv)
        for j in nv
            df.c3[j] = df.col1[m] * df.col2[m]
        end
    end
    df
end
```

Is it possible, and if so how, to modify the independent version to make it as fast as `calc_out3!`?