Counting in dataframes

optimist · June 5, 2023, 6:56pm

I have a dataframe in which I need to count the number of occurences of values in a given list. Suppose I have a dataframe df:

df= DataFrame(:B1 => [1,1,2,2,3,4,5], :B2 => [1,7,7,2,2,5,5],:B3 => [2,2,2,3,3,3,3] )

7×3 DataFrame
 Row │ B1     B2     B3    
     │ Int64  Int64  Int64 
─────┼─────────────────────
   1 │     1      1      2
   2 │     1      7      2
   3 │     2      7      2
   4 │     2      2      3
   5 │     3      2      3
   6 │     4      5      3
   7 │     5      5      3

And I want to produce df2, where the column VAL has numbers 1 to 7 and I want to produce a count of them against the entries in columns of the dataframe df.

df2 = DataFrame(:VAL => collect(1:7),:B1 => [2,2,1,1,1,0,0], :B2 => [1,2,2,0,0,2,2],:B3 => [0,3,4,0,0,0,0])

7×4 DataFrame
 Row │ VAL    B1     B2     B3    
     │ Int64  Int64  Int64  Int64 
─────┼────────────────────────────
   1 │     1      2      1      0
   2 │     2      2      2      3
   3 │     3      1      2      4
   4 │     4      1      0      0
   5 │     5      1      0      0
   6 │     6      0      2      0
   7 │     7      0      2      0

How can I do this?

bkamins · June 5, 2023, 7:10pm

You could probably write a custom function doing this, but here is a solution using predefined functions only:

julia> @chain [rename!(combine(groupby(df, "B$i"), nrow => :VAL), ["VAL", "B$i"]) for i in 1:3] begin
           outerjoin(_..., on=:VAL)
           coalesce.(0)
           sort!(:VAL)
       end
6×4 DataFrame
 Row │ VAL    B1     B2     B3
     │ Int64  Int64  Int64  Int64
─────┼────────────────────────────
   1 │     1      2      1      0
   2 │     2      2      2      3
   3 │     3      1      0      4
   4 │     4      1      0      0
   5 │     5      1      2      0
   6 │     7      0      2      0

optimist · June 5, 2023, 7:18pm

bkamins:

@chain [rename!(combine(groupby(df, "B$i"), nrow => :VAL), ["VAL", "B$i"]) for i in 1:3] begin
           outerjoin(_..., on=:VAL)
           coalesce.(0)
           sort!(:VAL)
       end

Thank you!

juliohm · June 5, 2023, 10:54pm

Whenever you need to count entries in a table to produce another table with these counts, you are after what is called contingency tables:

Dan · June 6, 2023, 12:10am

Another method is to use countmap from StatsBase which is a good function to know:

using StatsBase

df2 = DataFrame([
  1:7 get.(permutedims(countmap.(eachcol(df))),1:7,0)], 
  ["VAL",names(df)...])

giving df2 as in the OP.

rocco_sprmnt21 · June 6, 2023, 9:38pm

rng=(:)(extrema(union(eachcol(df)...))...)

m=[count(==(j), df[:,i]) for j in rng , i in 1:ncol(df)]  #used the function called in the title :smile:

DataFrame([rng;;m],["val";names(df)...])

rocco_sprmnt21 · June 7, 2023, 7:08am

just for fun

udf=stack(df,Cols(:))
push!(udf,("",6))
gudf=groupby(udf,:value)
cgudf=combine(gudf, x->unique(combine(groupby(x,:variable),:variable=>length=>:count, :value)))
select(unstack(cgudf, :variable, :count, fill=0), Not(""))

or


udf=stack(df,Cols(:))
cgudf=combine(groupby(udf,:value), x->combine(groupby(x,:variable),:variable=>length=>:count, :value))
push!(unstack(cgudf, :variable, :count, combine=first,fill=0),(6,0,0,0))

#-----------------

udf=stack(df,Cols(:))
cgudf=combine(groupby(udf,:value), x->combine(groupby(x,:variable),:variable=>length=>:count, :value))
dfu=unstack(cgudf, :variable, :count, combine=first,fill=0)
lac=[(i,0,0,0) for i in (:)(extrema(dfu.value)...) if i ∉ dfu.value]
push!(dfu,lac...)

The following one is perhaps less convoluted than the others

#-----------------

  udf=stack(df,Cols(:))
  cg=combine(groupby(udf, [:value,:variable]),nrow)
  ucg=unstack(cg,:variable,:nrow, fill=0)
  push!(ucg,(6,0,0,0))


#------------------------

rocco_sprmnt21 · June 7, 2023, 1:49pm

a comparison between the different proposals for the small example df

function tr17(df)
    m=[length(findall(==(j), df[:,i])) for j in 1:7 , i in 1:3]
    #m=(count(==(j), df[:,i]) for j in 1:7 , i in 1:ncol(df))
    DataFrame([1:7;;m],["val";names(df)...])
end


julia> @btime tr17(df)
  5.550 μs (155 allocations: 9.26 KiB)
7×4 DataFrame
 Row │ val    B1     B2     B3    
     │ Int64  Int64  Int64  Int64
─────┼────────────────────────────
   1 │     1      2      1      0
   2 │     2      2      2      3
   3 │     3      1      0      4
   4 │     4      1      0      0
   5 │     5      1      2      0
   6 │     6      0      0      0
   7 │     7      0      2      0
julia> using DataFramesMeta

julia> @btime @chain [rename!(combine(groupby(df, "B$i"), nrow => :VAL), ["VAL", "B$i"]) for i in 1:3] begin
                  outerjoin(_..., on=:VAL)
                  coalesce.(0)
                  sort!(:VAL)
              end
  185.800 μs (1306 allocations: 79.44 KiB)
6×4 DataFrame
 Row │ VAL    B1     B2     B3    
     │ Int64  Int64  Int64  Int64
─────┼────────────────────────────
   1 │     1      2      1      0
   2 │     2      2      2      3
   3 │     3      1      0      4
   4 │     4      1      0      0
   5 │     5      1      2      0
   6 │     7      0      2      0

julia> using StatsBase

julia> @btime df2 = DataFrame([
         1:7 get.(permutedims(countmap.(eachcol(df))),1:7,0)],
         ["VAL",names(df)...])
  3.943 μs (64 allocations: 5.29 KiB)
7×4 DataFrame
 Row │ VAL    B1     B2     B3    
     │ Int64  Int64  Int64  Int64
─────┼────────────────────────────
   1 │     1      2      1      0
   2 │     2      2      2      3
   3 │     3      1      0      4
   4 │     4      1      0      0
   5 │     5      1      2      0
   6 │     6      0      0      0
   7 │     7      0      2      0



julia> function stunst(df)
         udf=stack(df,Cols(:))
         cg=combine(groupby(udf, [:value,:variable]),nrow)
         ucg=unstack(cg,:variable,:nrow, fill=0)
         push!(ucg,(6,0,0,0))
       end
stunst (generic function with 1 method)

julia> @btime stunst(df)
  80.400 μs (531 allocations: 34.33 KiB)
7×4 DataFrame
 Row │ value  B1     B2     B3    
     │ Int64  Int64  Int64  Int64
─────┼────────────────────────────
   1 │     1      2      1      0
   2 │     2      2      2      3
   3 │     3      1      0      4
   4 │     4      1      0      0
   5 │     5      1      2      0
   6 │     7      0      2      0
   7 │     6      0      0      0

Topic		Replies	Views
Counts of unique values per group in a DataFrame Data question , dataframes	3	10201	May 25, 2020
How to count the number of categories present in a column of a DataFrame Data question , dataframes	8	4490	September 18, 2022
Pandas value_counts() equivalent in base Julia or in Data frame package? General Usage	1	246	January 27, 2023
Counting Value Occurences in Time Range in DataFrames Data dataframes	3	434	April 28, 2021
Efficiently finding the frequency of patterns in DataFrame columns New to Julia dataframes , dictionaries , splitapplycombine	12	1551	January 1, 2022

Counting in dataframes

Related topics