Generating group id column in DataFrames?

Here is an example data.

julia> b = DataFrame(city = ["BJ", "BJ", "BJ", "HF", "HF", "TS", "TS"], year = [1990, 1991, 1992, 1990, 1991, 1990, 1991], cpi = [1, 2, 3, 4, 5, 2, 1])
7×3 DataFrame
 Row │ city    year   cpi
     │ String  Int64  Int64
─────┼──────────────────────
   1 │ BJ       1990      1
   2 │ BJ       1991      2
   3 │ BJ       1992      3
   4 │ HF       1990      4
   5 │ HF       1991      5
   6 │ TS       1990      2
   7 │ TS       1991      1

In Stata, we can generate the group variable

egen id = group(city)

How can I generate id in Julia?

Here is my solution,

julia> gd = groupby(b, :city);
julia> insertcols!(b, 1, :id => gd.groups)
7×4 DataFrame
 Row │ id     city    year   cpi
     │ Int64  String  Int64  Int64
─────┼─────────────────────────────
   1 │     1  BJ       1990      1
   2 │     1  BJ       1991      2
   3 │     1  BJ       1992      3
   4 │     2  HF       1990      4
   5 │     2  HF       1991      5
   6 │     3  TS       1990      2
   7 │     3  TS       1991      1

Anyone has another solution to this problem?

1 Like

Another way:

b.id = groupby(b, :city).groups

The groups field is internal (and undocumented). Better use the groupindices function, via transform!(groupby(df, :year), groupindices => :id) or df.id = groupindices(groupby(df, :year)).

4 Likes

The second option you provided works well but not this one quoted.
Could you please confirm?

It works here. What error do you get?

Thanks very much. This is what I want. This answer is what I am waiting for. Because sometimes we may transform the dataframe and then want to generate id.

julia> df
7×4 DataFrame
 Row │ city    year   cpi    id
     │ String  Int64  Int64  Int64?
─────┼──────────────────────────────
   1 │ BJ       1992      3       1
   2 │ BJ       1991      2       1
   3 │ HF       1991      5       2
   4 │ TS       1991      1       3
   5 │ BJ       1990      1       1
   6 │ HF       1990      4       2
   7 │ TS       1990      2       3

julia> gd = groupby(df, :city)
GroupedDataFrame with 3 groups based on key: city
First Group (3 rows): city = "BJ"
 Row │ city    year   cpi    id
     │ String  Int64  Int64  Int64?
─────┼──────────────────────────────
   1 │ BJ       1992      3       1
   2 │ BJ       1991      2       1
   3 │ BJ       1990      1       1
⋮
Last Group (2 rows): city = "TS"
 Row │ city    year   cpi    id
     │ String  Int64  Int64  Int64?
─────┼──────────────────────────────
   1 │ TS       1991      1       3
   2 │ TS       1990      2       3

julia> sort!(df, :year)
7×4 DataFrame
 Row │ city    year   cpi    id
     │ String  Int64  Int64  Int64?
─────┼──────────────────────────────
   1 │ BJ       1990      1       1
   2 │ HF       1990      4       2
   3 │ TS       1990      2       3
   4 │ BJ       1991      2       1
   5 │ HF       1991      5       2
   6 │ TS       1991      1       3
   7 │ BJ       1992      3       1

julia> df.id = groupindices(groupby(df, :city))
7-element Vector{Union{Missing, Int64}}:
 1
 2
 3
 1
 2
 3
 1

julia> df
7×4 DataFrame
 Row │ city    year   cpi    id
     │ String  Int64  Int64  Int64?
─────┼──────────────────────────────
   1 │ BJ       1990      1       1
   2 │ HF       1990      4       2
   3 │ TS       1990      2       3
   4 │ BJ       1991      2       1
   5 │ HF       1991      5       2
   6 │ TS       1991      1       3
   7 │ BJ       1992      3       1

Perfect. Thanks.

but this solution does not work

julia> transform!(groupby(df, :city), groupindices => :id)
ERROR: ArgumentError: Unrecognized column selector: DataFrames.groupindices => :id
Stacktrace:
 [1] normalize_selection(idx::DataFrames.Index, sel::Any, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/abstractdataframe/selection.jl:339
 [2] _combine_prepare_norm(gd::GroupedDataFrame{DataFrame}, cs_vec::Vector{Any}, keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:68
 [3] _combine_prepare(gd::GroupedDataFrame{DataFrame}, ::Base.RefValue{Any}; keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:50
 [4] select(::GroupedDataFrame{DataFrame}, ::Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat}, ::Vararg{Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat}}; copycols::Bool, keepkeys::Bool, ungroup::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:725
 [5] transform!(gd::GroupedDataFrame{DataFrame}, args::Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat}; ungroup::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:784
 [6] transform!(gd::GroupedDataFrame{DataFrame}, args::Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat})
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:782
 [7] top-level scope
   @ REPL[313]:1

caused by: MethodError: no method matching getindex(::DataFrames.Index, ::Pair{typeof(groupindices), Symbol})
Closest candidates are:
  getindex(::DataFrames.AbstractIndex, ::InvertedIndex) at ~/.julia/packages/DataFrames/zqFGs/src/other/index.jl:224
  getindex(::DataFrames.AbstractIndex, ::All) at ~/.julia/packages/DataFrames/zqFGs/src/other/index.jl:227
  getindex(::DataFrames.AbstractIndex, ::AbstractRange{Int64}) at ~/.julia/packages/DataFrames/zqFGs/src/other/index.jl:208
  ...
Stacktrace:
 [1] normalize_selection(idx::DataFrames.Index, sel::Any, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/abstractdataframe/selection.jl:336
 [2] _combine_prepare_norm(gd::GroupedDataFrame{DataFrame}, cs_vec::Vector{Any}, keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:68
 [3] _combine_prepare(gd::GroupedDataFrame{DataFrame}, ::Base.RefValue{Any}; keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:50
 [4] select(::GroupedDataFrame{DataFrame}, ::Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat}, ::Vararg{Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat}}; copycols::Bool, keepkeys::Bool, ungroup::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:725
 [5] transform!(gd::GroupedDataFrame{DataFrame}, args::Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat}; ungroup::Bool, renamecols::Bool)
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:784
 [6] transform!(gd::GroupedDataFrame{DataFrame}, args::Union{Regex, AbstractString, Function, Signed, Symbol, Unsigned, Pair, Type, All, Between, Cols, InvertedIndex, AbstractVecOrMat})
   @ DataFrames ~/.julia/packages/DataFrames/zqFGs/src/groupeddataframe/splitapplycombine.jl:782
 [7] top-level scope
   @ REPL[313]:1

Why doesn’t this code work?

@transform! df :id = groupindices(groupby(df, :city))

What version of DataFrames are you on?

This doesn’t work because here :city is pared as the column city, so it’s equivalent to calling

groupby(df, ["BJ", "BJ", "BJ", "HF", "HF", "TS", "TS"])

For proper escaping rules here, do

julia> @transform! df :id = groupindices(groupby(df, ^(:city)))
7×4 DataFrame
 Row │ city    year   cpi    id     
     │ String  Int64  Int64  Int64? 
─────┼──────────────────────────────
   1 │ BJ       1990      1       1
   2 │ BJ       1991      2       1
   3 │ BJ       1992      3       1
   4 │ HF       1990      4       2
   5 │ HF       1991      5       2
   6 │ TS       1990      2       3
   7 │ TS       1991      1       3

See docs here.

(test) pkg> st
      Status `~/test/Project.toml`
  [a93c6f00] DataFrames v1.3.4
  [1313f7d8] DataFramesMeta v0.11.0
  [9d5cd8c9] FixedEffectModels v1.6.3
  [612083be] Queryverse v0.7.0
  [ce6b1742] RDatasets v0.7.7
  [d519eb52] RegressionTables v0.5.4
  [bd369af6] Tables v1.7.0

I am getting the same error message as Raymond got here.

versioninfo()
(@v1.7) pkg> st DataFrames
      Status `C:\Users\jrafa\.julia\environments\v1.7\Project.toml`
  [a93c6f00] DataFrames v1.3.4

julia> versioninfo()
Julia Version 1.7.3
Commit 742b9abb4d (2022-05-06 12:58 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, icelake-client)
Environment:
  JULIA_PKG_USE_CLI_GIT = true
  JULIA_STACKFRAME_FUNCTION_COLOR = blue
  JULIA_WARN_COLOR = cyan
  JULIA_EDITOR = code.cmd -g
  JULIA_NUM_THREADS = 8

I run the example in doc, but gets error.

(test) pkg> st
      Status `~/test/Project.toml`
  [a93c6f00] DataFrames v1.3.4
  [1313f7d8] DataFramesMeta v0.11.0
  [9d5cd8c9] FixedEffectModels v1.6.3
  [612083be] Queryverse v0.7.0
  [ce6b1742] RDatasets v0.7.7
  [d519eb52] RegressionTables v0.5.4
  [bd369af6] Tables v1.7.0

julia> df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);

julia> @select(df, :x2 = :x, :x3 = ^(:x))
ERROR: MethodError: no method matching ^(::Vector{Int64})
Closest candidates are:
  ^(::Union{AbstractChar, AbstractString}, ::Integer) at /Applications/Julia-1.7.app/Contents/Resources/julia/share/julia/base/strings/basic.jl:721
  ^(::Rational, ::Integer) at /Applications/Julia-1.7.app/Contents/Resources/julia/share/julia/base/rational.jl:475
  ^(::Number, ::Missing) at /Applications/Julia-1.7.app/Contents/Resources/julia/share/julia/base/missing.jl:124

Yikes. Please file an issue so I can track it

Ah sorry this has been fixed on master (soon to be 1.4.0) but it still fails on 1.3.4.

1 Like

df = DataFrame(city = ["BJ", "BJ", "BJ", "HF", "HF", "TS", "TS"], 
year = [1990, 1991, 1992, 1990, 1991, 1990, 1991], cpi = [1, 2, 3, 4, 5, 2, 1])

# groupindices(groupby(df,:city))

grp=groupby(df,:city)

vcat(grp..., source="groupid")