How to reshape wide to long?

Dear all,

I want to reshape the wide data to panel data, here is an example.

julia> df = DataFrame(id = [1, 2, 3], gdp1999 = [2, 3, 4], gdp2000 = [3, 4, 5],
                      gdp2001 = [6, 7, 8], emp1999 = [3, 2, 1], emp2000 = [4, 7, 9],
                      emp2001 = [5, 4, 2])
3×7 DataFrame
 Row │ id     gdp1999  gdp2000  gdp2001  emp1999  emp2000  emp2001
     │ Int64  Int64    Int64    Int64    Int64    Int64    Int64
─────┼─────────────────────────────────────────────────────────────
   1 │     1        2        3        6        3        4        5
   2 │     2        3        4        7        2        7        4
   3 │     3        4        5        8        1        9        2

what I want is

 Row │ id     year       gdp     emp
     │ Int64  SubStrin…  Int64?  Int64?
─────┼──────────────────────────────────
   1 │     1  1999            2       3
   2 │     1  2000            3       4
   3 │     1  2001            6       5
   4 │     2  1999            3       2
   5 │     2  2000            4       7
   6 │     2  2001            7       4
   7 │     3  1999            4       1
   8 │     3  2000            5       9
   9 │     3  2001            8       2

I try like this, But I don’t think it is a good way. Anyone has better idea to do this?

julia> df = DataFrame(id = [1, 2, 3], gdp1999 = [2, 3, 4], gdp2000 = [3, 4, 5],
                             gdp2001 = [6, 7, 8], emp1999 = [3, 2, 1], emp2000 = [4, 7, 9],
                             emp2001 = [5, 4, 2])
3×7 DataFrame
 Row │ id     gdp1999  gdp2000  gdp2001  emp1999  emp2000  emp2001
     │ Int64  Int64    Int64    Int64    Int64    Int64    Int64
─────┼─────────────────────────────────────────────────────────────
   1 │     1        2        3        6        3        4        5
   2 │     2        3        4        7        2        7        4
   3 │     3        4        5        8        1        9        2

julia> df1 = stack(df, Between(:gdp1999, :gdp2001), [:id], variable_name = :year, value_name = "gdp")
9×3 DataFrame
 Row │ id     year     gdp
     │ Int64  String   Int64
─────┼───────────────────────
   1 │     1  gdp1999      2
   2 │     2  gdp1999      3
   3 │     3  gdp1999      4
   4 │     1  gdp2000      3
   5 │     2  gdp2000      4
   6 │     3  gdp2000      5
   7 │     1  gdp2001      6
   8 │     2  gdp2001      7
   9 │     3  gdp2001      8

julia> @rtransform! df1 :year = SubString(:year, 4, 7)
9×3 DataFrame
 Row │ id     year       gdp
     │ Int64  SubStrin…  Int64
─────┼─────────────────────────
   1 │     1  1999           2
   2 │     2  1999           3
   3 │     3  1999           4
   4 │     1  2000           3
   5 │     2  2000           4
   6 │     3  2000           5
   7 │     1  2001           6
   8 │     2  2001           7
   9 │     3  2001           8

julia> df2 = stack(df, Between(:emp1999, :emp2001), [:id], variable_name = :year, value_name = :emp)
9×3 DataFrame
 Row │ id     year     emp
     │ Int64  String   Int64
─────┼───────────────────────
   1 │     1  emp1999      3
   2 │     2  emp1999      2
   3 │     3  emp1999      1
   4 │     1  emp2000      4
   5 │     2  emp2000      7
   6 │     3  emp2000      9
   7 │     1  emp2001      5
   8 │     2  emp2001      4
   9 │     3  emp2001      2

julia> @rtransform! df2 :year = SubString(:year, 4, 7)
9×3 DataFrame
 Row │ id     year       emp
     │ Int64  SubStrin…  Int64
─────┼─────────────────────────
   1 │     1  1999           3
   2 │     2  1999           2
   3 │     3  1999           1
   4 │     1  2000           4
   5 │     2  2000           7
   6 │     3  2000           9
   7 │     1  2001           5
   8 │     2  2001           4
   9 │     3  2001           2

julia> df = outerjoin(df1, df2, on = [:id, :year])
9×4 DataFrame
 Row │ id     year       gdp     emp
     │ Int64  SubStrin…  Int64?  Int64?
─────┼──────────────────────────────────
   1 │     1  1999            2       3
   2 │     2  1999            3       2
   3 │     3  1999            4       1
   4 │     1  2000            3       4
   5 │     2  2000            4       7
   6 │     3  2000            5       9
   7 │     1  2001            6       5
   8 │     2  2001            7       4
   9 │     3  2001            8       2

julia> sort!(df, :id)
9×4 DataFrame
 Row │ id     year       gdp     emp
     │ Int64  SubStrin…  Int64?  Int64?
─────┼──────────────────────────────────
   1 │     1  1999            2       3
   2 │     1  2000            3       4
   3 │     1  2001            6       5
   4 │     2  1999            3       2
   5 │     2  2000            4       7
   6 │     2  2001            7       4
   7 │     3  1999            4       1
   8 │     3  2000            5       9
   9 │     3  2001            8       2

it is not exactly the most intuitive and direct procedure in the world, but so be it

dfp=permutedims(df,1,strict=false)
dfp1=select(dfp,2:4,:id=>ByRow(x->(x[1:3], x[4:7]))=>[:t,:year])
st=stack(dfp1,1:3)
unstack(st,[:variable,:year],:t,:value)

an other way


dfp=permutedims(df,1,strict=false)
dfp2=select(dfp,2:4=>ByRow(Base.vect)=>:val,:id=>ByRow(x->(x[1:3], x[4:7]))=>[:t,:year])
flatten(unstack(dfp2,:year,:t,:val),[:gdp,:emp])
julia> dfp=permutedims(df,1,strict=false)
ERROR: MethodError: no method matching permutedims(::DataFrame, ::Int64; strict=false)
Closest candidates are:

on DataFrames@1.3.3. Besides, permutedims needs the first column to by either symbols or strings, no? (edit: that is what the not yet officially released strict keyword is about)

The kwarg strict set to false could manage that:

• strict : if true (the default), an error will be raised if
the values contained in the src_namescol are not all
Symbol or all AbstractString, or can all be converted to
String using convert. If false then any values are
accepted and the will be changed to strings using the
string function.

julia> dfp=permutedims(df,1,strict=false)
6×4 DataFrame
 Row │ id       1      2      3     
     │ String   Int64  Int64  Int64
─────┼──────────────────────────────
   1 │ gdp1999      2      3      4
   2 │ gdp2000      3      4      5
   3 │ gdp2001      6      7      8
   4 │ emp1999      3      2      1
   5 │ emp2000      4      7      9
   6 │ emp2001      5      4      2

julia> dfp=permutedims(df,1)
ERROR: ArgumentError: all elements of src_namescol must support conversion to String

this the status

(v1.7) pkg> st DataFrames
      Status `C:\Users\sprmn\.julia\v1.7\Project.toml`
  [a93c6f00] DataFrames v1.4.0 `https://github.com/JuliaData/DataFrames.jl.git#main`
1 Like

Right. That’s handy of course.

But it’s a yet unreleased feature. You are on the main branch, not a release :slight_smile:

try this


dfs=stack(df,2:7)

dfp1=select(dfs,[1,3],:variable=>ByRow(x->(x[1:3], x[4:7]))=>[:t,:year])
unstack(dfp1,[:id,:year],:t,:value)

PS
In hindsight, this seems, for the case in question, to be the canonical way of proceeding

2 Likes

Thanks very much. it works well.

using DataFrames, DataFramesMeta
df = DataFrame(id=[1, 2, 3],
    gdp1999=[2, 3, 4], gdp2000=[3, 4, 5], gdp2001=[6, 7, 8],
    emp1999=[3, 2, 1], emp2000=[4, 7, 9], emp2001=[5, 4, 2],
    infla1999=[2, 1, 4], infla2000=[3, 4, 5], infla2001=[5, 6, 7])

df1 = stack(df, Not(:id))
@rselect! df1 :id :value :t = :variable[1:end-4] :year = :variable[end-3:end]
unstack(df1, [:id, :year], :t, :value)

Thanks so much.

one more small generalization beyond yours, which works with version 1.4.0 # main.
I haven’t played with this feature with previous versions of Dataframes so I don’t know if it works with the others

julia> df = DataFrame(id = [1, 2, 3], gdp1999 = [2, 3, 4], gdp2000 = [3, 4, 5], gdp2001 = [6, 7, 8], 
                      emp1999 = [3, 2, 1], emp2000 = [4, 7, 9], emp2001 = [5, 4, 2],
                      infla1999=[2, 1, 4], infla2002=[3, 4, 5], infla2001=[5, 6, 7])

julia> unstack(dfp1,[:id,:year],:t,:value)
12×5 DataFrame
 Row │ id     year    gdp      emp      infla
     │ Int64  String  Int64?   Int64?   Int64?
─────┼──────────────────────────────────────────
   1 │     1  1999          2        3        2
   2 │     2  1999          3        2        1
   3 │     3  1999          4        1        4
   4 │     1  2000          3        4  missing
   5 │     2  2000          4        7  missing
   6 │     3  2000          5        9  missing
   7 │     1  2001          6        5        5
   8 │     2  2001          7        4        6
   9 │     3  2001          8        2        7
  10 │     1  2002    missing  missing        3
  11 │     2  2002    missing  missing        4
  12 │     3  2002    missing  missing        5

julia> unstack(dfp1,[:id,:year],:t,:value, fill="\U1F64A")
12×5 DataFrame
 Row │ id     year    gdp  emp  infla 
     │ Int64  String  Any  Any  Any   
─────┼────────────────────────────────
   1 │     1  1999    2    3    2
   2 │     2  1999    3    2    1
   3 │     3  1999    4    1    4
   4 │     1  2000    3    4    🙊
   5 │     2  2000    4    7    🙊
   6 │     3  2000    5    9    🙊
   7 │     1  2001    6    5    5
   8 │     2  2001    7    4    6
   9 │     3  2001    8    2    7
  10 │     1  2002    🙊   🙊   3
  11 │     2  2002    🙊   🙊   4
  12 │     3  2002    🙊   🙊   5

I have written a function that uses column names instead of numbers and can reshape multiple groups at the same time:

function pivot_longer(df::AbstractDataFrame,bycol::Symbol,cols::Vector{Symbol};period = :Period)
    # temp = nothing
     df_longer = nothing
     m = nothing
    for col ∈ cols
        temp = select(df,Regex(String(col)))
        for colname ∈ names(temp)
            m = match(Regex("($(col))(?<Value>\\d+)"),String(colname))
        if m == nothing
            println("Column $(col) has no regex match")
        else
            rename!(temp,colname=>Symbol(m[2]))
        end
        end
        temp[!,bycol] = df[!,bycol]
        temp = stack(temp,Not(bycol),variable_eltype=String)
        rename!(temp,:variable => period, :value => Symbol(m[1]))
        if df_longer == nothing
            df_longer = temp
        else
            df_longer = outerjoin(df_longer, temp, on = [bycol,period],makeunique=true)
        end
    end
    return df_longer
end

Results to:

julia> pivot_longer(df,:id,[:gdp, :emp],period=:year)
9×4 DataFrame
 Row │ id     year    gdp     emp
     │ Int64  String  Int64?  Int64?
─────┼───────────────────────────────
   1 │     1  1999         2       3
   2 │     2  1999         3       2
   3 │     3  1999         4       1
   4 │     1  2000         3       4
   5 │     2  2000         4       7
   6 │     3  2000         5       9
   7 │     1  2001         6       5
   8 │     2  2001         7       4
   9 │     3  2001         8       2
1 Like

This function is very useful. Thanks.

This function is very useful. Thanks.