Reshape long

Matthew · November 29, 2021, 8:14pm

Hi everyone,

I’m loving Julia but am having some difficulties reshaping my dataframe to a “long” format.

df = DataFrame(ID = 1:3, 
    sex = [0,1,0], 
    inc80 = [5500,2200,2000], 
    inc81 = incB,
    inc82 = incC, 
    ue80 = [0,1,0],
    ue81 = [1,0,0],
    ue82 = [0,0,1]);

in Stata, I would use:
reshape long inc ue, i(id) j(year)

and return

    Row │ ID     year   sex     inc       ue 
        │ Int64  Int64   Int64  Float64   Int64  
   ─────┼─────────────────────────────────────────────────
     1  │     1	80	0	5000	0
     2  │     1	81	0	5500	1
     3  │     1	82	0	6000	0
     4  │     2	80	1	2000	1
     5  │     2	81	1	2200	0
     6  │     2	82	1	3300	0
     7  │     3	80	0	3000	0
     8  │     3	81	0	2000	0
     9  │     3	82	0	1000	1

I’ve come moderately close with:

sort(stack(df, Not([:ID,:sex])))
---
 Row │ ID     sex    variable  value
     │ Int64  Int64  String    Int64
─────┼───────────────────────────────
   1 │     1      0  inc80      5500
   2 │     1      0  inc81      6500
   3 │     1      0  inc82      7500
   4 │     1      0  ue80          0
   5 │     1      0  ue81          1
   6 │     1      0  ue82          0
   7 │     2      1  inc80      2200
   8 │     2      1  inc81      3200
   9 │     2      1  inc82      4200
  10 │     2      1  ue80          1
  11 │     2      1  ue81          0
  12 │     2      1  ue82          0
  13 │     3      0  inc80      2000
  14 │     3      0  inc81      3000
  15 │     3      0  inc82      4000
  16 │     3      0  ue80          0
  17 │     3      0  ue81          0
  18 │     3      0  ue82          1

It’s like I want to join each variable together by their respective year for each id.

I’m hoping to find an alternative to Douglass or even find a quick/dirty way of doing accomplishing this. TIA

pdeffebach · November 29, 2021, 8:25pm

I think the problem is that you can’t stack multiple variables at once in DataFrames. There is an issue for it here. Until that’s resolved I think it’s not possible.

It also doesn’t do that automatic promotion to numbers, giving 80, 81, and 82, which is a nice feature of Stata.

So yeah. best bet is probably to do two separate stacks and then join them together. Comment on the linked issue to keep track of it, please.

julia> df = DataFrame(ID = 1:3, 
           sex = [0,1,0], 
           inc80 = [5500,2200,2000], 
           inc81 = [100, 200, 300],
           inc82 = [500, 600, 700], 
           ue80 = [0,1,0],
           ue81 = [1,0,0],
           ue82 = [0,0,1]);

julia> df2 = select(df, :sex, r"inc*")
3×4 DataFrame
 Row │ sex    inc80  inc81  inc82 
     │ Int64  Int64  Int64  Int64 
─────┼────────────────────────────
   1 │     0   5500    100    500
   2 │     1   2200    200    600
   3 │     0   2000    300    700

julia> stack(df2, r"inc*")
9×3 DataFrame
 Row │ sex    variable  value 
     │ Int64  String    Int64 
─────┼────────────────────────
   1 │     0  inc80      5500
   2 │     1  inc80      2200
   3 │     0  inc80      2000
   4 │     0  inc81       100
   5 │     1  inc81       200
   6 │     0  inc81       300
   7 │     0  inc82       500
   8 │     1  inc82       600
   9 │     0  inc82       700

Matthew · November 29, 2021, 9:42pm

Okay! Thanks a lot for the feedback and notes.

I created a very basic function to deal with the simple case provided above. Maybe I can make it more generic…

function myfun(df, identity, cons, features)
  df2 = select(df, identity, cons, Regex(features[1]));
  df3 = select(df, identity, Regex(features[2]));


  a = stack(df2,variable_name = features[1], Regex(features[1]));
  b = stack(df3, Regex(features[2]), variable_name = features[2], value_name = :val);

  @chain a begin
    groupby(:ID)
    @transform!(ind = 1:length(:sex))
  end;
    

  @chain b begin
    groupby(:ID)
    @transform!(ind = 1:length(:ue))
  end;
    
  res = outerjoin(a,b, on =[:ID,:ind])

  return(res)
  
end

jmboehm · November 29, 2021, 11:06pm

As an aside, you can expand the expression that Douglass gives you:

macroexpand(Main, :(Douglass.@d_str("reshape_long :inc :ue, i(:ID) j(:year)") ))

If you find a way that’s more performant (or if you come across other problems), I’d appreciate if you filed an issue on the repo. I’m sure there’s lots of low-hanging fruit.

Matthew · November 30, 2021, 3:15pm

Posting an attempt of a more general function, though it’s much slower than Douglass:

function reshape_long(df, id, cons, stubs)

  stub_len = length(stubs);
  df_vector = [DataFrame() for _ in 1:stub_len];

  iter = 1
  for i in 1:length(df_vector)
    if iter ==1

      df_vector[i] = select(df, [id, cons], Regex(stubs[i]))
      iter += 1

    else
      df_vector[i] = select(df, [id], Regex(stubs[i]))

    end
  end

  df_vector2 = [DataFrame() for _ in 1:stub_len];

  for i in 1:length(df_vector2)
    df_vector2[i] =  stack(df_vector[i], variable_name = stubs[i], Regex(stubs[i]),  value_name = Symbol("value_$i") );

    df_vector2[i] = combine(groupby(df_vector2[i], id), sdf -> sort(sdf), id => eachindex => :index)
  end



  res = [DataFrame() for _ in 1:(stub_len-1)];


  iter = 1
  for i in 1:length(res)

    if iter ==1
      res[i] = outerjoin(df_vector2[i],df_vector2[i+1], on = [id,:index] )
      iter +=1
    else
      res[i] = outerjoin(res[1], df_vector2[i+1], on = [id, :index])
    end


  end

  d = res[end]
  symbs = Symbol(stubs[1])
  d[!, symbs] = getproperty.(match.(r"[0-9]+",d[!, symbs]), :match)


  rename!(d,symbs => :year)
  select!(d, Not([ Symbol(stubs[i]) for i in 2:length(stubs) ]))
  rename!(d, [(Symbol("value_$i")=>Symbol(stubs[i])) for i in 1:length(stubs)])
  select!(d, Not(:index, ))

  return(d)

end

Example: reshape_long(df, :ID, :sex, ["inc", "ue"])

Topic		Replies	Views
How to reshape wide to long? General Usage	13	941	June 24, 2022
Memory efficient melting and stacking? Reshaping Wide to Long Performance	11	1517	November 7, 2020
Converting from Wide to Long format Data question	1	1743	July 12, 2017
Pivot a dataframe to wide format with values in multiple columns New to Julia question , dataframes	15	1811	September 10, 2020
New DataFrame whose columns are values of a column and grouped by another column General Usage dataframes	2	328	May 14, 2021

Reshape long

Related topics