Reshape long

Hi everyone,

I’m loving Julia but am having some difficulties reshaping my dataframe to a “long” format.

df = DataFrame(ID = 1:3, 
    sex = [0,1,0], 
    inc80 = [5500,2200,2000], 
    inc81 = incB,
    inc82 = incC, 
    ue80 = [0,1,0],
    ue81 = [1,0,0],
    ue82 = [0,0,1]); 

in Stata, I would use:
reshape long inc ue, i(id) j(year)

and return

    Row │ ID     year   sex     inc       ue 
        │ Int64  Int64   Int64  Float64   Int64  
   ─────┼─────────────────────────────────────────────────
     1  │     1	80	0	5000	0
     2  │     1	81	0	5500	1
     3  │     1	82	0	6000	0
     4  │     2	80	1	2000	1
     5  │     2	81	1	2200	0
     6  │     2	82	1	3300	0
     7  │     3	80	0	3000	0
     8  │     3	81	0	2000	0
     9  │     3	82	0	1000	1

I’ve come moderately close with:

sort(stack(df, Not([:ID,:sex])))
---
 Row │ ID     sex    variable  value
     │ Int64  Int64  String    Int64
─────┼───────────────────────────────
   1 │     1      0  inc80      5500
   2 │     1      0  inc81      6500
   3 │     1      0  inc82      7500
   4 │     1      0  ue80          0
   5 │     1      0  ue81          1
   6 │     1      0  ue82          0
   7 │     2      1  inc80      2200
   8 │     2      1  inc81      3200
   9 │     2      1  inc82      4200
  10 │     2      1  ue80          1
  11 │     2      1  ue81          0
  12 │     2      1  ue82          0
  13 │     3      0  inc80      2000
  14 │     3      0  inc81      3000
  15 │     3      0  inc82      4000
  16 │     3      0  ue80          0
  17 │     3      0  ue81          0
  18 │     3      0  ue82          1

It’s like I want to join each variable together by their respective year for each id.

I’m hoping to find an alternative to Douglass or even find a quick/dirty way of doing accomplishing this. TIA

I think the problem is that you can’t stack multiple variables at once in DataFrames. There is an issue for it here. Until that’s resolved I think it’s not possible.

It also doesn’t do that automatic promotion to numbers, giving 80, 81, and 82, which is a nice feature of Stata.

So yeah. best bet is probably to do two separate stacks and then join them together. Comment on the linked issue to keep track of it, please.

julia> df = DataFrame(ID = 1:3, 
           sex = [0,1,0], 
           inc80 = [5500,2200,2000], 
           inc81 = [100, 200, 300],
           inc82 = [500, 600, 700], 
           ue80 = [0,1,0],
           ue81 = [1,0,0],
           ue82 = [0,0,1]);

julia> df2 = select(df, :sex, r"inc*")
3×4 DataFrame
 Row │ sex    inc80  inc81  inc82 
     │ Int64  Int64  Int64  Int64 
─────┼────────────────────────────
   1 │     0   5500    100    500
   2 │     1   2200    200    600
   3 │     0   2000    300    700

julia> stack(df2, r"inc*")
9×3 DataFrame
 Row │ sex    variable  value 
     │ Int64  String    Int64 
─────┼────────────────────────
   1 │     0  inc80      5500
   2 │     1  inc80      2200
   3 │     0  inc80      2000
   4 │     0  inc81       100
   5 │     1  inc81       200
   6 │     0  inc81       300
   7 │     0  inc82       500
   8 │     1  inc82       600
   9 │     0  inc82       700
2 Likes

Okay! Thanks a lot for the feedback and notes.

I created a very basic function to deal with the simple case provided above. Maybe I can make it more generic…

function myfun(df, identity, cons, features)
  df2 = select(df, identity, cons, Regex(features[1]));
  df3 = select(df, identity, Regex(features[2]));


  a = stack(df2,variable_name = features[1], Regex(features[1]));
  b = stack(df3, Regex(features[2]), variable_name = features[2], value_name = :val);

  @chain a begin
    groupby(:ID)
    @transform!(ind = 1:length(:sex))
  end;
    

  @chain b begin
    groupby(:ID)
    @transform!(ind = 1:length(:ue))
  end;
    
  res = outerjoin(a,b, on =[:ID,:ind])

  return(res)
  
end

As an aside, you can expand the expression that Douglass gives you:

macroexpand(Main, :(Douglass.@d_str("reshape_long :inc :ue, i(:ID) j(:year)") ))

If you find a way that’s more performant (or if you come across other problems), I’d appreciate if you filed an issue on the repo. I’m sure there’s lots of low-hanging fruit.

2 Likes

Posting an attempt of a more general function, though it’s much slower than Douglass:

function reshape_long(df, id, cons, stubs)

  stub_len = length(stubs);
  df_vector = [DataFrame() for _ in 1:stub_len];

  iter = 1
  for i in 1:length(df_vector)
    if iter ==1

      df_vector[i] = select(df, [id, cons], Regex(stubs[i]))
      iter += 1

    else
      df_vector[i] = select(df, [id], Regex(stubs[i]))

    end
  end

  df_vector2 = [DataFrame() for _ in 1:stub_len];

  for i in 1:length(df_vector2)
    df_vector2[i] =  stack(df_vector[i], variable_name = stubs[i], Regex(stubs[i]),  value_name = Symbol("value_$i") );

    df_vector2[i] = combine(groupby(df_vector2[i], id), sdf -> sort(sdf), id => eachindex => :index)
  end



  res = [DataFrame() for _ in 1:(stub_len-1)];


  iter = 1
  for i in 1:length(res)

    if iter ==1
      res[i] = outerjoin(df_vector2[i],df_vector2[i+1], on = [id,:index] )
      iter +=1
    else
      res[i] = outerjoin(res[1], df_vector2[i+1], on = [id, :index])
    end


  end

  d = res[end]
  symbs = Symbol(stubs[1])
  d[!, symbs] = getproperty.(match.(r"[0-9]+",d[!, symbs]), :match)


  rename!(d,symbs => :year)
  select!(d, Not([ Symbol(stubs[i]) for i in 2:length(stubs) ]))
  rename!(d, [(Symbol("value_$i")=>Symbol(stubs[i])) for i in 1:length(stubs)])
  select!(d, Not(:index, ))

  return(d)

end


Example: reshape_long(df, :ID, :sex, ["inc", "ue"])