Applying group selection conditions using groupeddataframes

Here’s a solution that uses groupby (but not strictly split/apply/combine) which takes ~2/3 of the time of your solution in the toy example. I am also using some optimizations on your logic:

function process1(df)
  # initialize result dataframe
  df_result = copy(df[1:2,:]; copycols=true);

  for k = 1:maximum(df.id)

    df_temp = df[df.id .== k, :]

    # Keep only groups with members status == 1 and status == 2
    N = count(i->(i == 1),df_temp.status) + count(i->(i == 2),df_temp.status)
    if N != 2
        continue
    end

    # Keep only groups with working age status1
    stat1_age = df_temp[df_temp.status .== 1, :age]
    if (stat1_age[1] > 61 || stat1_age[1] < 25)
        continue
    end

    append!(df_result,df_temp)

  end

  delete!(df_result,1:2);
end
function process2(df)
  # initialize result dataframe
  df_result = copy(df[1:2,:]; copycols=true)

  for group in groupby(df, :id)

    # Keep only groups with members status == 1 and status == 2
    if 1 ∈ group.status && 2 ∈ group.status # faster than counting
      stat1_age = group[findfirst(x -> x == 1, group.status), :age] # Only using the first occurrence
      if 25 ≤ stat1_age ≤ 61
        append!(df_result, group)
      end
    end

  end

  delete!(df_result, 1:2)

  return df_result
end
julia> @btime process1($df)
  9.877 μs (129 allocations: 11.33 KiB)
5×3 DataFrame
│ Row │ id    │ age   │ status │
│     │ Int64 │ Int64 │ Int64  │
├─────┼───────┼───────┼────────┤
│ 1   │ 1     │ 53    │ 1      │
│ 2   │ 1     │ 52    │ 2      │
│ 3   │ 1     │ 17    │ 3      │
│ 4   │ 2     │ 31    │ 1      │
│ 5   │ 2     │ 29    │ 2      │

julia> @btime process2($df)
  6.997 μs (101 allocations: 9.05 KiB)
5×3 DataFrame
│ Row │ id    │ age   │ status │
│     │ Int64 │ Int64 │ Int64  │
├─────┼───────┼───────┼────────┤
│ 1   │ 1     │ 53    │ 1      │
│ 2   │ 1     │ 52    │ 2      │
│ 3   │ 1     │ 17    │ 3      │
│ 4   │ 2     │ 31    │ 1      │
│ 5   │ 2     │ 29    │ 2      │

Comparing to xiaodai’s answers:

julia> @btime processx($df)
  68.657 μs (408 allocations: 28.48 KiB)
5×3 DataFrame
│ Row │ id    │ age   │ status │
│     │ Int64 │ Int64 │ Int64  │
├─────┼───────┼───────┼────────┤
│ 1   │ 1     │ 53    │ 1      │
│ 2   │ 1     │ 52    │ 2      │
│ 3   │ 1     │ 17    │ 3      │
│ 4   │ 2     │ 31    │ 1      │
│ 5   │ 2     │ 29    │ 2      │
julia> @btime processx2($df)
  81.531 μs (537 allocations: 37.75 KiB)
5×3 DataFrame
│ Row │ id    │ age   │ status │
│     │ Int64 │ Int64 │ Int64  │
├─────┼───────┼───────┼────────┤
│ 1   │ 1     │ 53    │ 1      │
│ 2   │ 1     │ 52    │ 2      │
│ 3   │ 1     │ 17    │ 3      │
│ 4   │ 2     │ 31    │ 1      │
│ 5   │ 2     │ 29    │ 2      │

I tried other solutions with filter!, but that is much slower.

1 Like