Here’s a solution that uses groupby
(but not strictly split/apply/combine) which takes ~2/3 of the time of your solution in the toy example. I am also using some optimizations on your logic:
function process1(df)
# initialize result dataframe
df_result = copy(df[1:2,:]; copycols=true);
for k = 1:maximum(df.id)
df_temp = df[df.id .== k, :]
# Keep only groups with members status == 1 and status == 2
N = count(i->(i == 1),df_temp.status) + count(i->(i == 2),df_temp.status)
if N != 2
continue
end
# Keep only groups with working age status1
stat1_age = df_temp[df_temp.status .== 1, :age]
if (stat1_age[1] > 61 || stat1_age[1] < 25)
continue
end
append!(df_result,df_temp)
end
delete!(df_result,1:2);
end
function process2(df)
# initialize result dataframe
df_result = copy(df[1:2,:]; copycols=true)
for group in groupby(df, :id)
# Keep only groups with members status == 1 and status == 2
if 1 ∈ group.status && 2 ∈ group.status # faster than counting
stat1_age = group[findfirst(x -> x == 1, group.status), :age] # Only using the first occurrence
if 25 ≤ stat1_age ≤ 61
append!(df_result, group)
end
end
end
delete!(df_result, 1:2)
return df_result
end
julia> @btime process1($df)
9.877 μs (129 allocations: 11.33 KiB)
5×3 DataFrame
│ Row │ id │ age │ status │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼────────┤
│ 1 │ 1 │ 53 │ 1 │
│ 2 │ 1 │ 52 │ 2 │
│ 3 │ 1 │ 17 │ 3 │
│ 4 │ 2 │ 31 │ 1 │
│ 5 │ 2 │ 29 │ 2 │
julia> @btime process2($df)
6.997 μs (101 allocations: 9.05 KiB)
5×3 DataFrame
│ Row │ id │ age │ status │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼────────┤
│ 1 │ 1 │ 53 │ 1 │
│ 2 │ 1 │ 52 │ 2 │
│ 3 │ 1 │ 17 │ 3 │
│ 4 │ 2 │ 31 │ 1 │
│ 5 │ 2 │ 29 │ 2 │
Comparing to xiaodai’s answers:
julia> @btime processx($df)
68.657 μs (408 allocations: 28.48 KiB)
5×3 DataFrame
│ Row │ id │ age │ status │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼────────┤
│ 1 │ 1 │ 53 │ 1 │
│ 2 │ 1 │ 52 │ 2 │
│ 3 │ 1 │ 17 │ 3 │
│ 4 │ 2 │ 31 │ 1 │
│ 5 │ 2 │ 29 │ 2 │
julia> @btime processx2($df)
81.531 μs (537 allocations: 37.75 KiB)
5×3 DataFrame
│ Row │ id │ age │ status │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼────────┤
│ 1 │ 1 │ 53 │ 1 │
│ 2 │ 1 │ 52 │ 2 │
│ 3 │ 1 │ 17 │ 3 │
│ 4 │ 2 │ 31 │ 1 │
│ 5 │ 2 │ 29 │ 2 │
I tried other solutions with filter!
, but that is much slower.