DataFrame: delete duplicate rows based on a condition

I’m trying to get from this df

# Input data: column A contains repeated keys ("jj" and "ac" each appear twice).
df = DataFrame(
    A = ["jj", "jj", "ab", "ac", "ac"],
    B = [0, 1, 0, 0, 1],
    C = ["M", "M", "F", "M", "M"],
)
#=
5×3 DataFrame
│ Row │ A      │ B     │ C      │
│     │ String │ Int64 │ String │
├─────┼────────┼───────┼────────┤
│ 1   │ jj     │ 0     │ M      │
│ 2   │ jj     │ 1     │ M      │
│ 3   │ ab     │ 0     │ F      │
│ 4   │ ac     │ 0     │ M      │
│ 5   │ ac     │ 1     │ M      │
=#

to this df2

# Desired result: one row per distinct value of A.
df2 = DataFrame(
    A = ["jj", "ab", "ac"],
    B = [1, 0, 1],
    C = ["M", "F", "M"],
)
#=
3×3 DataFrame
│ Row │ A      │ B     │ C      │
│     │ String │ Int64 │ String │
├─────┼────────┼───────┼────────┤
│ 1   │ jj     │ 1     │ M      │
│ 2   │ ab     │ 0     │ F      │
│ 3   │ ac     │ 1     │ M      │
=#

Basically, I need to deduplicate the data frame on column A: whenever a value of A appears more than once, the rows with B == 0 should be dropped, while a value of A that appears only once keeps its row regardless of B. I thought I could loop over a grouped data frame and filter each group, but I was not aware that you cannot delete rows from a sub data frame (I get this error: `ERROR: ArgumentError: SubDataFrame does not support deleting row`).

# Attempted approach (does NOT work): visit each group of rows sharing the same
# value of A and, for groups with more than one row, try to keep only the rows
# where B == 1 by filtering the group in place.
for g in groupby(df, :A)
    if size(g, 1) > 1
        # This in-place filter is what fails: as reported above, it raises
        # "ArgumentError: SubDataFrame does not support deleting row".
        filter!(row->row[:B] == 1, g)
    end
end
# Split `df` into groups by A, transform each group, and stack the results.
# `by(df, :A) do ... end` is deprecated and no longer available in
# DataFrames.jl 1.x; `combine(groupby(df, :A)) do ... end` is the equivalent form.
#
# Per group: if it has more than one row, keep only the rows with B == 1;
# otherwise return the single-row group unchanged.
# NOTE: a group with several rows and no row where B == 1 yields an empty
# selection, so that value of A is absent from the result.
combine(groupby(df, :A)) do sbdf
    nrow(sbdf) > 1 ? sbdf[sbdf.B .== 1, :] : sbdf
end

For a large data frame there are potentially faster solutions; see the "Split-apply-combine" section of the DataFrames.jl documentation.

3 Likes

nice. thank you