Splitting time periods in a DataFrame

kobusherbst · October 26, 2022, 5:37am

I have a DataFrame with episodes of exposure of the form:

PersonId::Int64
DateOfBirth::Date
Sex::Int64
StartDate::Date
EndDate::Date

I would like to split each record on January 1, of each calendar year between :StartDate and :EndDate, e.g. where year start is >:StartDate && <= :EndDate. An episode can span several years.
Then I would like to split each record on each annual birthday of the person, if the birthday is >:StartDate && <= :EndDate
When an episode is split, the end date of the left-hand portion will be split date - 1 day, and the start date of the right-hand portion will be equal to the split date. Note: This could result in a right-hand portion where start date equals end date, if the split date equals the end date - this is fine.
:PersonId, :DateOfBirth, :Sex should be retained for all resulting records.
For those familiar with Stata, this is equivalent to the stsplit command.

I can think of a brute force method of iterating across all records, testing the year span of each episode and appending split records to an empty DataFrame, but I guess there might be a more elegant way of doing this using the amazing features of DataFrames.

Any help will be appreciated, thank you.

jules · October 26, 2022, 8:42am

Regardless of the function to split (this is just a rather quick and dirty try) one way is to group on all columns but those to change and return vectors in the combine call, they will be auto-concatenated. There are probably better ways to do it, though

using DataFrames, DataFrameMacros, Dates

df = DataFrame(
    PersonId = [1, 2],
    DateOfBirth = [Date(1989, 7, 15), Date(1965, 2, 3)],
    Sex = [0, 1],
    StartDate = [Date(2020, 02, 01), Date(2021, 09, 15)],
    EndDate = [Date(2022, 10, 13), Date(2022, 10, 24)]
)

function splits(startdate, enddate, birthdate)
    first_of_januarys = ceil(startdate, Year):Year(1):enddate
    birthdays = [Date(y, month(birthdate), day(birthdate)) for y in year(startdate):year(enddate)
        if startdate < Date(y, month(birthdate), day(birthdate)) < enddate
    ]
    all_days = sort(unique([startdate; enddate; first_of_januarys; birthdays]))
    starts = all_days[1:end-1]
    starts[2:end] .+= Day(1)
    ends = all_days[2:end]
    starts, ends, 1:length(starts)
end

@combine(
    groupby(df, Not([:StartDate, :EndDate])),
    @astable :StartDate, :EndDate, :Section = splits(only(:StartDate), only(:EndDate), only(:DateOfBirth))
)

9×6 DataFrame
 Row │ PersonId  DateOfBirth  Sex    StartDate   EndDate     Section 
     │ Int64     Date         Int64  Date        Date        Int64   
─────┼───────────────────────────────────────────────────────────────
   1 │        1  1989-07-15       0  2020-02-01  2020-07-15        1
   2 │        1  1989-07-15       0  2020-07-16  2021-01-01        2
   3 │        1  1989-07-15       0  2021-01-02  2021-07-15        3
   4 │        1  1989-07-15       0  2021-07-16  2022-01-01        4
   5 │        1  1989-07-15       0  2022-01-02  2022-07-15        5
   6 │        1  1989-07-15       0  2022-07-16  2022-10-13        6
   7 │        2  1965-02-03       1  2021-09-15  2022-01-01        1
   8 │        2  1965-02-03       1  2022-01-02  2022-02-03        2
   9 │        2  1965-02-03       1  2022-02-04  2022-10-24        3

rocco_sprmnt21 · October 26, 2022, 11:17am

function splitep(bd, sd, ed)

    y1l=Year(sd)+ Year(dayofyear(bd) < dayofyear(sd))
    y2l=Year(ed)- Year(dayofyear(bd) > dayofyear(ed))
    
    edl=Date.(y1l:Year(1):y2l, Month(bd),Day(bd)-Day(1))
    
    y1r=Year(sd)+ Year(dayofyear(sd)>1)
    y2r=Year(ed)
    sdr=Date.(y1r:Year(1):y2r, Month(1),Day(1))
    edr=Date.(Year.(sdr))-Day(1)
    
    E=sort([edl;edr;ed])
    S=[sd;E[1:end-1]+Day(1)]

    [S,E]    
end

cols=[:DateOfBirth, :StartDate,:EndDate]


cdf=combine(df ,[:PersonId, :Sex, :DateOfBirth], cols=>ByRow(splitep)=>cols[2:end])

flatten(cdf, cols[2:end])


julia> flatten(cdf, cols[2:end])
9×5 DataFrame
 Row │ PersonId  Sex    DateOfBirth  StartDate   EndDate    
     │ Int64     Int64  Date         Date        Date
─────┼──────────────────────────────────────────────────────
   1 │        1      0  1989-07-15   2020-02-01  2020-07-14
   2 │        1      0  1989-07-15   2020-07-15  2020-12-31
   3 │        1      0  1989-07-15   2021-01-01  2021-07-14
   4 │        1      0  1989-07-15   2021-07-15  2021-12-31
   5 │        1      0  1989-07-15   2022-01-01  2022-07-14
   6 │        1      0  1989-07-15   2022-07-15  2022-10-13
   7 │        2      1  1965-02-03   2021-09-15  2021-12-31
   8 │        2      1  1965-02-03   2022-01-01  2022-02-02
   9 │        2      1  1965-02-03   2022-02-03  2022-10-24

kobusherbst · October 28, 2022, 5:11am

Thank you for the code, unfortunately, the split should happen on the split date, so that, for example, the split on year start should Jan 1 as the start date not Jan 2.

kobusherbst · October 28, 2022, 5:17am

Thank you I like this solution, but unfortunately it throws an error:

ERROR: ArgumentError: Day: 0 out of range (1:31)

I could also not test the code in the edge case if the BirthDay falls on Jan 1, and the start date of the period equals the birth date.

I used this test DataFrame:

df = DataFrame(
    PersonId=[1, 2, 3],
    DateOfBirth=[Date(1989, 7, 15), Date(1965, 2, 3), Date(2020,1,1)],
    Sex=[0, 1, 0],
    StartDate=[Date(2020, 02, 01), Date(2021, 09, 15), Date(2020,1,1)],
    EndDate=[Date(2022, 10, 13), Date(2022, 10, 24), Date(2022, 10, 24)]
)

jules · October 28, 2022, 5:42am

Then you just need to change the addition of 1 to each start date except the first to a subtraction of 1 day from each end date except the last

kobusherbst · October 28, 2022, 6:52am

Thank you. For the record this is the modified function:

function splits(startdate, enddate, birthdate)
    if startdate == enddate
        return [startdate], [enddate], [1]
    end
    first_of_januarys = ceil(startdate, Year):Year(1):enddate
    birthdays = [Date(y, month(birthdate), day(birthdate)) for y in year(startdate):year(enddate)
        if startdate < Date(y, month(birthdate), day(birthdate)) < enddate
    ]
    all_days = sort(unique([startdate; enddate; first_of_januarys; birthdays]))
    starts = all_days[1:end-1]
    ends = all_days[2:end]
    ends[1:end-1] .-= Day(1)
    starts, ends, 1:length(starts)
end

It also handles edge cases perfectly, but I had to add the if statement at the start otherwise records with startdate == enddate are dropped. Thank you.

rocco_sprmnt21 · October 28, 2022, 1:11pm

ok. What should the output be in this case?

kobusherbst · October 28, 2022, 2:05pm

For PersonId = 3

PersonId	DateofBirth	StartDate	EndDate
3	2020-01-01	2020-01-01	2020-12-31
3	2020-01-01	2021-01-01	2021-12-31
3	2020-01-01	2022-01-01	2022-10-24

For this test data:

df = DataFrame(
    PersonId=[1, 2, 3, 4],
    DateOfBirth=[Date(1989, 7, 15), Date(1965, 2, 3), Date(2000, 1, 1), Date(2000, 12, 31)],
    Sex=[0, 1, 0, 1],
    StartDate=[Date(2020, 02, 01), Date(2021, 09, 15), Date(2020, 1, 1), Date(2001, 12, 1)],
    EndDate=[Date(2022, 10, 13), Date(2022, 10, 24), Date(2022, 10, 24), Date(2001, 12, 1)]
)

Person 4 has an episode that start and end of the same day, that episode should be passed as is to the output.

rocco_sprmnt21 · October 28, 2022, 3:32pm

Try this one, albeit a little convoluted. But the specifications are not entirely “linear”

 4×5 DataFrame
 Row │ PersonId  DateOfBirth  Sex    StartDate   EndDate    
     │ Int64     Date         Int64  Date        Date
─────┼──────────────────────────────────────────────────────
   1 │        1  1989-07-15       0  2020-02-01  2022-10-13
   2 │        2  1965-02-03       1  2021-09-15  2022-10-24
   3 │        3  2000-01-01       0  2020-01-01  2022-10-24
   4 │        4  2000-12-31       1  2001-12-01  2001-12-01

julia> function splitep(bd, sd, ed)

           bdltsd=dayofyear(bd) < dayofyear(sd)
           bdeqsd=dayofyear(bd) == dayofyear(sd)

           bdgted=dayofyear(bd) > dayofyear(ed)

           y1l=Year(sd)+ Year(bdltsd)
           y2l=Year(ed)- Year(bdgted)

           edl=Date.(y1l:Year(1):y2l, Month(bd),max(Day(bd)-Day(1),Day(1)))

           y1r=Year(sd)+ Year(dayofyear(sd)>1)
           y2r=Year(ed)
           sdr=Date.(y1r:Year(1):y2r, Month(1),Day(1))
           edr=Date.(Year.(sdr))-Day(1)*(!bdeqsd)

           E=!bdeqsd ? sort([edl;edr;ed]) : [edr[2:end];ed]
           S=[sd;E[1:end-1]+Day(1)]
           [S,E]
       end
splitep (generic function with 1 method)

julia> cols=[:DateOfBirth, :StartDate,:EndDate]
3-element Vector{Symbol}:
 :DateOfBirth
 :StartDate
 :EndDate

julia> cdf=combine(df ,[:PersonId, :Sex, :DateOfBirth], cols=>ByRow(splitep)=>cols[2:end])
4×5 DataFrame
 Row │ PersonId  Sex    DateOfBirth  StartDate                          EndDate                      ⋯
     │ Int64     Int64  Date         Array…                             Array…                       ⋯
─────┼────────────────────────────────────────────────────────────────────────────────────────────────
   1 │        1      0  1989-07-15   [Date("2020-02-01"), Date("2020-…  [Date("2020-07-14"), Date("2 ⋯
   2 │        2      1  1965-02-03   [Date("2021-09-15"), Date("2022-…  [Date("2021-12-31"), Date("2  
   3 │        3      0  2000-01-01   [Date("2020-01-01"), Date("2021-…  [Date("2021-01-01"), Date("2  
   4 │        4      1  2000-12-31   [Date("2001-12-01")]               [Date("2001-12-01")]
                                                                                      1 column omitted

julia> flatten(cdf, cols[2:end])
13×5 DataFrame
 Row │ PersonId  Sex    DateOfBirth  StartDate   EndDate    
     │ Int64     Int64  Date         Date        Date
─────┼──────────────────────────────────────────────────────
   1 │        1      0  1989-07-15   2020-02-01  2020-07-14
   2 │        1      0  1989-07-15   2020-07-15  2020-12-31
   3 │        1      0  1989-07-15   2021-01-01  2021-07-14
   4 │        1      0  1989-07-15   2021-07-15  2021-12-31
   5 │        1      0  1989-07-15   2022-01-01  2022-07-14
   6 │        1      0  1989-07-15   2022-07-15  2022-10-13
   7 │        2      1  1965-02-03   2021-09-15  2021-12-31
   8 │        2      1  1965-02-03   2022-01-01  2022-02-02
   9 │        2      1  1965-02-03   2022-02-03  2022-10-24
  10 │        3      0  2000-01-01   2020-01-01  2021-01-01
  11 │        3      0  2000-01-01   2021-01-02  2022-01-01
  12 │        3      0  2000-01-01   2022-01-02  2022-10-24
  13 │        4      1  2000-12-31   2001-12-01  2001-12-01

kobusherbst · October 29, 2022, 10:10am

For reference this is the final splitting code that also provides for the special case when someone is born on a leap day and is more flexible allowing the specification of the columns containing the start and end dates and the birthday date. All other columns are passed as is to the output dataframe.

using DataFrames
using Dates

function splits(startdate, enddate, birthdate)
    if startdate == enddate
        return (sd=[startdate], ed=[enddate])
    end
    first_of_januarys = ceil(startdate, Year):Year(1):enddate
    birthdays = [Date(y, month(birthdate), !Dates.isleapyear(y) && month(birthdate) == 2 && day(birthdate) == 29 ? 28 : day(birthdate)) 
                for y in year(startdate):year(enddate) 
                if startdate < Date(y, month(birthdate), !Dates.isleapyear(y) && month(birthdate) == 2 && day(birthdate) == 29 ? 28 : day(birthdate)) < enddate]
    all_days = sort(unique([startdate; enddate; first_of_januarys; birthdays]))
    starts = all_days[1:end-1]
    ends = all_days[2:end]
    ends[1:end-1] .-= Day(1)
    return (sd=starts, ed=ends)
end
"""
    splitdf(df::AbstractDataFrame, dob::Symbol, startdate::Symbol, enddate::Symbol)::AbstractDataFrame

Returns a dataframe of the same structure as df, with the episodes represented by startdate and enddate split on birthdays and calendar year transitions.
If startdate == enddate, no split happens
Assumes no record has startdate > enddate
"""
function splitdf(df::AbstractDataFrame, dob::Symbol, startdate::Symbol, enddate::Symbol)::AbstractDataFrame
    cdf = combine(df, AsTable(Not([startdate, enddate])) => identity => AsTable, [startdate, enddate, dob] => ByRow(splits) => AsTable)
    return rename(flatten(cdf, [:sd, :ed]), (:sd => startdate), (:ed => enddate))
end

Topic		Replies	Views
Split DateTime column into separate Date and Time Columns General Usage dataframes	20	1543	April 30, 2022
Using DataFrames to split overlapping datetime intervals into non overlapping intervals General Usage question , dataframes , dataframesmeta	9	366	July 23, 2023
Create a GroupedDataFrame by the relations of rows rather than the values of the rows in a column, e.g `groupby` consecutive dates? New to Julia question , dataframes , grouped-data	14	707	March 29, 2023
Grouping consecutive sick days Data dates , dataframes	14	979	October 6, 2021
How to filter Date data column in a DataFrame General Usage dates , dataframes	3	226	February 20, 2024

Splitting time periods in a DataFrame

Related topics