Splitting time periods in a DataFrame

I have a DataFrame with episodes of exposure of the form:

PersonId::Int64
DateOfBirth::Date
Sex::Int64
StartDate::Date
EndDate::Date
  1. I would like to split each record on January 1, of each calendar year between :StartDate and :EndDate, e.g. where year start is >:StartDate && <= :EndDate. An episode can span several years.
  2. Then I would like to split each record on each annual birthday of the person, if the birthday is >:StartDate && <= :EndDate
  3. When an episode is split, the end date of the left-hand portion will be split date - 1 day, and the start date of the right-hand portion will be equal to the split date. Note: This could result in a right-hand portion where start date equals end date, if the split date equals the end date - this is fine.
  4. :PersonId, :DateOfBirth, :Sex should be retained for all resulting records.
  5. For those familiar with Stata, this is equivalent to the stsplit command.

I can think of a brute force method of iterating across all records, testing the year span of each episode and appending split records to an empty DataFrame, but I guess there might be a more elegant way of doing this using the amazing features of DataFrames.

Any help will be appreciated, thank you.

1 Like

Regardless of the function to split (this is just a rather quick and dirty try) one way is to group on all columns but those to change and return vectors in the combine call, they will be auto-concatenated. There are probably better ways to do it, though :slight_smile:

using DataFrames, DataFrameMacros, Dates

df = DataFrame(
    PersonId = [1, 2],
    DateOfBirth = [Date(1989, 7, 15), Date(1965, 2, 3)],
    Sex = [0, 1],
    StartDate = [Date(2020, 02, 01), Date(2021, 09, 15)],
    EndDate = [Date(2022, 10, 13), Date(2022, 10, 24)]
)

function splits(startdate, enddate, birthdate)
    first_of_januarys = ceil(startdate, Year):Year(1):enddate
    birthdays = [Date(y, month(birthdate), day(birthdate)) for y in year(startdate):year(enddate)
        if startdate < Date(y, month(birthdate), day(birthdate)) < enddate
    ]
    all_days = sort(unique([startdate; enddate; first_of_januarys; birthdays]))
    starts = all_days[1:end-1]
    starts[2:end] .+= Day(1)
    ends = all_days[2:end]
    starts, ends, 1:length(starts)
end
@combine(
    groupby(df, Not([:StartDate, :EndDate])),
    @astable :StartDate, :EndDate, :Section = splits(only(:StartDate), only(:EndDate), only(:DateOfBirth))
)
9Γ—6 DataFrame
 Row β”‚ PersonId  DateOfBirth  Sex    StartDate   EndDate     Section 
     β”‚ Int64     Date         Int64  Date        Date        Int64   
─────┼───────────────────────────────────────────────────────────────
   1 β”‚        1  1989-07-15       0  2020-02-01  2020-07-15        1
   2 β”‚        1  1989-07-15       0  2020-07-16  2021-01-01        2
   3 β”‚        1  1989-07-15       0  2021-01-02  2021-07-15        3
   4 β”‚        1  1989-07-15       0  2021-07-16  2022-01-01        4
   5 β”‚        1  1989-07-15       0  2022-01-02  2022-07-15        5
   6 β”‚        1  1989-07-15       0  2022-07-16  2022-10-13        6
   7 β”‚        2  1965-02-03       1  2021-09-15  2022-01-01        1
   8 β”‚        2  1965-02-03       1  2022-01-02  2022-02-03        2
   9 β”‚        2  1965-02-03       1  2022-02-04  2022-10-24        3
function splitep(bd, sd, ed)

    y1l=Year(sd)+ Year(dayofyear(bd) < dayofyear(sd))
    y2l=Year(ed)- Year(dayofyear(bd) > dayofyear(ed))
    
    edl=Date.(y1l:Year(1):y2l, Month(bd),Day(bd)-Day(1))
    
    y1r=Year(sd)+ Year(dayofyear(sd)>1)
    y2r=Year(ed)
    sdr=Date.(y1r:Year(1):y2r, Month(1),Day(1))
    edr=Date.(Year.(sdr))-Day(1)
    
    E=sort([edl;edr;ed])
    S=[sd;E[1:end-1]+Day(1)]

    [S,E]    
end

cols=[:DateOfBirth, :StartDate,:EndDate]


cdf=combine(df ,[:PersonId, :Sex, :DateOfBirth], cols=>ByRow(splitep)=>cols[2:end])

flatten(cdf, cols[2:end])


julia> flatten(cdf, cols[2:end])
9Γ—5 DataFrame
 Row β”‚ PersonId  Sex    DateOfBirth  StartDate   EndDate    
     β”‚ Int64     Int64  Date         Date        Date
─────┼──────────────────────────────────────────────────────
   1 β”‚        1      0  1989-07-15   2020-02-01  2020-07-14
   2 β”‚        1      0  1989-07-15   2020-07-15  2020-12-31
   3 β”‚        1      0  1989-07-15   2021-01-01  2021-07-14
   4 β”‚        1      0  1989-07-15   2021-07-15  2021-12-31
   5 β”‚        1      0  1989-07-15   2022-01-01  2022-07-14
   6 β”‚        1      0  1989-07-15   2022-07-15  2022-10-13
   7 β”‚        2      1  1965-02-03   2021-09-15  2021-12-31
   8 β”‚        2      1  1965-02-03   2022-01-01  2022-02-02
   9 β”‚        2      1  1965-02-03   2022-02-03  2022-10-24

Thank you for the code, unfortunately, the split should happen on the split date, so that, for example, the split on year start should Jan 1 as the start date not Jan 2.

Thank you I like this solution, but unfortunately it throws an error:

ERROR: ArgumentError: Day: 0 out of range (1:31)

I could also not test the code in the edge case if the BirthDay falls on Jan 1, and the start date of the period equals the birth date.

I used this test DataFrame:

df = DataFrame(
    PersonId=[1, 2, 3],
    DateOfBirth=[Date(1989, 7, 15), Date(1965, 2, 3), Date(2020,1,1)],
    Sex=[0, 1, 0],
    StartDate=[Date(2020, 02, 01), Date(2021, 09, 15), Date(2020,1,1)],
    EndDate=[Date(2022, 10, 13), Date(2022, 10, 24), Date(2022, 10, 24)]
)

Then you just need to change the addition of 1 to each start date except the first to a subtraction of 1 day from each end date except the last

Thank you. For the record this is the modified function:

function splits(startdate, enddate, birthdate)
    if startdate == enddate
        return [startdate], [enddate], [1]
    end
    first_of_januarys = ceil(startdate, Year):Year(1):enddate
    birthdays = [Date(y, month(birthdate), day(birthdate)) for y in year(startdate):year(enddate)
        if startdate < Date(y, month(birthdate), day(birthdate)) < enddate
    ]
    all_days = sort(unique([startdate; enddate; first_of_januarys; birthdays]))
    starts = all_days[1:end-1]
    ends = all_days[2:end]
    ends[1:end-1] .-= Day(1)
    starts, ends, 1:length(starts)
end

It also handles edge cases perfectly, but I had to add the if statement at the start otherwise records with startdate == enddate are dropped. Thank you.

ok. What should the output be in this case?

For PersonId = 3

PersonId Sex DateofBirth StartDate EndDate
3 0 2020-01-01 2020-01-01 2020-12-31
3 0 2020-01-01 2021-01-01 2021-12-31
3 0 2020-01-01 2022-01-01 2022-10-24

For this test data:

df = DataFrame(
    PersonId=[1, 2, 3, 4],
    DateOfBirth=[Date(1989, 7, 15), Date(1965, 2, 3), Date(2000, 1, 1), Date(2000, 12, 31)],
    Sex=[0, 1, 0, 1],
    StartDate=[Date(2020, 02, 01), Date(2021, 09, 15), Date(2020, 1, 1), Date(2001, 12, 1)],
    EndDate=[Date(2022, 10, 13), Date(2022, 10, 24), Date(2022, 10, 24), Date(2001, 12, 1)]
)

Person 4 has an episode that start and end of the same day, that episode should be passed as is to the output.

Try this one, albeit a little convoluted. But the specifications are not entirely β€œlinear”

 4Γ—5 DataFrame
 Row β”‚ PersonId  DateOfBirth  Sex    StartDate   EndDate    
     β”‚ Int64     Date         Int64  Date        Date
─────┼──────────────────────────────────────────────────────
   1 β”‚        1  1989-07-15       0  2020-02-01  2022-10-13
   2 β”‚        2  1965-02-03       1  2021-09-15  2022-10-24
   3 β”‚        3  2000-01-01       0  2020-01-01  2022-10-24
   4 β”‚        4  2000-12-31       1  2001-12-01  2001-12-01

julia> function splitep(bd, sd, ed)

           bdltsd=dayofyear(bd) < dayofyear(sd)
           bdeqsd=dayofyear(bd) == dayofyear(sd)

           bdgted=dayofyear(bd) > dayofyear(ed)

           y1l=Year(sd)+ Year(bdltsd)
           y2l=Year(ed)- Year(bdgted)

           edl=Date.(y1l:Year(1):y2l, Month(bd),max(Day(bd)-Day(1),Day(1)))

           y1r=Year(sd)+ Year(dayofyear(sd)>1)
           y2r=Year(ed)
           sdr=Date.(y1r:Year(1):y2r, Month(1),Day(1))
           edr=Date.(Year.(sdr))-Day(1)*(!bdeqsd)

           E=!bdeqsd ? sort([edl;edr;ed]) : [edr[2:end];ed]
           S=[sd;E[1:end-1]+Day(1)]
           [S,E]
       end
splitep (generic function with 1 method)

julia> cols=[:DateOfBirth, :StartDate,:EndDate]
3-element Vector{Symbol}:
 :DateOfBirth
 :StartDate
 :EndDate

julia> cdf=combine(df ,[:PersonId, :Sex, :DateOfBirth], cols=>ByRow(splitep)=>cols[2:end])
4Γ—5 DataFrame
 Row β”‚ PersonId  Sex    DateOfBirth  StartDate                          EndDate                      β‹―
     β”‚ Int64     Int64  Date         Array…                             Array…                       β‹―
─────┼────────────────────────────────────────────────────────────────────────────────────────────────
   1 β”‚        1      0  1989-07-15   [Date("2020-02-01"), Date("2020-…  [Date("2020-07-14"), Date("2 β‹―
   2 β”‚        2      1  1965-02-03   [Date("2021-09-15"), Date("2022-…  [Date("2021-12-31"), Date("2  
   3 β”‚        3      0  2000-01-01   [Date("2020-01-01"), Date("2021-…  [Date("2021-01-01"), Date("2  
   4 β”‚        4      1  2000-12-31   [Date("2001-12-01")]               [Date("2001-12-01")]
                                                                                      1 column omitted

julia> flatten(cdf, cols[2:end])
13Γ—5 DataFrame
 Row β”‚ PersonId  Sex    DateOfBirth  StartDate   EndDate    
     β”‚ Int64     Int64  Date         Date        Date
─────┼──────────────────────────────────────────────────────
   1 β”‚        1      0  1989-07-15   2020-02-01  2020-07-14
   2 β”‚        1      0  1989-07-15   2020-07-15  2020-12-31
   3 β”‚        1      0  1989-07-15   2021-01-01  2021-07-14
   4 β”‚        1      0  1989-07-15   2021-07-15  2021-12-31
   5 β”‚        1      0  1989-07-15   2022-01-01  2022-07-14
   6 β”‚        1      0  1989-07-15   2022-07-15  2022-10-13
   7 β”‚        2      1  1965-02-03   2021-09-15  2021-12-31
   8 β”‚        2      1  1965-02-03   2022-01-01  2022-02-02
   9 β”‚        2      1  1965-02-03   2022-02-03  2022-10-24
  10 β”‚        3      0  2000-01-01   2020-01-01  2021-01-01
  11 β”‚        3      0  2000-01-01   2021-01-02  2022-01-01
  12 β”‚        3      0  2000-01-01   2022-01-02  2022-10-24
  13 β”‚        4      1  2000-12-31   2001-12-01  2001-12-01
1 Like

For reference this is the final splitting code that also provides for the special case when someone is born on a leap day and is more flexible allowing the specification of the columns containing the start and end dates and the birthday date. All other columns are passed as is to the output dataframe.

using DataFrames
using Dates

function splits(startdate, enddate, birthdate)
    if startdate == enddate
        return (sd=[startdate], ed=[enddate])
    end
    first_of_januarys = ceil(startdate, Year):Year(1):enddate
    birthdays = [Date(y, month(birthdate), !Dates.isleapyear(y) && month(birthdate) == 2 && day(birthdate) == 29 ? 28 : day(birthdate)) 
                for y in year(startdate):year(enddate) 
                if startdate < Date(y, month(birthdate), !Dates.isleapyear(y) && month(birthdate) == 2 && day(birthdate) == 29 ? 28 : day(birthdate)) < enddate]
    all_days = sort(unique([startdate; enddate; first_of_januarys; birthdays]))
    starts = all_days[1:end-1]
    ends = all_days[2:end]
    ends[1:end-1] .-= Day(1)
    return (sd=starts, ed=ends)
end
"""
    splitdf(df::AbstractDataFrame, dob::Symbol, startdate::Symbol, enddate::Symbol)::AbstractDataFrame

Returns a dataframe of the same structure as df, with the episodes represented by startdate and enddate split on birthdays and calendar year transitions.
If startdate == enddate, no split happens
Assumes no record has startdate > enddate
"""
function splitdf(df::AbstractDataFrame, dob::Symbol, startdate::Symbol, enddate::Symbol)::AbstractDataFrame
    cdf = combine(df, AsTable(Not([startdate, enddate])) => identity => AsTable, [startdate, enddate, dob] => ByRow(splits) => AsTable)
    return rename(flatten(cdf, [:sd, :ed]), (:sd => startdate), (:ed => enddate))
end