Fuzzy inexact merge

The SortMerge.jl package is designed exactly for this purpose:

using DataFrames, CSV, SortMerge, Dates

url="""https://gist.githubusercontent.com/kafisatz/ed1f42fb4b5fd1ce1cf3a8a12b8e80b6/raw/70df7040c550bbdedd168c90a78caf4ee2808881/hadata.csv"""
fi = download(url)
df0 = CSV.File(fi) |> DataFrame

df1 = filter(x->x.entity_id == "kitchen_temperature",df0)
df2 = filter(x->x.entity_id == "buero_temperature",df0)

j = sortmerge(df1.datetime, df2.datetime, Minute(5),
              sd=(v1, v2, i1, i2, threshold) -> begin
              diff = v1[i1] - v2[i2]
              (abs(diff) > threshold)  &&  (return sign(diff))
              return 0
              end)

# Rows common to both dataframes (i.e., inner join)
tmp = hcat(df1[j[1], :], df2[j[2], :], makeunique=true)

# Print maximum clock difference between matched rows
println(Minute(maximum(abs.(df1[j[1], :datetime] .- df2[j[2], :datetime]))))

# Rows from first dataframe without matching rows in the second one:
tmp = df1[countmatch(j, 1) .== 0, :]

# Rows from second dataframe without matching rows in the first one:
tmp = df2[countmatch(j, 2) .== 0, :]

2 Likes