I have tried several ways of producing the desired result. Performance is different.
I would like someone (@aplavin ?) to explain to me where / how / why flexijoinsβ leftjoin does better than the jf() function.
julia> using Tables, FlexiJoins, IntervalSets
julia> v11=rand(1:100, 10^3);
julia> v12=rand(1:2, 10^3);
julia> v21=rand(1:100, 10^4);
julia> v22=rand(101:200, 10^4);
julia> v23=rand(2:3,10^4);
julia> tbl1=rowtable((id=v11,b=v12));
julia> tbl2=rowtable((id1=v21,id2=v22,c=v23));
julia> using DataFrames,DataFramesMeta
julia> df1=DataFrame(id=v11,b=v12);
julia> df2=DataFrame(id1=v21,id2=v22,c=v23);
######## benchmarks##############
julia> using BenchmarkTools
julia> @btime @rsubset(crossjoin(df1, df2), :id1 < :id < :id2);
218.115 ms (590 allocations: 622.76 MiB)
julia> using StatsBase
julia> @btime begin
j=pairwise((x,y)->ifelse( y.id1<x.id<y.id2, merge(x,y), nothing), tbl1,tbl2)
[r for r in j if !isnothing(r)] |> rowtable
end
670.530 ms (37 allocations: 949.31 MiB)
julia> @btime [(x,y) for (x,y) in Iterators.product(tbl1,tbl2) if y.id1<=x.id<=y.id2];
218.760 ms (22 allocations: 366.20 MiB)
julia> @btime [merge(x,y) for (x,y) in Iterators.product(tbl1,tbl2) if y.id1<=x.id<=y.id2] |> DataFrame;
285.113 ms (56 allocations: 562.11 MiB)
julia> function jf(tbl1,tbl2)
df=NamedTuple[]
for x in tbl1
X=x.id
for y in tbl2
if y.id1<=X<=y.id2
push!(df,merge(x,y))
end
end
end
return df
end
jf (generic function with 1 method)
julia> @btime jf(tbl1,tbl2);
146.250 ms (5135583 allocations: 308.80 MiB)
julia> @btime lj=leftjoin((tbl1, tbl2), by_pred(:id, β, r -> r.id1..r.id2))
66.516 ms (245064 allocations: 166.71 MiB)
if I walk the same steps without PUSH the time becomes very short.
So all the time seems to be due to push operations β¦
julia> function jft(tbl1,tbl2)
df=Tuple[]
for x in tbl1
X=x.id
for y in tbl2
if y.id1<=X<=y.id2
#push!(df,(x,y))
end
end
end
return df
end
jft (generic function with 1 method)
julia> @btime jft(tbl1,tbl2);
28.141 ns (1 allocation: 48 bytes)
another way that seems more efficient would be to obtain the indexes of the rows of tbl2 corresponding to each row of tbl1, without having to allocate the complete rows.
julia> function jft(tbl1,tbl2)
v=Vector{Vector{Int64}}()
for (i,x) in enumerate(tbl1)
X=x.id
push!(v,[])
for (j,y) in enumerate(tbl2)
if X β y.id1..y.id2
push!(v[i],j)
end
end
end
return view.([tbl2],v)
end
jft (generic function with 1 method)
julia> @btime jft(tbl1,tbl2);
65.533 ms (8419 allocations: 108.75 MiB)