Query.jl: Problem with multiple calculations in single query

query

#1

I have two sets of data. One with names, one with names and expenses. I want to calculate the average expenses for each name.

df_one = DataFrame(names=["John", "Sally", "Kirk", "Sally"])
df_two = DataFrame(names=["Sally", "Sally", "Kirk", "John", "Kirk", "Kirk", "John", "Sally", "Sally"],
expenses=[55.78, 10.98, 35.56, 63.21, 14.95, 32.73, 61.72, 38.27, 19.45])

I can do this in two steps:

tmp_expenses = @from i in df_one begin
        @join j in df_two on i.names equals j.names into g
        @select {
        i.names,
        total_exp = sum(g..expenses),
        num_exp = length(g)
        }
        @collect DataFrame
end

avg_expenses = @from i in tmp_expenses begin
        @select {
        i.names,
        i.total_exp,
        i.num_exp,
        avg_expenses = i.total_exp / i.num_exp
        }
        @collect DataFrame
end

However, I can’t figure out how to do it in a single query. I’ve tried:

avg_expenses = tmp_expenses = @from i in df_one begin
        @join j in df_two on i.names equals j.names into g
        @let num_exp = length(g)
        @let total_exp = sum(g..expenses)
        @let avg_expenses = total_exp / num_exp
        @select {
        i.names,
        total_exp,
        num_exp,
        avg_expenses
        }
        @collect DataFrame
end

But, it doesn’t seem to like the g.. syntax in the @let. Errors with:

syntax: "##t#765.##t#764.i" is not a valid function argument name
include_string(::String, ::String) at loading.jl:515
include_string(::String, ::String, ::Int64) at eval.jl:30
include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N} where N) at eval.jl:34
(::Atom.##49#53{String,Int64,String})() at eval.jl:50
withpath(::Atom.##49#53{String,Int64,String}, ::String) at utils.jl:30
withpath(::Function, ::String) at eval.jl:38
macro expansion at eval.jl:49 [inlined]
(::Atom.##48#52{Dict{String,Any}})() at task.jl:80

Have also tried:

tmp_expenses = @from i in df_one begin
        @join j in df_two on i.names equals j.names into g
        @select {
        i.names,
        total_exp = sum(g..expenses),
        num_exp = length(g),
        avg_exp = total_exp / num_exp
        }
        @collect DataFrame
end

Which errors with:

type UnionAll has no field parameters
column_types at utilities.jl:20 [inlined]
_DataFrame(::QueryOperators.EnumerableGroupJoin{NamedTuples._NT_names_total__exp_num__exp_avg__exp{DataValues.DataValue{String},DataValues.DataValue{Float64},Int64,_} where _,DataValues.DataValue{String},NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},QueryOperators.EnumerableIterable{NamedTuples._NT_names{DataValues.DataValue{String}},IterableTables.DataFrameIterator{NamedTuples._NT_names{DataValues.DataValue{String}},Tuple{DataArrays.DataArray{String,1}}}},QueryOperators.EnumerableIterable{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},IterableTables.DataFrameIterator{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},Tuple{DataArrays.DataArray{String,1},DataArrays.DataArray{Float64,1}}}},##671#675,##672#676,##673#677}) at dataframes-dataarray.jl:105
DataFrames.DataFrame(::QueryOperators.EnumerableGroupJoin{NamedTuples._NT_names_total__exp_num__exp_avg__exp{DataValues.DataValue{String},DataValues.DataValue{Float64},Int64,_} where _,DataValues.DataValue{String},NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},QueryOperators.EnumerableIterable{NamedTuples._NT_names{DataValues.DataValue{String}},IterableTables.DataFrameIterator{NamedTuples._NT_names{DataValues.DataValue{String}},Tuple{DataArrays.DataArray{String,1}}}},QueryOperators.EnumerableIterable{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},IterableTables.DataFrameIterator{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},Tuple{DataArrays.DataArray{String,1},DataArrays.DataArray{Float64,1}}}},##671#675,##672#676,##673#677}) at dataframes-dataarray.jl:127
collect(::QueryOperators.EnumerableGroupJoin{NamedTuples._NT_names_total__exp_num__exp_avg__exp{DataValues.DataValue{String},DataValues.DataValue{Float64},Int64,_} where _,DataValues.DataValue{String},NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},QueryOperators.EnumerableIterable{NamedTuples._NT_names{DataValues.DataValue{String}},IterableTables.DataFrameIterator{NamedTuples._NT_names{DataValues.DataValue{String}},Tuple{DataArrays.DataArray{String,1}}}},QueryOperators.EnumerableIterable{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},IterableTables.DataFrameIterator{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},Tuple{DataArrays.DataArray{String,1},DataArrays.DataArray{Float64,1}}}},##671#675,##672#676,##673#677}, ::Type{DataFrames.DataFrame}) at sink_type.jl:2
include_string(::String, ::String) at loading.jl:515
include_string(::String, ::String, ::Int64) at eval.jl:30
include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N} where N) at eval.jl:34
(::Atom.##49#53{String,Int64,String})() at eval.jl:50
withpath(::Atom.##49#53{String,Int64,String}, ::String) at utils.jl:30
withpath(::Function, ::String) at eval.jl:38
macro expansion at eval.jl:49 [inlined]
(::Atom.##48#52{Dict{String,Any}})() at task.jl:80

Obviously total_exp and num_exp are only defined as column names, not variables.

Any way to perform this in a single query?


#2

I figured out how to make life easier for myself and just use mean

tmp_expenses = @from i in df_one begin
        @join j in df_two on i.names equals j.names into g
        @select {
        i.names,
        avg_exp = mean(g..expenses)
        }
        @collect DataFrame
end

Obvious really.

Nonetheless, maybe it’s worth allowing g.. syntax in @let for more complex calculations.


#3

Just curious why you need to join here; couldn’t you just group on the names in df_two?


#4

The example data I used was a bit contrived. names in df_one should only be a subset of the names in df_two. i.e. there should be more names in df_two, but I only wish to use those with a match in df_one.