I have two sets of data. One with names, one with names and expenses. I want to calculate the average expenses for each name.
df_one = DataFrame(names=["John", "Sally", "Kirk", "Sally"])
df_two = DataFrame(names=["Sally", "Sally", "Kirk", "John", "Kirk", "Kirk", "John", "Sally", "Sally"],
expenses=[55.78, 10.98, 35.56, 63.21, 14.95, 32.73, 61.72, 38.27, 19.45])
I can do this in two steps:
tmp_expenses = @from i in df_one begin
@join j in df_two on i.names equals j.names into g
@select {
i.names,
total_exp = sum(g..expenses),
num_exp = length(g)
}
@collect DataFrame
end
avg_expenses = @from i in tmp_expenses begin
@select {
i.names,
i.total_exp,
i.num_exp,
avg_expenses = i.total_exp / i.num_exp
}
@collect DataFrame
end
However, I can’t figure out how to do it in a single query. I’ve tried:
avg_expenses = tmp_expenses = @from i in df_one begin
@join j in df_two on i.names equals j.names into g
@let num_exp = length(g)
@let total_exp = sum(g..expenses)
@let avg_expenses = total_exp / num_exp
@select {
i.names,
total_exp,
num_exp,
avg_expenses
}
@collect DataFrame
end
But, it doesn’t seem to like the g..
syntax in the @let. Errors with:
syntax: "##t#765.##t#764.i" is not a valid function argument name
include_string(::String, ::String) at loading.jl:515
include_string(::String, ::String, ::Int64) at eval.jl:30
include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N} where N) at eval.jl:34
(::Atom.##49#53{String,Int64,String})() at eval.jl:50
withpath(::Atom.##49#53{String,Int64,String}, ::String) at utils.jl:30
withpath(::Function, ::String) at eval.jl:38
macro expansion at eval.jl:49 [inlined]
(::Atom.##48#52{Dict{String,Any}})() at task.jl:80
Have also tried:
tmp_expenses = @from i in df_one begin
@join j in df_two on i.names equals j.names into g
@select {
i.names,
total_exp = sum(g..expenses),
num_exp = length(g),
avg_exp = total_exp / num_exp
}
@collect DataFrame
end
Which errors with:
type UnionAll has no field parameters
column_types at utilities.jl:20 [inlined]
_DataFrame(::QueryOperators.EnumerableGroupJoin{NamedTuples._NT_names_total__exp_num__exp_avg__exp{DataValues.DataValue{String},DataValues.DataValue{Float64},Int64,_} where _,DataValues.DataValue{String},NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},QueryOperators.EnumerableIterable{NamedTuples._NT_names{DataValues.DataValue{String}},IterableTables.DataFrameIterator{NamedTuples._NT_names{DataValues.DataValue{String}},Tuple{DataArrays.DataArray{String,1}}}},QueryOperators.EnumerableIterable{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},IterableTables.DataFrameIterator{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},Tuple{DataArrays.DataArray{String,1},DataArrays.DataArray{Float64,1}}}},##671#675,##672#676,##673#677}) at dataframes-dataarray.jl:105
DataFrames.DataFrame(::QueryOperators.EnumerableGroupJoin{NamedTuples._NT_names_total__exp_num__exp_avg__exp{DataValues.DataValue{String},DataValues.DataValue{Float64},Int64,_} where _,DataValues.DataValue{String},NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},QueryOperators.EnumerableIterable{NamedTuples._NT_names{DataValues.DataValue{String}},IterableTables.DataFrameIterator{NamedTuples._NT_names{DataValues.DataValue{String}},Tuple{DataArrays.DataArray{String,1}}}},QueryOperators.EnumerableIterable{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},IterableTables.DataFrameIterator{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},Tuple{DataArrays.DataArray{String,1},DataArrays.DataArray{Float64,1}}}},##671#675,##672#676,##673#677}) at dataframes-dataarray.jl:127
collect(::QueryOperators.EnumerableGroupJoin{NamedTuples._NT_names_total__exp_num__exp_avg__exp{DataValues.DataValue{String},DataValues.DataValue{Float64},Int64,_} where _,DataValues.DataValue{String},NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},QueryOperators.EnumerableIterable{NamedTuples._NT_names{DataValues.DataValue{String}},IterableTables.DataFrameIterator{NamedTuples._NT_names{DataValues.DataValue{String}},Tuple{DataArrays.DataArray{String,1}}}},QueryOperators.EnumerableIterable{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},IterableTables.DataFrameIterator{NamedTuples._NT_names_expenses{DataValues.DataValue{String},DataValues.DataValue{Float64}},Tuple{DataArrays.DataArray{String,1},DataArrays.DataArray{Float64,1}}}},##671#675,##672#676,##673#677}, ::Type{DataFrames.DataFrame}) at sink_type.jl:2
include_string(::String, ::String) at loading.jl:515
include_string(::String, ::String, ::Int64) at eval.jl:30
include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N} where N) at eval.jl:34
(::Atom.##49#53{String,Int64,String})() at eval.jl:50
withpath(::Atom.##49#53{String,Int64,String}, ::String) at utils.jl:30
withpath(::Function, ::String) at eval.jl:38
macro expansion at eval.jl:49 [inlined]
(::Atom.##48#52{Dict{String,Any}})() at task.jl:80
Obviously total_exp
and num_exp
are only defined as column names, not variables.
Any way to perform this in a single query?