I attempted to implement your strategy, but, I’m not sure where I’m going wrong. I do see that in yours you’re not using DataFrames, would that be the reason my doesn’t work? Sorry if I’m missing something simple here. I’m not great at this. lol
Here is what I attempted to do:
const WIN_HALF = 182
# Function to process a single file
function process_file(file, output_path, all_dates_df)
stat = (basename(file))[1:4]
df = CSV.read(file, DataFrame)
tbl = [df.decyr df.value]
xs = df_all_dates
dropmissing!(df)
J = leftjoin(
(;xs, tbl),
by_pred(x -> x ± WIN_HALF, in, r -> r.tbl[1]),
groupby = :xs,
mode = FlexiJoins.Mode.NestedLoop()
)
Jagg = map(J) do j
(j.xs, isempty(j.tbl) ? missing : mean(r -> r.tbl[2], j.tbl))
end
df_merged = leftjoin(all_dates_df, DataFrames.DataFrame(Jagg), on = :decyr)
sort!(df_merged, [:decyr])
CSV.write(joinpath(output_path, "$(stat)_medave.csv"), df_merged)
end
Which throws me this error:
ERROR: MethodError: no method matching keytype(::DataFrame)
The function `keytype` exists, but no method is defined for this combination of argument types.
Closest candidates are:
keytype(::Type{Union{}}, Any...)
@ Base abstractarray.jl:189
keytype(::Type{DataStructures.SortedMultiDict{K, D, Ord}}) where {K, D, Ord<:Base.Order.Ordering}
@ DataStructures ~/.julia/packages/DataStructures/95DJa/src/sorted_multi_dict.jl:292
keytype(::Type{DataStructures.SortedSet{K, Ord}}) where {K, Ord<:Base.Order.Ordering}
@ DataStructures ~/.julia/packages/DataStructures/95DJa/src/sorted_set.jl:184
...
Stacktrace:
[1] (::FlexiJoins.var"#108#109"{1})(i::Int64, X::DataFrame, nms::FlexiJoins.Drop)
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/ix_compute.jl:65
[2] map
@ ./tuple.jl:406 [inlined]
[3] create_ix_array(datas::Tuple{…}, nonmatches::Tuple{…}, _groupby::Val{…})
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/ix_compute.jl:64
[4] _joinindices(datas::Tuple{…}, cond::FlexiJoins.ByPred{…}, multi::Tuple{…}, nonmatches::Tuple{…}, groupby::Val{…}, cardinality::Tuple{…}, mode::FlexiJoins.Mode.NestedLoop, cache::Nothing, loop_over_side::Val{…})
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/joins.jl:83
[5] _joinindices(datas::Tuple{…}, cond::FlexiJoins.ByPred{…}, multi::Tuple{…}, nonmatches::Tuple{…}, groupby::Val{…}, cardinality::Tuple{…}, mode::FlexiJoins.Mode.NestedLoop, cache::Nothing, loop_over_side::Nothing)
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/joins.jl:60
[6] _joinindices(datas::@NamedTuple{…}, cond::FlexiJoins.ByPred{…}; multi::Nothing, nonmatches::Tuple{…}, groupby::Symbol, cardinality::Nothing, mode::FlexiJoins.Mode.NestedLoop, cache::Nothing, loop_over_side::Nothing)
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/joins.jl:45
[7] joinindices(datas::@NamedTuple{…}, cond::FlexiJoins.ByPred{…}; kwargs::@Kwargs{…})
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/joins.jl:35
[8] _flexijoin(datas::@NamedTuple{…}, cond::FlexiJoins.ByPred{…}; kwargs::@Kwargs{…})
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/joins.jl:30
[9] flexijoin(datas::@NamedTuple{…}, args::FlexiJoins.ByPred{…}; kwargs::@Kwargs{…})
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/joins.jl:27
[10] leftjoin(datas::@NamedTuple{…}, args::FlexiJoins.ByPred{…}; kwargs::@Kwargs{…})
@ FlexiJoins ~/.julia/packages/FlexiJoins/OB7he/src/joins.jl:13
[11] process_file(file::String, output_path::String, all_dates_df::DataFrame)
@ Main ~/Documents/julia2/julia_scripts/average/centered_average.jl:32
[12] top-level scope
@ ~/Documents/julia2/julia_scripts/average/centered_average.jl:51
Some type information was truncated. Use `show(err)` to see complete types.
Which, based on the the error output, my error is occurring here, somewhere:
J = leftjoin(
(;xs, tbl),
by_pred(x -> x ± WIN_HALF, in, r -> r.tbl[1]),
groupby = :xs,
mode = FlexiJoins.Mode.NestedLoop()
)
Maybe this poorly drawn visualization I made will help illustrate what, in my mind, I was trying to make happen. lol
In my head, I have a static window that simply rolls along my data and takes the average, and if it encounters a spot that is missing
or NaN
it will just populate that space with data from whatever data is present in my window. Oh, and the _
at the beginning of my drawing should have zeros in them, I just didn’t think about it when I was quickly drawing it.
Please correct me if I’m wrong, but is your implementation essentially “chopping” up the dataset into chunks that fit the parameter?