I often have to write functions that transform arrays of structures (sorted by one of the fields) into other arrays of structures. I also often have to convert one array of structures into another just to fit a function's argument type, so I want to minimize this boilerplate code.
In a previous version I passed each column as a separate argument, but then there was no guarantee that the columns had the same length. Here is the template for such a function that I eventually came up with.
using StructArrays
"""
    find_intervals(inp; target_syms = [:A, :B], break_syms = [:C, :D])

Find runs ("series") of target symbols in a vector of `(time, sym)` rows.

A series opens at the first row whose `sym` is in `target_syms` and is closed by the
next row whose `sym` is in `break_syms`. A series that is still open when the input
ends is closed there. Rows whose symbol is in neither collection do not break a series
and are not counted. A symbol that appears in both collections is treated as a target.

# Arguments
- `inp`: any `AbstractVector` whose elements are named tuples with the fields
  `time::Int` and `sym::Symbol`. The rows are expected to be sorted by `time`;
  this is not checked.

# Keywords
- `target_syms`: collection of symbols that open and extend a series. Anything that
  supports `in` works (vector, tuple, `Set`, ...).
- `break_syms`: collection of symbols that close an open series; same requirements
  as `target_syms`.

# Returns
A `Vector` of named tuples `(tbeg, tend, count, type)`, one per series:
- `tbeg`: time of the first target row of the series;
- `tend`: time of the row immediately preceding the closing break row (or of the last
  input row if the series ran to the end), whatever that row's symbol is;
- `count`: number of target rows in the series;
- `type`: `:short` when `count < 2`, otherwise `:long`.
"""
function find_intervals(
    inp::AbstractVector{@NamedTuple{time::Int, sym::Symbol}};
    target_syms = [:A, :B],
    break_syms = [:C, :D],
)
    out = Vector{@NamedTuple{tbeg::Int, tend::Int, count::Int, type::Symbol}}()
    classify(n) = n < 2 ? :short : :long

    is_series = false
    # Placeholders only: both are overwritten before they can reach `out`, because a
    # series is emitted only after at least one target row has been processed.
    tbeg = 1
    tend = 1
    count = 0
    for x in inp
        if x.sym in target_syms
            if !is_series
                tbeg = x.time
                is_series = true
                count = 0
            end
            count += 1
        elseif is_series && x.sym in break_syms
            # `tend` still holds the time of the previous row here.
            push!(out, (; tbeg, tend, count, type = classify(count)))
            is_series = false
        end
        # Updated for every row, including rows that are neither target nor break.
        tend = x.time
    end
    # Close a series that was still open when the input ran out.
    if is_series
        push!(out, (; tbeg, tend, count, type = classify(count)))
    end
    return out
end
# Sample data: eight time stamps with a repeating A, B, C, D symbol pattern.
times = [10, 20, 30, 40, 50, 60, 70, 80]
syms = [:A, :B, :C, :D, :A, :B, :C, :D]

# Problem 1: the data may be column-oriented or row-oriented; both layouts should work.
cols = (time = times, sym = syms)
rows = [(time = t, sym = s) for (t, s) in zip(times, syms)]

# rows: can be passed directly.
out = find_intervals(rows; target_syms = [:A, :C], break_syms = [:B])

# cols: wrap the columns in a StructVector so they are seen as a vector of rows.
sv = StructVector(cols)
out = find_intervals(sv; target_syms = [:A, :C], break_syms = [:B])

# Problem 2: the column names and the number of columns do not match the signature.
cols_ = (T = times, T2 = 2 .* times, S = syms)
rows_ = [(T = t, T2 = 2t, S = s) for (t, s) in zip(times, syms)]

# rows: the selected fields have to be copied into a new vector of rows with the
# expected field names (downside: this allocates a copy).
rows_renamed = map(r -> (time = r.T, sym = r.S), rows_)
out = find_intervals(rows_renamed; target_syms = [:A, :C], break_syms = [:B])

# cols: select and rename the columns, then wrap them in another StructVector
# (upside: no copy).
sv_renamed = StructVector((time = cols_.T, sym = cols_.S))
out = find_intervals(sv_renamed; target_syms = [:A, :C], break_syms = [:B])
What bothers me is that the field renaming feels redundant here. When I use a `NamedTuple` in the signature, it fixes both the names and the order of the fields. That seems unnecessary, because binding the caller's values to local names is normally what happens when positional arguments are passed to a function. Is there a way to declare the column names locally, inside the function, and have the signature check only their types?