Hello everyone,
I would like to store subsamples from an initial sample of data, perform different operations according to the subsample, and then return outputs in the same order as the initial inputs (**EDIT: with Julia Base types only, i.e., without DataFrames for example**). For now, what I’m doing is something like this:
- In the main function, initialize an empty Simple Algorithm struct
- Based on the X and Y input, create a Subset Sample and store it in Simple Algorithm struct
- Perform some subsampling to create Subset A, and store it in the Simple Algorithm struct
- Perform a particular computation for Subset A and store the results in the Subset A
- Create a subset B with all observations non-A and store it in the Simple Algorithm struct
- Perform some computation on Subset B and store the results in the Subset B
- Try to combine the results from subsets A and B and sort them with a weird function in order to get the results in the same order as the initial inputs X and Y.
I think my implementation is a bit ugly, and is prone to errors. My questions would be:
- Is there a Julia Base type that you would suggest I use rather than this “Subset” one in order to robustly track the index of each observation in the case of subsampling?
- Do you think that storing subsamples in another struct like this “SimpleAlgorithm” one is good practice?
EDIT: I need to stay with Julia Base types in order to stay close to the rest of the functions in the package I would like to contribute to, which only allow a Matrix or a Vector as inputs and return a Matrix or a Vector as outputs.
Thank you in advance for your feedback / help.
Please find below a MWE:
"""
    Subset(; X = [], Y = [], indexobs = Int64[], Results = zeros(2, 2))

Store a subsample: its `X` and `Y` data, the index of each of its observations
(`indexobs`), and a `Results` matrix that is filled in later.

All fields have defaults, so `Subset()` builds an empty placeholder.
"""
Base.@kwdef mutable struct Subset
    # X data of the subsample (empty until data is supplied).
    X::Union{Matrix,Vector} = []
    # Y data of the subsample (empty until data is supplied).
    Y::Union{Matrix,Vector} = []
    # Index of each observation held in this subsample.
    indexobs::Vector{Int64} = Int64[]
    # Placeholder only: overwritten once results are computed for this subset.
    Results::Matrix = zeros(2, 2)
end
"""
    SimpleAlgorithm(; Sample = Subset(), SubsetA = Subset(), SubsetB = Subset(),
                    Results = zeros(2, 2))

A collection of subsamples, the initial sample and the final results.

All fields have defaults, so `SimpleAlgorithm()` builds an empty container whose
fields are assigned afterwards.
"""
Base.@kwdef mutable struct SimpleAlgorithm
    # Initial sample.
    Sample::Subset = Subset()
    # First subsample.
    SubsetA::Subset = Subset()
    # Second subsample.
    SubsetB::Subset = Subset()
    # Placeholder only: overwritten once the final results are assembled.
    Results::Matrix = zeros(2, 2)
end
"""
    someselection!(simple_algorithm::SimpleAlgorithm; n::Integer = 6)

Perform a simple subsampling: take the first `n` observations of
`simple_algorithm.Sample` (rows of `X` and `Y` plus the matching `indexobs`
entries) and store them as `simple_algorithm.SubsetA`.

`n` is clamped to the number of entries in `Sample.indexobs`, so a sample with
fewer than `n` observations is selected entirely instead of raising a
`BoundsError`.

Returns the newly created `Subset`. Throws `ArgumentError` if `n` is negative.
"""
function someselection!(simple_algorithm::SimpleAlgorithm; n::Integer = 6)
    n >= 0 || throw(ArgumentError("n must be non-negative, got $n"))
    sample = simple_algorithm.Sample
    # Never ask for more rows than the sample holds.
    rows = 1:min(n, length(sample.indexobs))
    selected = Subset(;
        X = sample.X[rows, :],
        Y = sample.Y[rows, :],
        indexobs = sample.indexobs[rows],
    )
    simple_algorithm.SubsetA = selected
    return selected
end
"""
    excludingsubset(sample::Subset, subset::Subset)

Return a new `Subset` holding the observations of `sample` whose `indexobs`
entry does not appear in `subset.indexobs`.

Rows of `sample.X` and `sample.Y` are picked by the positions of the kept
entries inside `sample.indexobs`, and the kept `indexobs` values are carried
over unchanged.
"""
function excludingsubset(sample::Subset, subset::Subset)
    # A Set gives constant-time membership tests instead of scanning the vector
    # once per observation.
    excluded = Set(subset.indexobs)
    keep = findall(i -> !(i in excluded), sample.indexobs)
    return Subset(;
        X = sample.X[keep, :],
        Y = sample.Y[keep, :],
        indexobs = sample.indexobs[keep],
    )
end
"""
    somecalculationsforA!(simple_algorithm::SimpleAlgorithm)

Perform the calculation specific to subset A: multiply every element of
`simple_algorithm.SubsetA.X` by 2 and store the result in
`simple_algorithm.SubsetA.Results`.

Returns the computed array.
"""
function somecalculationsforA!(simple_algorithm::SimpleAlgorithm)
    subset_a = simple_algorithm.SubsetA
    doubled = subset_a.X .* 2
    subset_a.Results = doubled
    return doubled
end
# Calculation for subset B: multiply every element of SubsetB.X by 3, store the
# result in SubsetB.Results, and return the computed array.
function somecalculationsforB!(simple_algorithm::SimpleAlgorithm)
    subset_b = simple_algorithm.SubsetB
    tripled = broadcast(*, subset_b.X, 3)
    subset_b.Results = tripled
    return tripled
end
"""
    formatresults!(simple_algorithm::SimpleAlgorithm)

Combine the results of subsets A and B and reorder the rows so that they appear
in ascending `indexobs` order, i.e. in the same order as the initial inputs.
The reordered matrix is stored in `simple_algorithm.Results` and returned.

The rows are permuted with `sortperm` applied to the index vector itself, so the
indices are never mixed into the results matrix and the element type of the
results is left untouched.

Throws `DimensionMismatch` if the number of result rows differs from the number
of collected indices.
"""
function formatresults!(simple_algorithm::SimpleAlgorithm)
    subset_a = simple_algorithm.SubsetA
    subset_b = simple_algorithm.SubsetB
    stacked = vcat(subset_a.Results, subset_b.Results)
    original_rows = vcat(subset_a.indexobs, subset_b.indexobs)
    if size(stacked, 1) != length(original_rows)
        throw(DimensionMismatch(
            "got $(size(stacked, 1)) result rows for $(length(original_rows)) indices",
        ))
    end
    # Row k of the output is the result row carrying the k-th smallest index.
    simple_algorithm.Results = stacked[sortperm(original_rows), :]
    return simple_algorithm.Results
end
# Run the whole pipeline on inputs X and Y: wrap them in a Subset indexed
# 1:size(X, 1), select subset A and compute its results, build subset B from the
# remaining observations and compute its results, then assemble the final
# results. Returns the populated SimpleAlgorithm.
function somemodel(X::Union{Vector,Matrix}, Y::Union{Vector,Matrix})
    nobs = size(X, 1)
    full_sample = Subset(; X = X, Y = Y, indexobs = collect(1:nobs))
    simple_algorithm = SimpleAlgorithm(; Sample = full_sample)

    # Subset A: selection, then its specific calculation.
    someselection!(simple_algorithm)
    somecalculationsforA!(simple_algorithm)

    # Subset B: everything not selected in A, then its specific calculation.
    remaining = excludingsubset(simple_algorithm.Sample, simple_algorithm.SubsetA)
    simple_algorithm.SubsetB = remaining
    somecalculationsforB!(simple_algorithm)

    formatresults!(simple_algorithm)
    return simple_algorithm
end
# Example data: 11 observations, two columns in X and one value each in Y.
X = [
     5 13
    16 12
    16 26
    17 15
    18 14
    23  6
    25 10
    27 22
    37 14
    42 25
     5 17
];
Y = [12, 14, 25, 26, 8, 9, 27, 30, 31, 26, 12];

# Run the model and look at the final results.
results = somemodel(X, Y)
results.Results