Hello everyone,
I would like to store subsamples of an initial sample of data, perform different operations depending on the subsample, and then return the outputs in the same order as the initial inputs (**EDIT: with Julia Base types only, i.e. without DataFrames, for example**). For now, what I’m doing is something like this:
- In the main function, initialize an empty Simple Algorithm struct
 - Based on the X and Y input, create a Subset Sample and store it in Simple Algorithm struct
 - Perform some subsampling to create Subset A, and store it in the Simple Algorithm struct
 - Perform a particular computation for Subset A and store the results in the Subset A
 - Create a Subset B containing all the observations that are not in Subset A, and store it in the Simple Algorithm struct
 - Perform some computation on Subset B and store the results in the Subset B
 - Try to combine the results from Subsets A and B and sort them with a weird function, so that the results end up in the same order as the initial inputs X and Y.
 
I think my implementation is a bit ugly, and is prone to errors. My questions would be:
- Is there a Julia Base type that you would suggest I use instead of this “Subset” one in order to robustly track the index of each observation when subsampling?
 - Do you think that storing subsamples in another struct like this “SimpleAlgorithm” one is good practice?
 
EDIT: I need to stick to Julia Base types in order to stay close to the rest of the functions in the package I would like to contribute to, which only accept a Matrix or Vector as input and return a Matrix or Vector as output.
Thank you in advance for your feedback / help.
Please find below a MWE:
"""
    Subset(; X, Y, indexobs, Results)

Mutable container holding one (sub)sample of observations together with the
results computed for it.

# Fields
- `X`: input data of the subsample (a `Vector` or a `Matrix`); empty by default.
- `Y`: second input of the subsample (a `Vector` or a `Matrix`); empty by default.
- `indexobs`: integer index attached to each observation, used to keep track of
  where the observation sits in the sample it was drawn from; empty by default.
- `Results`: matrix of results for this subsample. Defaults to a 2×2 matrix of
  zeros that only acts as a placeholder until real results are stored.
"""
Base.@kwdef mutable struct Subset
    X::Union{Vector,Matrix} = []
    Y::Union{Vector,Matrix} = []
    indexobs::Vector{Int64} = Int64[]
    Results::Matrix = zeros(2, 2)  # placeholder, overwritten once results exist
end
"""
    SimpleAlgorithm(; Sample, SubsetA, SubsetB, Results)

Mutable container grouping the initial sample, the two subsamples derived from
it, and the final results.

# Fields
- `Sample`: the initial sample; an empty `Subset()` by default.
- `SubsetA`: first subsample; an empty `Subset()` by default.
- `SubsetB`: second subsample; an empty `Subset()` by default.
- `Results`: matrix of final results. Defaults to a 2×2 matrix of zeros that
  only acts as a placeholder until real results are stored.
"""
Base.@kwdef mutable struct SimpleAlgorithm
    Sample::Subset = Subset()
    SubsetA::Subset = Subset()
    SubsetB::Subset = Subset()
    Results::Matrix = zeros(2, 2)  # placeholder, overwritten once results exist
end
"""
    someselection!(simple_algorithm::SimpleAlgorithm; nselect::Integer = 6)

Perform a simple subsampling: build a `Subset` from the first `nselect` rows of
`simple_algorithm.Sample` (rows of `X` and `Y`, and the matching entries of
`indexobs`) and store it in `simple_algorithm.SubsetA`.

If the sample holds fewer than `nselect` observations, all of them are selected
instead of raising a `BoundsError`.

# Keywords
- `nselect`: number of leading observations to select (default `6`).

# Returns
The newly created `Subset`, which is also stored in `simple_algorithm.SubsetA`.

# Throws
- `ArgumentError` if `nselect` is negative.
"""
function someselection!(simple_algorithm::SimpleAlgorithm; nselect::Integer = 6)
    nselect >= 0 || throw(ArgumentError("nselect must be non-negative, got $nselect"))
    sample = simple_algorithm.Sample
    # Clamp to the number of tracked observations so that small samples still work.
    rows = 1:min(nselect, length(sample.indexobs))
    selected = Subset(
        X = sample.X[rows, :],
        Y = sample.Y[rows, :],
        indexobs = sample.indexobs[rows],
    )
    simple_algorithm.SubsetA = selected
    return selected
end
"""
    excludingsubset(sample::Subset, subset::Subset)

Return a new `Subset` made of the observations of `sample` whose `indexobs`
entry does not appear in `subset.indexobs`, i.e. the observations of `sample`
that were not selected into `subset`.

The rows of `sample.X` and `sample.Y` and the entries of `sample.indexobs` are
kept in their original relative order. Neither argument is modified.
"""
function excludingsubset(sample::Subset, subset::Subset)
    # A Set makes each membership test O(1) instead of scanning the whole vector.
    excluded = Set(subset.indexobs)
    keep = findall(i -> !(i in excluded), sample.indexobs)
    return Subset(
        X = sample.X[keep, :],
        Y = sample.Y[keep, :],
        indexobs = sample.indexobs[keep],
    )
end
"""
    somecalculationsforA!(simple_algorithm::SimpleAlgorithm)

Perform the calculation specific to Subset A: multiply every element of
`simple_algorithm.SubsetA.X` by 2 and store the outcome in
`simple_algorithm.SubsetA.Results`.

Returns the computed array.
"""
function somecalculationsforA!(simple_algorithm::SimpleAlgorithm)
    subset_a = simple_algorithm.SubsetA
    doubled = subset_a.X .* 2
    subset_a.Results = doubled
    return doubled
end
"""
    somecalculationsforB!(simple_algorithm::SimpleAlgorithm)

Perform the calculation specific to Subset B: multiply every element of
`simple_algorithm.SubsetB.X` by 3 and store the outcome in
`simple_algorithm.SubsetB.Results`.

Returns the computed array.
"""
function somecalculationsforB!(simple_algorithm::SimpleAlgorithm)
    subset_b = simple_algorithm.SubsetB
    tripled = subset_b.X .* 3
    subset_b.Results = tripled
    return tripled
end
"""
    formatresults!(simple_algorithm::SimpleAlgorithm)

Stack the results of `SubsetA` and `SubsetB`, reorder the rows by ascending
`indexobs`, and store the reordered matrix in `simple_algorithm.Results`.

The rows are permuted directly with `sortperm` on the index vector. The indices
are never concatenated with the data, so the element type of the results is
left untouched and the ordering is always computed on exact integers.

# Returns
The reordered results matrix, which is also stored in `simple_algorithm.Results`.

# Throws
- `DimensionMismatch` if the number of stacked result rows differs from the
  number of stacked indices.
"""
function formatresults!(simple_algorithm::SimpleAlgorithm)
    subset_a = simple_algorithm.SubsetA
    subset_b = simple_algorithm.SubsetB
    stacked = vcat(subset_a.Results, subset_b.Results)
    positions = vcat(subset_a.indexobs, subset_b.indexobs)
    if size(stacked, 1) != length(positions)
        throw(DimensionMismatch(
            "got $(size(stacked, 1)) result rows for $(length(positions)) indices",
        ))
    end
    # sortperm is stable, so rows sharing an index keep their A-before-B order.
    ordered = stacked[sortperm(positions), :]
    simple_algorithm.Results = ordered
    return ordered
end
"""
    somemodel(X::Union{Vector,Matrix}, Y::Union{Vector,Matrix})

Run the whole example pipeline on the inputs `X` and `Y`:

1. wrap `X` and `Y` in a `Subset` whose `indexobs` is `1, 2, …, size(X, 1)`;
2. select Subset A with `someselection!` and compute its results;
3. build Subset B from the remaining observations with `excludingsubset` and
   compute its results;
4. merge both sets of results with `formatresults!`.

Returns the populated `SimpleAlgorithm`.
"""
function somemodel(X::Union{Vector,Matrix}, Y::Union{Vector,Matrix})
    nobs = size(X, 1)
    full_sample = Subset(X = X, Y = Y, indexobs = collect(1:nobs))
    simple_algorithm = SimpleAlgorithm(Sample = full_sample)

    # Subset A: selection followed by its dedicated computation.
    someselection!(simple_algorithm)
    somecalculationsforA!(simple_algorithm)

    # Subset B: everything not selected into A, then its own computation.
    simple_algorithm.SubsetB = excludingsubset(full_sample, simple_algorithm.SubsetA)
    somecalculationsforB!(simple_algorithm)

    formatresults!(simple_algorithm)
    return simple_algorithm
end
# Example inputs: 11 observations, two columns in X and one value per observation in Y.
X = [
    5 13
    16 12
    16 26
    17 15
    18 14
    23 6
    25 10
    27 22
    37 14
    42 25
    5 17
];
Y = [12, 14, 25, 26, 8, 9, 27, 30, 31, 26, 12];

# Run the pipeline and look at the merged results.
results = somemodel(X, Y)
results.Results