Hello, for my own machine learning library (BetaML) I implemented the following functions to partition and run cross-validation over a set of m n-dimensional arrays, where one can specify the dimension along which to partition or run the cross-validation. They can be used independently of BetaML.
Comments welcome!

```julia
import Base.iterate
using Random, Statistics
abstract type AbstractDataSampler end
@enum Verbosity NONE=0 LOW=10 STD=20 HIGH=30 FULL=40
mutable struct SamplerWithData{Ts <: AbstractDataSampler, Td <: AbstractArray}
sampler::Ts
data::Td
dims::Int64
end
"""
partition(data,parts;shuffle,dims,copy,rng)
Partition (by dimension `dims`) one or more matrices according to the shares in `parts`.
# Parameters
* `data`: A matrix/vector or a vector of matrices/vectors
* `parts`: A vector of the required shares (must sum to 1)
* `shuffle`: Whether to randomly shuffle the matrices (preserving the relative order between matrices) [def: `true`]
* `dims`: The dimension for which to partition [def: `1`]
* `copy`: Whether to _copy_ the actual data or only create a reference [def: `true`]
* `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]
# Notes:
* The sum of parts must be equal to 1
* The number of elements in the specified dimension must be the same for all the arrays in `data`
# Example:
```julia
julia> x = [1:10 11:20]
julia> y = collect(31:40)
julia> ((xtrain,xtest),(ytrain,ytest)) = partition([x,y],[0.7,0.3])
"""
function partition(data::AbstractArray{T,1},parts::AbstractArray{Float64,1};shuffle=true,dims=1,copy=true,rng = Random.GLOBAL_RNG) where T <: AbstractArray
# the sets of vector/matrices
N = size(data[1],dims)
all(size.(data,dims) .== N) || error("All matrices passed to `partition` must have the same number of elements for the required dimension")
ridx = shuffle ? Random.shuffle(rng,1:N) : collect(1:N)
return partition.(data,Ref(parts);shuffle=shuffle,dims=dims,fixedRIdx = ridx,copy=copy,rng=rng)
end
function partition(data::AbstractArray{T,Ndims}, parts::AbstractArray{Float64,1};shuffle=true,dims=1,fixedRIdx=Int64[],copy=true,rng = Random.GLOBAL_RNG) where {T,Ndims}
# the individual vector/matrix
N = size(data,dims)
nParts = length(parts) # number of parts (`size` would return a tuple here)
toReturn = Array{AbstractArray{T,Ndims},1}(undef,nParts)
if !(sum(parts) ≈ 1)
error("The shares in `parts` passed to `partition` must sum to 1.")
end
ridx = fixedRIdx
if (isempty(ridx))
ridx = shuffle ? Random.shuffle(rng, 1:N) : collect(1:N)
end
allDimIdx = convert(Vector{Union{UnitRange{Int64},Vector{Int64}}},[1:i for i in size(data)])
current = 1
cumPart = 0.0
for (i,p) in enumerate(parts)
cumPart += parts[i]
final = i == nParts ? N : Int64(round(cumPart*N))
allDimIdx[dims] = ridx[current:final]
toReturn[i] = copy ? data[allDimIdx...] : @views data[allDimIdx...]
current = final + 1
end
return toReturn
end
"""
KFold(nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG)
Iterator for the k-fold cross-validation strategy.
"""
mutable struct KFold <: AbstractDataSampler
nSplits::Int64
nRepeats::Int64
shuffle::Bool
rng::AbstractRNG
function KFold(;nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG)
return new(nSplits,nRepeats,shuffle,rng)
end
end
# Implementation of the Julia iteration API for SamplerWithData{KFold}
function iterate(iter::SamplerWithData{KFold})
# First iteration, I need to create the subsamples
K = iter.sampler.nSplits
D = iter.dims
if eltype(iter.data) <: AbstractArray # data has multiple arrays, like X,Y
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)...))
else # data is a single matrix/tensor
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)))
end
i = (cat.(subs[2:K]...,dims=D),subs[1]) # fold 1 as validation, the remaining folds (concatenated along dims) as training
next = (subs,2)
return (i,next)
end
function iterate(iter::SamplerWithData{KFold},state)
# Subsequent iterations: the subsamples need to be recreated only at the beginning of a new repetition
K = iter.sampler.nSplits
D = iter.dims
nRep = iter.sampler.nRepeats
subs = state[1]
counter = state[2]
counter <= (K * nRep) || return nothing # stop when all the splits of all the repetitions have been consumed
kpart = counter % K
if kpart == 1 # new round, we repartition in k parts
if eltype(iter.data) <: AbstractArray # data has multiple arrays, like X,Y
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)...))
else # data is a single matrix
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)))
end
i = (cat.(subs[2:end]...,dims=D),subs[1])
next = (subs,counter+1)
return (i,next)
else
if kpart == 0 # the modulo returns 0 (instead of K) for the last fold of each round
i = (cat.(subs[1:K-1]...,dims=D),subs[end])
else
i = (cat.(subs[1:kpart-1]...,subs[kpart+1:end]...,dims=D),subs[kpart])
end
next = (subs,counter+1)
return (i,next)
end
end
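# A possible small addition (a sketch, not part of the code above): defining
# `Base.length` lets the sampler iterator work with functions that query the
# iterator's length, such as `collect`, since the default `IteratorSize` is `HasLength()`.
Base.length(iter::SamplerWithData{KFold}) = iter.sampler.nSplits * iter.sampler.nRepeats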
"""
crossValidation(f,data,sampler;dims,verbosity,returnStatistics)
Perform cross-validation according to the `sampler` rule by calling the function `f` and collecting its output.
# Parameters
- `f`: The user-defined function that consumes the specific train and validation data and returns something (often the associated validation error). See below.
- `data`: A single n-dimensional array or a vector of them (e.g. X,Y), depending on the task required by `f`.
- `sampler`: An instance of an `AbstractDataSampler`, defining the "rules" for sampling at each iteration. [def: `KFold(nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG)`]
- `dims`: The dimension over which to perform the cross-validation, i.e. the dimension containing the observations [def: `1`]
- `verbosity`: The verbosity of the information printed during each iteration (information can also be printed within the `f` function itself) [def: `STD`]
- `returnStatistics`: Whether `crossValidation` should return the statistics of the outputs of `f` (mean and standard deviation) or the whole outputs [def: `true`]
# Notes
crossValidation works by calling the user-defined function `f`, passing to it the tuples `trainData` and `valData` together with an `rng`, and collecting the results. How `trainData` and `valData` are selected at each iteration depends on the specific `sampler`, with a single-repetition 5-fold rule being the default.
This approach is very flexible, because the specific model to employ and the metric to use are left within the user-provided function. The only thing that crossValidation does is to provide the model defined in the function `f` with the appropriate data (and the random number generator).
**Input of the user-provided function**
`trainData` and `valData` are both themselves tuples. For supervised models, crossValidation's `data` should be the vector `[X,Y]`, so that `trainData` and `valData` are equivalent to `(xtrain, ytrain)` and `(xval, yval)`. For unsupervised models `data` is a single array, but the training and validation data still need to be accessed as `trainData[1]` and `valData[1]`.
**Output of the user-provided function**
The user-defined function can return anything. However, if `returnStatistics` is left at its default `true` value, the user-defined function must return a single scalar (e.g. some error measure), so that the mean and the standard deviation can be computed.
Note that `crossValidation` can conveniently be employed using the `do` syntax, as Julia automatically rewrites `crossValidation(data,...) do trainData,valData,rng ...user defined body... end` as `crossValidation(f, data,...)`, where `f` is the anonymous function `(trainData,valData,rng) -> ...user defined body...`.
# Example
```julia
julia> X = [11:19 21:29 31:39 41:49 51:59 61:69];
julia> Y = [1:9;];
julia> sampler = KFold(nSplits=3);
julia> (μ,σ) = crossValidation([X,Y],sampler) do trainData,valData,rng
(xtrain,ytrain) = trainData; (xval,yval) = valData
trainedModel = buildForest(xtrain,ytrain,30) # requires BetaML
predictions = predict(trainedModel,xval)
ϵ = meanRelError(predictions,yval,normRec=false)
return ϵ
end
(0.3202242202242202, 0.04307662219315022)
"""
function crossValidation(f,data,sampler=KFold(nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG);dims=1,verbosity=STD, returnStatistics=true)
iterResults = [] # f may return any type, not only Float64s
for (i,iterData) in enumerate(SamplerWithData(sampler,data,dims))
iterResult = f(iterData[1],iterData[2],sampler.rng)
push!(iterResults,iterResult)
if verbosity > STD
println("Done iteration $i. This iteration output: $iterResult")
end
end
return returnStatistics ? (mean(iterResults),std(iterResults)) : iterResults
end
```
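To try `crossValidation` without any model library, here is a minimal self-contained sketch (the variable names and the seed are just illustrative) that cross-validates a trivial baseline always predicting the training mean, scored with RMSE:

```julia
X = [11:19 21:29 31:39 41:49 51:59 61:69]; Y = collect(1.0:9.0);
sampler = KFold(nSplits=3, rng=MersenneTwister(123))
(μ,σ) = crossValidation([X,Y],sampler) do trainData,valData,rng
    (xtrain,ytrain) = trainData; (xval,yval) = valData
    ŷ = fill(mean(ytrain), length(yval))  # constant baseline prediction
    return sqrt(mean((ŷ .- yval).^2))     # RMSE on the validation fold
end
```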
Testing `partition`:

```julia
julia> x = [11:16 21:26]; y = collect(31:36);
julia> ((xtrain,xval,xtest),(ytrain,yval,ytest)) = partition([x,y],[0.6,0.2,0.2]);
julia> print("xtrain: $xtrain ytrain: $ytrain - xval: $xval yval: $yval - xtest: $xtest ytest: $ytest")
xtrain: [14 24; 12 22; 13 23; 16 26] ytrain: [34, 32, 33, 36] - xval: [11 21] yval: [31] - xtest: [15 25] ytest: [35]
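# Partitioning along a different dimension also works, e.g. by columns
# (a quick sketch, variable names are illustrative):
julia> m = [1:6 11:16 21:26];
julia> (mtrain,mtest) = partition(m,[2/3,1/3],dims=2,shuffle=false);
julia> size(mtrain), size(mtest)
((6, 2), (6, 1))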
```

Testing the sampler:

```julia
julia> data = [[11:16 21:26],[31:36;]]
julia> sampleIterator = SamplerWithData(KFold(nSplits=3,nRepeats=2,shuffle=false),data,1)
julia> for (i,d) in enumerate(sampleIterator)
(xtrain,ytrain),(xval,yval) = d
println("i: $i - xtrain: $xtrain ytrain: $ytrain - xval: $xval yval: $yval")
end
i: 1 - xtrain: [13 23; 14 24; 15 25; 16 26] ytrain: [33, 34, 35, 36] - xval: [11 21; 12 22] yval: [31, 32]
i: 2 - xtrain: [11 21; 12 22; 15 25; 16 26] ytrain: [31, 32, 35, 36] - xval: [13 23; 14 24] yval: [33, 34]
i: 3 - xtrain: [11 21; 12 22; 13 23; 14 24] ytrain: [31, 32, 33, 34] - xval: [15 25; 16 26] yval: [35, 36]
i: 4 - xtrain: [13 23; 14 24; 15 25; 16 26] ytrain: [33, 34, 35, 36] - xval: [11 21; 12 22] yval: [31, 32]
i: 5 - xtrain: [11 21; 12 22; 15 25; 16 26] ytrain: [31, 32, 35, 36] - xval: [13 23; 14 24] yval: [33, 34]
i: 6 - xtrain: [11 21; 12 22; 13 23; 14 24] ytrain: [31, 32, 33, 34] - xval: [15 25; 16 26] yval: [35, 36]
```

Testing `crossValidation`:

```julia
julia> X = [11:19 21:29 31:39 41:49 51:59 61:69]; Y = [1:9;]
julia> sampler = KFold(nSplits=3,nRepeats=1,shuffle=true)
julia> (μ,σ) = crossValidation([X,Y],sampler) do trainData,valData,rng
(xtrain,ytrain) = trainData; (xval,yval) = valData
trainedModel = buildForest(xtrain,ytrain,30,rng=rng) # requires BetaML (or adapt to DecisionTree.jl)
predictions = predict(trainedModel,xval,rng=rng)
ϵ = meanRelError(predictions,yval,normRec=false)
return ϵ
end
julia> print("$μ , $σ")
0.28702651515151517 , 0.05043882771717648
```
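For completeness, a sketch of collecting the raw per-fold outputs instead of their statistics, by setting `returnStatistics=false` (reusing the `X`, `Y` and `sampler` defined just above):

```julia
ϵs = crossValidation([X,Y],sampler,returnStatistics=false) do trainData,valData,rng
    (xtrain,ytrain) = trainData; (xval,yval) = valData
    return sqrt(mean((mean(ytrain) .- yval).^2))  # per-fold RMSE of a mean baseline
end
# ϵs is now the vector of the three per-fold outputs
```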