Hello, for my own machine learning library (BetaML) I implemented the following functions to partition and run cross-validation over a set of m n-dimensional arrays, where one can specify the dimension along which to partition or run the cross-validation. They can be used independently of BetaML.
Comments welcome!

```julia
import Base.iterate
using Random, Statistics
abstract type AbstractDataSampler end
@enum Verbosity NONE=0 LOW=10 STD=20 HIGH=30 FULL=40
mutable struct SamplerWithData{Ts <: AbstractDataSampler, Td <: AbstractArray}
sampler::Ts
data::Td
dims::Int64
end
"""
partition(data,parts;shuffle,dims,copy,rng)
Partition (by dimension `dims`) one or more matrices according to the shares in `parts`.
# Parameters
* `data`: A matrix/vector or a vector of matrices/vectors
* `parts`: A vector of the required shares (must sum to 1)
* `shuffle`: Whether to randomly shuffle the matrices (preserving the relative order between matrices) [def: `true`]
* `dims`: The dimension for which to partition [def: `1`]
* `copy`: Whether to _copy_ the actual data or only create a reference [def: `true`]
* `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]
# Notes:
* The sum of parts must be equal to 1
* The number of elements in the specified dimension must be the same for all the arrays in `data`
# Example:
```julia
julia> x = [1:10 11:20]
julia> y = collect(31:40)
julia> ((xtrain,xtest),(ytrain,ytest)) = partition([x,y],[0.7,0.3])
"""
function partition(data::AbstractArray{T,1},parts::AbstractArray{Float64,1};shuffle=true,dims=1,copy=true,rng = Random.GLOBAL_RNG) where T <: AbstractArray
# the sets of vector/matrices
N = size(data[1],dims)
all(size.(data,dims) .== N) || error("All matrices passed to `partition` must have the same number of elements for the required dimension")
ridx = shuffle ? Random.shuffle(rng,1:N) : collect(1:N)
return partition.(data,Ref(parts);shuffle=shuffle,dims=dims,fixedRIdx = ridx,copy=copy,rng=rng)
end
function partition(data::AbstractArray{T,Ndims}, parts::AbstractArray{Float64,1};shuffle=true,dims=1,fixedRIdx=Int64[],copy=true,rng = Random.GLOBAL_RNG) where {T,Ndims}
# the individual vector/matrix
N = size(data,dims)
nParts = length(parts) # number of parts (`size` would return a tuple here)
toReturn = Array{AbstractArray{T,Ndims},1}(undef,nParts)
if !(sum(parts) ≈ 1)
error("The shares in `parts` passed to `partition` must sum to 1.")
end
ridx = fixedRIdx
if (isempty(ridx))
ridx = shuffle ? Random.shuffle(rng, 1:N) : collect(1:N)
end
allDimIdx = convert(Vector{Union{UnitRange{Int64},Vector{Int64}}},[1:i for i in size(data)])
current = 1
cumPart = 0.0
for (i,p) in enumerate(parts)
cumPart += parts[i]
final = i == nParts ? N : Int64(round(cumPart*N))
allDimIdx[dims] = ridx[current:final]
toReturn[i] = copy ? data[allDimIdx...] : @views data[allDimIdx...]
current = final + 1
end
return toReturn
end
"""
KFold(nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG)
Iterator for the k-fold cross-validation strategy.
"""
mutable struct KFold <: AbstractDataSampler
nSplits::Int64
nRepeats::Int64
shuffle::Bool
rng::AbstractRNG
function KFold(;nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG)
return new(nSplits,nRepeats,shuffle,rng)
end
end
# Implementation of the Julia iteration API for SamplerWithData{KFold}
function iterate(iter::SamplerWithData{KFold})
# First iteration, I need to create the subsamples
K = iter.sampler.nSplits
D = iter.dims
if eltype(iter.data) <: AbstractArray # data has multiple arrays, like X,Y
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)...))
else # data is a single matrix/tensor
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)))
end
i = (cat.(subs[2:K]...,dims=D),subs[1]) # fold 1 as validation, the remaining folds (concatenated along dims) as training
next = (subs,2)
return (i,next)
end
function iterate(iter::SamplerWithData{KFold},state)
# Subsequent iterations: the subsamples need to be recreated only at the beginning of a new repetition
K = iter.sampler.nSplits
D = iter.dims
nRep = iter.sampler.nRepeats
subs = state[1]
counter = state[2]
counter <= (K * nRep) || return nothing # stop when all the splits of all the repetitions have been consumed
kpart = counter % K
if kpart == 1 # new round, we repartition in k parts
if eltype(iter.data) <: AbstractArray # data has multiple arrays, like X,Y
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)...))
else # data is a single matrix
subs = collect(zip(partition(iter.data,fill(1/K,K),shuffle=iter.sampler.shuffle,dims=D,rng=iter.sampler.rng,copy=false)))
end
i = (cat.(subs[2:end]...,dims=D),subs[1])
next = (subs,counter+1)
return (i,next)
else
if kpart == 0 # the modulo returns 0 (instead of K) for the last fold of each round
i = (cat.(subs[1:K-1]...,dims=D),subs[end])
else
i = (cat.(subs[1:kpart-1]...,subs[kpart+1:end]...,dims=D),subs[kpart])
end
next = (subs,counter+1)
return (i,next)
end
end
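# A possible small addition (a sketch, not part of the code above): defining
# `Base.length` lets the sampler iterator work with functions that query the
# iterator's length, such as `collect`, since the default `IteratorSize` is `HasLength()`.
Base.length(iter::SamplerWithData{KFold}) = iter.sampler.nSplits * iter.sampler.nRepeats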
"""
crossValidation(f,data,sampler;dims,verbosity,returnStatistics)
Perform cross-validation according to the `sampler` rule by calling the function `f` and collecting its output.
# Parameters
- `f`: The user-defined function that consumes the specific train and validation data and returns something (often the associated validation error). See below.
- `data`: A single n-dimensional array or a vector of them (e.g. X,Y), depending on the task required by `f`.
- `sampler`: An instance of an `AbstractDataSampler`, defining the "rules" for sampling at each iteration. [def: `KFold(nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG)`]
- `dims`: The dimension over which to perform the cross-validation, i.e. the dimension containing the observations [def: `1`]
- `verbosity`: The verbosity of the information printed during each iteration (information can also be printed within the `f` function itself) [def: `STD`]
- `returnStatistics`: Whether `crossValidation` should return the statistics of the outputs of `f` (mean and standard deviation) or the whole outputs [def: `true`]
# Notes
crossValidation works by calling the user-defined function `f`, passing to it the tuples `trainData` and `valData` together with an `rng`, and collecting the results. How `trainData` and `valData` are selected at each iteration depends on the specific `sampler`, with a single-repetition 5-fold rule being the default.
This approach is very flexible, because the specific model to employ and the metric to use are left within the user-provided function. The only thing that crossValidation does is to provide the model defined in the function `f` with the appropriate data (and the random number generator).
**Input of the user-provided function**
`trainData` and `valData` are both themselves tuples. For supervised models, crossValidation's `data` should be the vector `[X,Y]`, so that `trainData` and `valData` are equivalent to `(xtrain, ytrain)` and `(xval, yval)`. For unsupervised models `data` is a single array, but the training and validation data still need to be accessed as `trainData[1]` and `valData[1]`.
**Output of the user-provided function**
The user-defined function can return anything. However, if `returnStatistics` is left at its default `true` value, the user-defined function must return a single scalar (e.g. some error measure), so that the mean and the standard deviation can be computed.
Note that `crossValidation` can conveniently be employed using the `do` syntax, as Julia automatically rewrites `crossValidation(data,...) do trainData,valData,rng ...user defined body... end` as `crossValidation(f, data,...)`, where `f` is the anonymous function `(trainData,valData,rng) -> ...user defined body...`.
# Example
```julia
julia> X = [11:19 21:29 31:39 41:49 51:59 61:69];
julia> Y = [1:9;];
julia> sampler = KFold(nSplits=3);
julia> (μ,σ) = crossValidation([X,Y],sampler) do trainData,valData,rng
(xtrain,ytrain) = trainData; (xval,yval) = valData
trainedModel = buildForest(xtrain,ytrain,30) # requires BetaML
predictions = predict(trainedModel,xval)
ϵ = meanRelError(predictions,yval,normRec=false)
return ϵ
end
(0.3202242202242202, 0.04307662219315022)
"""
function crossValidation(f,data,sampler=KFold(nSplits=5,nRepeats=1,shuffle=true,rng=Random.GLOBAL_RNG);dims=1,verbosity=STD, returnStatistics=true)
iterResults = [] # f may return any type, not only Float64s
for (i,iterData) in enumerate(SamplerWithData(sampler,data,dims))
iterResult = f(iterData[1],iterData[2],sampler.rng)
push!(iterResults,iterResult)
if verbosity > STD
println("Done iteration $i. This iteration output: $iterResult")
end
end
return returnStatistics ? (mean(iterResults),std(iterResults)) : iterResults
end
```
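To try `crossValidation` without any model library, here is a minimal self-contained sketch (the variable names and the seed are just illustrative) that cross-validates a trivial baseline always predicting the training mean, scored with RMSE:

```julia
X = [11:19 21:29 31:39 41:49 51:59 61:69]; Y = collect(1.0:9.0);
sampler = KFold(nSplits=3, rng=MersenneTwister(123))
(μ,σ) = crossValidation([X,Y],sampler) do trainData,valData,rng
    (xtrain,ytrain) = trainData; (xval,yval) = valData
    ŷ = fill(mean(ytrain), length(yval))  # constant baseline prediction
    return sqrt(mean((ŷ .- yval).^2))     # RMSE on the validation fold
end
```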
Testing `partition`:

```julia
julia> x = [11:16 21:26]; y = collect(31:36);
julia> ((xtrain,xval,xtest),(ytrain,yval,ytest)) = partition([x,y],[0.6,0.2,0.2]);
julia> print("xtrain: $xtrain ytrain: $ytrain - xval: $xval yval: $yval - xtest: $xtest ytest: $ytest")
xtrain: [14 24; 12 22; 13 23; 16 26] ytrain: [34, 32, 33, 36] - xval: [11 21] yval: [31] - xtest: [15 25] ytest: [35]
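# Partitioning along a different dimension also works, e.g. by columns
# (a quick sketch, variable names are illustrative):
julia> m = [1:6 11:16 21:26];
julia> (mtrain,mtest) = partition(m,[2/3,1/3],dims=2,shuffle=false);
julia> size(mtrain), size(mtest)
((6, 2), (6, 1))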
```

Testing the sampler:

```julia
julia> data = [[11:16 21:26],[31:36;]]
julia> sampleIterator = SamplerWithData(KFold(nSplits=3,nRepeats=2,shuffle=false),data,1)
julia> for (i,d) in enumerate(sampleIterator)
(xtrain,ytrain),(xval,yval) = d
println("i: $i - xtrain: $xtrain ytrain: $ytrain - xval: $xval yval: $yval")
end
i: 1 - xtrain: [13 23; 14 24; 15 25; 16 26] ytrain: [33, 34, 35, 36] - xval: [11 21; 12 22] yval: [31, 32]
i: 2 - xtrain: [11 21; 12 22; 15 25; 16 26] ytrain: [31, 32, 35, 36] - xval: [13 23; 14 24] yval: [33, 34]
i: 3 - xtrain: [11 21; 12 22; 13 23; 14 24] ytrain: [31, 32, 33, 34] - xval: [15 25; 16 26] yval: [35, 36]
i: 4 - xtrain: [13 23; 14 24; 15 25; 16 26] ytrain: [33, 34, 35, 36] - xval: [11 21; 12 22] yval: [31, 32]
i: 5 - xtrain: [11 21; 12 22; 15 25; 16 26] ytrain: [31, 32, 35, 36] - xval: [13 23; 14 24] yval: [33, 34]
i: 6 - xtrain: [11 21; 12 22; 13 23; 14 24] ytrain: [31, 32, 33, 34] - xval: [15 25; 16 26] yval: [35, 36]
```

Testing `crossValidation`:

```julia
julia> X = [11:19 21:29 31:39 41:49 51:59 61:69]; Y = [1:9;]
julia> sampler = KFold(nSplits=3,nRepeats=1,shuffle=true)
julia> (μ,σ) = crossValidation([X,Y],sampler) do trainData,valData,rng
(xtrain,ytrain) = trainData; (xval,yval) = valData
trainedModel = buildForest(xtrain,ytrain,30,rng=rng) # requires BetaML (or adapt to DecisionTree.jl)
predictions = predict(trainedModel,xval,rng=rng)
ϵ = meanRelError(predictions,yval,normRec=false)
return ϵ
end
julia> print("$μ , $σ")
0.28702651515151517 , 0.05043882771717648
```
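For completeness, a sketch of collecting the raw per-fold outputs instead of their statistics, by setting `returnStatistics=false` (reusing the `X`, `Y` and `sampler` defined just above):

```julia
ϵs = crossValidation([X,Y],sampler,returnStatistics=false) do trainData,valData,rng
    (xtrain,ytrain) = trainData; (xval,yval) = valData
    return sqrt(mean((mean(ytrain) .- yval).^2))  # per-fold RMSE of a mean baseline
end
# ϵs is now the vector of the three per-fold outputs
```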