Unbalanced One-way ANOVA

I’m a graduate student seeking to learn how to do statistics in Julia. From my internet searching and working with the SimpleANOVA package, I’ve been able to perform ANOVA tests on data where each group has the same number of samples. Now I’m trying to learn how to do ANOVA tests when each group has a different number of samples. This does not appear to be supported yet in the SimpleANOVA package, is there a good way to do this type of test in another package?

Welcome @AerospaceMark!

Not sure about this specific case but I think overall it’d be good for you to have a look to this book and resources:

2 Likes

There is also ANOVA and here is some (messy but tested) code that implements R’s aov function in julia (I haven’t put it in a package yet). Use it with aov(formula, data), where formula is a StatsModel formula like e.g. formula = @formula(response ~ group * treatment) and data is a DataFrame.

using StatsModels, DataFrames, LinearAlgebra, Distributions
toterm(s1::Term, s2::Term) = InteractionTerm((s1, s2))
toterm(s1::InteractionTerm, s2::Term) = InteractionTerm((s1.terms..., s2))
toterm(s1::Term, s2::InteractionTerm) = InteractionTerm((s1, s2.terms...))
toterm(x::Tuple, s2::Term) = map(t -> toterm(t, s2), x)
toterm(s1::Term, x::Tuple) = map(t -> toterm(s1, t), x)
function predictor_product(predictors)
    init, rest = Iterators.peel(predictors)
    reduce((x, y) -> x + y + toterm(x, y), term.(rest), init = term(init))
end
function errorterm(ef, nested)
    terms = term(1) + term(ef)
    if length(nested) > 0
        nestedterms = predictor_product(nested)
        terms += toterm(term(ef), nestedterms)
    end
    terms
end

name(t::AbstractTerm) = "$(t.sym)"
name(t::InteractionTerm) = join(name.(t.terms), " & ")
name(t::InterceptTerm) = "Intercept"
pivoted_asgn(assign, qrp) = assign[qrp.p[1:rank(qrp.R)]]
function mqr(f::FormulaTerm, data; contrasts = Dict{Symbol, Any}())
    mf = ModelFrame(f, data; contrasts)
    mm = ModelMatrix(mf)
    qrp = qr(mm.m, Val(true))
    (qrp = qrp, mf = mf, mm = mm)
end
function aov(f::FormulaTerm, data; return_locals = false)
    qrp, mf, mm = mqr(f, data)
    aov(qrp, pivoted_asgn(mm.assign, qrp), mf.f.rhs.terms,
        getproperty(data, f.lhs.sym); return_locals)
end
aov(l, y) = aov(l..., y)
function aov(qrp, asgn, terms, y; return_locals = false)
    effects = qrp.Q' * y
    r = rank(qrp.R)
    SS = Float64[]
    names = String[]
    DF = Int[]
    for i in union(asgn)
        n = name(terms[i])
        n == "Intercept" && continue
        idxs = findall(asgn .== i)
        push!(SS, sum(abs2, @view(effects[idxs])))
        push!(names, n)
        push!(DF, length(idxs))
    end
    push!(SS, sum(abs2, @view(effects[r+1:end])))
    push!(names, "Residuals")
    push!(DF, length(effects) - r)
    MSS = SS ./ DF
    F = MSS ./ MSS[end]
    F[end] = NaN
    DFres = DF[end]
    result = DataFrame(names = names, DF = DF, SS = SS, MSS = MSS, F = F,
                       p = @.(ccdf(FDist(DF, DFres), F)))
    if return_locals
        result, (qrp = qrp, asgn = asgn, terms = terms)
    else
        result
    end
end
function aoverror(f::FormulaTerm, e, nested::T, data; return_locals = false) where T
    nested = T <: Symbol ? [nested] : nested
    ef = errorterm(e, nested)
    efit = mqr(f.lhs ~ ef, data,
        contrasts = Dict(e => HelmertCoding(),
                         [t => HelmertCoding() for t in nested]...))
    efit.mm.assign = efit.mm.assign[efit.qrp.p[1:rank(efit.qrp.R)]]
    mf = ModelFrame(f, data)
    mm = ModelMatrix(mf)
    ys = efit.qrp.Q' * getproperty(data, f.lhs.sym)
    append!(efit.mm.assign,
            fill(maximum(efit.mm.assign) + 1, length(ys) - length(efit.mm.assign)))
    Xs = efit.qrp.Q' * mm.m
    rows = [findall(efit.mm.assign .== i)
            for i in Iterators.drop(sort!(union(efit.mm.assign)), 1)]
    result = [inner(Xs, ys, row, mm.assign, mf.f.rhs.terms; return_locals)
              for row in rows]
    if return_locals
        first.(result), (Q = efit.qrp.Q,
                         locals_t = [(rest..., row)
                                     for (row, rest) in zip(rows, last.(result))])
    else
        result
    end
end
function aoverror(l, y)
    ys = l.Q' * y
    [aov(qrp, asgn, terms, ys[rows]) for (qrp, asgn, terms, rows) in l.locals_t]
end
function inner(X, y, rows, asgn, terms; return_locals = false)
    T = X[rows, :]
    cols = findall(reshape(sum(abs2, T, dims = 1), :) .> 1e-5)
    qrp = qr(T[:, cols], Val(true))
    aov(qrp, pivoted_asgn(asgn[cols], qrp), terms, y[rows]; return_locals)
end
1 Like

Awesome, thanks!

This looks like great code, thanks for sharing!