Generalization of the Turing Bayesian logistic regression example

How can the Bayesian logistic regression example from the Turing tutorial be generalized to programmatically accept input data with an arbitrary number of features, similar to what is possible in the multinomial logistic regression example (https://turing.ml/dev/tutorials/08-multinomial-logistic-regression/)?

Full example code

using Turing, Distributions
using RDatasets
using MCMCChains, Plots, StatsPlots
using StatsFuns: logistic
using MLDataUtils: shuffleobs, stratifiedobs, rescale!
using FillArrays
using LinearAlgebra
using Distances
using NNlib: softmax

using Random
Random.seed!(0);

Turing.setprogress!(false)

Bayesian logistic regression (LR) Function

The idea is to define the priors of the per-feature coefficients, such as student ~ Normal(0, σ), automatically, taking the number of input features into account.

@model function logistic_regression(x, y, n, σ)
    intercept ~ Normal(0, σ)

    student ~ Normal(0, σ)
    balance ~ Normal(0, σ)
    income ~ Normal(0, σ)

    for i in 1:n
        v = logistic(intercept + student * x[i, 1] + balance * x[i, 2] + income * x[i, 3])
        y[i] ~ Bernoulli(v)
    end
end;

Bayesian multinomial logistic regression (Modified) Function

@model function logistic_regressionVersion2(x, y, σ)
    n = size(x, 1)
    numberOfFeatures = size(x, 2)
    length(y) == n ||
        throw(DimensionMismatch("number of observations in x and y is not equal"))

    # Priors of intercept and coefficients.
    intercept ~ Normal(0, σ)
    coefficients ~ MvNormal(Zeros(numberOfFeatures), σ^2 * I)

    # Compute the likelihood of the observations.
    values = intercept .+ x * coefficients

    for i in 1:n
        # The 0 corresponds to the base category (in the multinomial tutorial this is `setosa`).
        v = softmax([0, values[i]])
        y[i] ~ Categorical(v)
    end
end;

Preparation of the input data

data = RDatasets.dataset("ISLR", "Default");

function split_data(df, target; at=0.70)
    shuffled = shuffleobs(df)
    return trainset, testset = stratifiedobs(row -> row[target], shuffled; p=at)
end

data[!, :DefaultNum] = [r.Default == "Yes" ? 1.0 : 0.0 for r in eachrow(data)]
data[!, :StudentNum] = [r.Student == "Yes" ? 1.0 : 0.0 for r in eachrow(data)]
select!(data, Not([:Default, :Student]))

features = [:StudentNum, :Balance, :Income]
numerics = [:Balance, :Income]
target = :DefaultNum

trainset, testset = split_data(data, target; at=0.05)
for feature in numerics
    μ, σ = rescale!(trainset[!, feature]; obsdim=1)
    rescale!(testset[!, feature], μ, σ; obsdim=1)
end
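For reference, rescale! standardizes each numeric column in place to mean 0 and standard deviation 1, returning the training statistics so the test set can be scaled consistently. A quick sanity check (a sketch; the exact values depend on the random split):

using Statistics

# After rescaling, the training columns should be roughly standard-normal in scale.
mean(trainset[!, :Balance]), std(trainset[!, :Balance])  # ≈ (0.0, 1.0)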

Turing requires the data as a matrix, not as a DataFrame:

train = Matrix(trainset[:, features])
test = Matrix(testset[:, features])
train_label = trainset[:, target]
test_label = testset[:, target];

n, _ = size(train)

[Bayesian Logistic Regression] Sample using HMC.

m0 = logistic_regression(train, train_label, n, 1)
chain0 = sample(m0, HMC(0.05, 10), MCMCThreads(), 1_500, 3)
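To check the fit, one can predict on the test set from the posterior means, along the lines of the original tutorial. A minimal sketch (the parameter names match the model above; the 0.5 threshold and the predict_default name are choices made here, not part of the original post):

function predict_default(x, chain; threshold=0.5)
    # Posterior means of the regression parameters.
    intercept = mean(chain[:intercept])
    student = mean(chain[:student])
    balance = mean(chain[:balance])
    income = mean(chain[:income])
    # Predicted default probability for each observation, thresholded to a label.
    v = logistic.(intercept .+ student .* x[:, 1] .+ balance .* x[:, 2] .+ income .* x[:, 3])
    return v .>= threshold
end

mean(predict_default(test, chain0) .== test_label)  # test-set accuracy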

[Bayesian Multinomial Regression] Sample using HMC (Not working well!!)

m = logistic_regressionVersion2(train, train_label, 1)
chain = sample(m, HMC(0.05, 10), MCMCThreads(), 1_500, 3)
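One likely reason this version misbehaves: Categorical(v) with two categories expects labels in {1, 2}, while train_label contains 0.0/1.0. A sketch of a call whose labels match the model's likelihood (assuming that is the intended encoding):

# Shift the 0/1 labels to 1/2 so they lie in the support of Categorical(v).
m_shifted = logistic_regressionVersion2(train, Int.(train_label) .+ 1, 1)
chain_shifted = sample(m_shifted, HMC(0.05, 10), MCMCThreads(), 1_500, 3)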

The following code generalizes the Bayesian logistic regression to an arbitrary number of columns/features:

using Turing
using LazyArrays
using Random: seed!
seed!(123)

@model function logreg(X, y; predictors=size(X, 2))
    #priors
    α ~ Normal(0, 2.5)
    β ~ filldist(TDist(3), predictors)

    #likelihood
    y ~ arraydist(LazyArray(@~ BernoulliLogit.(α .+ X * β)))
end;
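A sketch of conditioning this model on the matrices prepared above (BernoulliLogit takes the 0/1 labels directly; the sampler settings simply mirror the earlier calls):

m_gen = logreg(train, train_label)
chain_gen = sample(m_gen, HMC(0.05, 10), MCMCThreads(), 1_500, 3)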