How to generalize the example of the Bayesian logistic regression [turning] for programmatically accepting input data with a different number of features, similar to what is possible in the Multinomial logistic regression example (https://turing.ml/dev/tutorials/08-multinomial-logistic-regression/) ?
Full example code
using Turing, Distributions
using RDatasets
using MCMCChains, Plots, StatsPlots
using StatsFuns: logistic
using MLDataUtils: shuffleobs, stratifiedobs, rescale!
using FillArrays
using LinearAlgebra
using Distances
using NNlib: softmax
using Random
Random.seed!(0);
Turing.setprogress!(false)
Bayesian logistic regression (LR) Function
The idea is to allow the definition of the distribution of the variables like student ~ Normal(0, σ) to be automatically done having into account the (number) of the input features
@model function logistic_regression(x, y, n, σ)
intercept ~ Normal(0, σ)
student ~ Normal(0, σ)
balance ~ Normal(0, σ)
income ~ Normal(0, σ)
for i in 1:n
v = logistic(intercept + student * x[i, 1] + balance * x[i, 2] + income * x[i, 3])
y[i] ~ Bernoulli(v)
end
end;
Bayesian multinomial logistic regression (Modified) Function
@model function logistic_regressionVersion2(x, y, σ)
n = size(x, 1)
numberOfFeatures=size(x,2)
length(y) == n ||
throw(DimensionMismatch(“number of observations in x
and y
is not equal”))
# Priors of intercepts and coefficients.
intercept ~ Normal(0, σ)
coefficients ~ MvNormal(Zeros(numberOfFeatures), σ^2 * I)
# Compute the likelihood of the observations.
values = intercept .+ x * coefficients
for i in 1:n
# the 0 corresponds to the base category `setosa`
v = softmax([0, values[i]])
y[i] ~ Categorical(v)
end
end;
Preparation of the input data
data = RDatasets.dataset(“ISLR”, “Default”);
function split_data(df, target; at=0.70)
shuffled = shuffleobs(df)
return trainset, testset = stratifiedobs(row → row[target], shuffled; p=at)
end
data[!, :DefaultNum] = [r.Default == “Yes” ? 1.0 : 0.0 for r in eachrow(data)]
data[!, :StudentNum] = [r.Student == “Yes” ? 1.0 : 0.0 for r in eachrow(data)]
select!(data, Not([:Default, :Student]))
features = [:StudentNum, :Balance, :Income]
numerics = [:Balance, :Income]
target = :DefaultNum
trainset, testset = split_data(data, target; at=0.05)
for feature in numerics
μ, σ = rescale!(trainset[!, feature]; obsdim=1)
rescale!(testset[!, feature], μ, σ; obsdim=1)
end
Turing requires data in matrix form, not dataframe
train = Matrix(trainset[:, features])
test = Matrix(testset[:, features])
train_label = trainset[:, target]
test_label = testset[:, target];
n, _ = size(train)
[Bayesian Logistic Regression] Sample using HMC.
m0 = logistic_regression(train, train_label,n,1)
chain0 = sample(m0, HMC(0.05, 10), MCMCThreads(), 1_500, 3)
[Bayesian Multinomial Regression] Sample using HMC (Not working well!!)
m = logistic_regressionVersion2(train, train_label,1)
chain = sample(m, HMC(0.05, 10), MCMCThreads(), 1_500, 3)