QR Parameterization in Turing

A question for Turing specialists.
I performed a QR decomposition on my model matrix X.

#### QR Decomposition ####
Q, R = qr(X)

Then I fitted a Turing model to my Q matrix, casting it to a Matrix:

model_qr = varying_intercept(Matrix(Q), idx, float(y));
chn_qr = sample(model_qr, NUTS(1_000, 0.65), MCMCThreads(), 2_000, 4);
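
The definition of varying_intercept is omitted here. A minimal sketch of a centered varying-intercept model with the same parameter names as the output below (assuming the same priors as the NCP model at the end of this post) would be:

@model varying_intercept(X, idx, y; n_gr=length(unique(idx)), predictors=size(X, 2)) = begin
    μ ~ Normal(mean(y), 2.5 * std(y))      # population-level intercept
    σ ~ Exponential(1 / std(y))            # residual SD
    β ~ filldist(TDist(3), predictors)     # coefficients (here, on the columns of Q)
    σⱼ ~ truncated(Cauchy(0, 2), 0, Inf)   # SD of the group-level intercepts
    μⱼ ~ filldist(Normal(0, σⱼ), n_gr)     # group-level intercepts (centered)
    ŷ = μ .+ X * β .+ μⱼ[idx]              # linear predictor
    y ~ MvNormal(ŷ, σ)
end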

This is the output:

julia> chn_qr
Chains MCMC chain (2000×24×4 Array{Float64,3}):
Iterations        = 1:2000
Thinning interval = 1
Chains            = 1, 2, 3, 4
Samples per chain = 2000
parameters        = β[1], β[2], μ, μⱼ[1], μⱼ[2], μⱼ[3], μⱼ[4], μⱼ[5], μⱼ[6], μⱼ[7], σ, σⱼ
internals         = acceptance_rate, hamiltonian_energy, hamiltonian_energy_error, is_accept, log_density, lp, max_hamiltonian_energy_error, n_steps, nom_step_size, numerical_error, step_size, tree_depth
Summary Statistics
  parameters       mean       std   naive_se      mcse         ess      rhat 
      Symbol    Float64   Float64    Float64   Float64     Float64   Float64 
        β[1]     0.0615    1.6131     0.0180    0.0242   5229.9895    0.9997
        β[2]   -46.6065    4.4243     0.0495    0.0556   6779.8184    1.0009
           μ    23.2964    1.7568     0.0196    0.0354   1548.2673    1.0012
       μⱼ[1]     1.3817    1.8077     0.0202    0.0364   1599.0513    1.0011
       μⱼ[2]     1.6890    1.7905     0.0200    0.0362   1604.2076    1.0009
       μⱼ[3]    -4.0131    1.7790     0.0199    0.0355   1565.3781    1.0010
       μⱼ[4]     5.8102    2.0615     0.0230    0.0377   2110.1449    1.0009
       μⱼ[5]    -2.0791    1.8817     0.0210    0.0356   1732.1206    1.0011
       μⱼ[6]    -5.2884    1.7996     0.0201    0.0362   1600.3742    1.0012
       μⱼ[7]     1.9578    1.8095     0.0202    0.0365   1607.9793    1.0011
           σ     2.6639    0.1189     0.0013    0.0014   7292.1689    1.0006
          σⱼ     4.2218    1.4394     0.0161    0.0266   3021.6447    1.0011
Quantiles
  parameters       2.5%      25.0%      50.0%      75.0%      97.5% 
      Symbol    Float64    Float64    Float64    Float64    Float64 
        β[1]    -2.9792    -0.7044     0.0580     0.8240     3.1977
        β[2]   -55.0442   -49.6002   -46.6371   -43.6904   -37.9659
           μ    19.9347    22.2450    23.2709    24.3002    26.9628
       μⱼ[1]    -2.3245     0.3304     1.3992     2.4828     4.9164
       μⱼ[2]    -2.0222     0.6622     1.6990     2.7723     5.1757
       μⱼ[3]    -7.7450    -5.0484    -3.9772    -2.9457    -0.6303
       μⱼ[4]     1.7884     4.5075     5.7806     7.0934     9.8737
       μⱼ[5]    -6.0375    -3.1619    -2.0521    -0.9130     1.5270
       μⱼ[6]    -9.0518    -6.3368    -5.2189    -4.1916    -1.8506
       μⱼ[7]    -1.8358     0.9237     1.9936     3.0302     5.4514
           σ     2.4414     2.5788     2.6598     2.7444     2.9033
          σⱼ     2.4002     3.2597     3.9161     4.8519     7.7719

I’ve managed to reconstruct the quantiles of β by computing R^-1 * β (in a rather ugly, non-elegant way):

quantiles_beta = select(DataFrame(quantile(group(chn_qr, :β))), r"%");
mapcols(x -> R^-1 * x, quantiles_beta)
2×5 DataFrame
 Row │ 2.5%         25.0%        50.0%        75.0%        97.5%       
     │ Float64      Float64      Float64      Float64      Float64     
─────┼─────────────────────────────────────────────────────────────────
   1 │ -2.56661     -2.34772     -2.22018     -2.09349     -1.86299
   2 │  0.00516392   0.00465319   0.00437521   0.00409877   0.00356173
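
For reference, the algebra behind this: since X = QR, the linear predictor satisfies Xβ = QRβ = Qθ with θ = Rβ, so the coefficients on the original scale are recovered as β = R^-1 θ. The β[1] and β[2] sampled above are really the coefficients on Q, which is why they need the R^-1 transformation.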

How would I apply R^-1 * β to all the β draws in the chn_qr object, so that I can create a new “chn_qr_reconstructed” object and use all the methods that work on Chains with it?

chn_qr has the following dimensions (2000 samples, 24 parameters, of which only 2 are β, and 4 chains):

size(chn_qr)
(2000, 24, 4)

BTW: sampling with the QR reparameterization takes 19.6 s, while sampling with X takes 135.6 s (with a much better ESS overall).

Solved by @bdeonovic in Julia’s Slack:

I thought the Bayesian logistic regression example didn’t look very “Turian”, so I made some updates to it. I also reparameterized it for even more speed and efficiency.

train_F = qr([ones(size(train,1)) train]);
train_Q_ast = Matrix(train_F.Q) * sqrt(size(train,1)-1)
train_R_ast = train_F.R / sqrt(size(train,1)-1)
train_R_ast_inv = inv(train_R_ast)
# Bayesian logistic regression (LR)
@model logistic_regression_new(Q_ast, y, p, σ) = begin
    theta ~ MvNormal(zeros(p), σ)
    y ~ Distributions.Product(Bernoulli.(logistic.(Q_ast * theta)))
end;
# Retrieve the number of observations.
n, p = size([ones(size(train,1)) train])
# Sample using HMC.
@time chain = mapreduce(c -> sample(logistic_regression_new(train_Q_ast, train_label, p, 1), HMC(0.05, 10), 1500),
    chainscat,
    1:3
)
describe(chain)
# map each draw of theta back to the original coefficient scale
beta = mapslices(x -> train_R_ast_inv * x, chain[:, namesingroup(chain, :theta), :].value.data, dims=[2])
transformed_chain = Chains(beta, ["Intercept", "student", "balance", "income"])

On my machine the original code gives
28.732436 seconds (304.93 M allocations: 10.426 GiB, 9.61% gc time)
While the new code gives
4.446898 seconds (8.36 M allocations: 2.351 GiB, 10.44% gc time)
Plus almost 6X increase in ESS 🙂

And this is what I’ve created from that solution, adapted to my needs:

using Turing, DataFrames, Chain, CSV, HTTP
using Random:seed!
using Statistics: mean, std
using LinearAlgebra:qr

seed!(1)
setprogress!(true)

df = @chain HTTP.get("https://github.com/selva86/datasets/blob/master/mpg_ggplot2.csv?raw=TRUE") begin
    _.body
    CSV.read(DataFrame)
end

#### Data Prep ####
idx_map = Dict(key => idx for (idx, key) in enumerate(unique(df.class)))
y = df[:, :hwy]
idx = getindex.(Ref(idx_map), df.class)
X = Matrix(select(df, [:displ, :year])) # the model matrix

#### QR Decomposition ####
Q, R = qr(X)
Q_ast = Matrix(Q) * sqrt(size(X, 1) - 1)
R_ast = R / sqrt(size(X, 1) - 1)
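# Optional sanity check: the scaling by sqrt(n - 1) cancels out, so the
# scaled factors should still reproduce X up to floating-point error.
@assert Q_ast * R_ast ≈ X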

#### NCP Varying Intercept Model ####
@model varying_intercept_ncp(X, idx, y; n_gr=length(unique(idx)), predictors=size(X, 2)) = begin
    # priors
    μ ~ Normal(mean(y), 2.5 * std(y))       # population-level intercept
    σ ~ Exponential(1 / std(y))             # residual SD
    # Coefficients Student-t(ν = 3)
    θ ~ filldist(TDist(3), predictors)
    # Prior for variance of random intercepts. Usually requires thoughtful specification.
    τ ~ truncated(Cauchy(0, 2), 0, Inf)
    zⱼ ~ filldist(Normal(0, 1), n_gr)      # NCP group-level intercepts

    # likelihood
    ŷ = μ .+ X * θ .+ zⱼ[idx] .* τ
    y ~ MvNormal(ŷ, σ)
end

model_qr_ncp = varying_intercept_ncp(Q_ast, idx, float(y))


chn_qr_ncp = sample(model_qr_ncp, NUTS(1_000, 0.65), MCMCThreads(), 2_000, 4)

#### get αⱼ from zⱼ by zⱼ * τ ####
τ = summarystats(chn_qr_ncp)[:τ, :mean]
αⱼ = mapslices(x -> x * τ, chn_qr_ncp[:, namesingroup(chn_qr_ncp, :zⱼ),:].value.data, dims=[2])
chn_ncp_reconstructed = hcat(Chains(αⱼ, ["αⱼ[$i]" for i in 1:length(unique(idx))]), chn_qr_ncp)

#### get β by R_ast^-1 * θ ####
betas = mapslices(x -> R_ast^-1 * x, chn_qr_ncp[:,namesingroup(chn_qr_ncp, :θ),:].value.data, dims=[2])
chn_qr_ncp_reconstructed = hcat(Chains(betas, ["β[$i]" for i in 1:size(Q_ast, 2)]), chn_ncp_reconstructed)
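
chn_qr_ncp_reconstructed is an ordinary Chains object, so the usual methods apply to it; for example, as a quick check on the recovered scales:

#### Inspect the reconstructed chain ####
summarystats(chn_qr_ncp_reconstructed)
quantile(group(chn_qr_ncp_reconstructed, :β))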