# This script fits a small linear regression model to simulated (fake) data.
# Unexpectedly, shuffling the rows of the DataFrame changes the estimates.
import Random
using DataFrames
using Pipe: @pipe
using StatsBase: StatsBase, mad, median, percentile, sample, shuffle
using Distributions
using Turing
Random.seed!(1)
##
# Simulate the regression data set and record the true parameter values.
#
# Binds two globals by destructuring the NamedTuple built on the last line:
#   df         – the design columns x_0 … x_4 plus the noiseless mean μ, the noise
#                draw ϵ and the outcome y = μ + ϵ;
#   parameters – a two-column table (parameter, value) holding the true values.
df, parameters = let
    n_obs = 100_000
    noise_sd = 2
    coefs = (β_0=7, β_1=0.05, β_2=0.10, β_3=0.15, β_4=0.20)

    # True values, in the order β_0 … β_4, σ.
    truth = (coefs..., σ=noise_sd)
    truth_table = DataFrame(
        parameter=collect(keys(truth)),
        value=collect(values(truth)),
    )

    # The random draws happen in this order (x_1, x_2, x_3, x_4, then the noise);
    # keep it so the seeded global RNG yields the same data.
    design = DataFrame(
        x_0=fill(1, n_obs),
        x_1=rand([1, 2], n_obs),
        x_2=rand(1:1:10, n_obs),
        x_3=rand([0, 1], n_obs),
        x_4=rand(0:1:10, n_obs),
    )

    signal = Matrix(design) * collect(coefs)
    noise = rand(Normal(0, noise_sd), n_obs)

    # Work on a copy so the design table itself is left untouched.
    simulated = copy(design)
    simulated[!, :μ] = signal
    simulated[!, :ϵ] = noise
    simulated[!, :y] = signal .+ noise

    (df=simulated, parameters=truth_table)
end
##
# Linear regression model of `y` on the columns x_0 … x_4 of `X`.
#
# `X` must expose the properties `x_0`, `x_1`, `x_2`, `x_3`, `x_4` (read below via
# `X.x_k`); `y` is the observed outcome, used both as data and to scale the priors.
# The random variables are declared in the order β_0, β_1, β_2, β_3, β_4, σ.
@model function linear_outcome_model(X, y)
# Intercept prior: centred on the sample mean of y with scale 2·std(y).
β_0 ~ Normal(mean(y), 2std(y))
# Each slope gets the same Normal(0, 0.5) prior.
β_1 ~ Normal(0, .5)
β_2 ~ Normal(0, .5)
β_3 ~ Normal(0, .5)
β_4 ~ Normal(0, .5)
# Linear predictor, computed element-wise: entry i combines entry i of every column
# of X, so X and y must be aligned row by row for the likelihood to be meaningful.
μ = (
β_0 .* X.x_0
.+ β_1 .* X.x_1
.+ β_2 .* X.x_2
.+ β_3 .* X.x_3
.+ β_4 .* X.x_4
)
# Noise scale prior: Normal(0, 2·std(y)) truncated to [0, Inf).
σ ~ truncated(Normal(0, 2std(y)), 0, Inf)
# Likelihood of the whole outcome vector given μ and σ.
# NOTE(review): the second argument of `MvNormal` is a scalar here — presumably
# interpreted as a common standard deviation for every observation; confirm against
# the installed Distributions version.
y ~ MvNormal(μ, σ)
end
# Predictor columns handed to the model.
fields = [:x_0, :x_1, :x_2, :x_3, :x_4]

# Fit on the rows in their original order. Predictors and outcome are both read from
# the same bound variable `df`.
samples_original = sample(linear_outcome_model(df[:, fields], df.y), NUTS(0.65), 3000)

# Fit on a row-permuted copy of the same data.
#
# The permutation has to be materialised exactly once and bound to a name.
# `@pipe` replaces every `_` in a stage with the *expression* of the previous stage,
# so a stage such as `linear_outcome_model(_[:, fields], _.y)` placed after
# `DataFrame(shuffle(eachrow(_)))` evaluates the shuffle twice — once for the
# predictors and once for the outcome — with two different permutations. That
# silently breaks the row alignment between X and y and drives every slope to ≈ 0.
row_order = Random.shuffle(1:nrow(df))
df_shuffled = df[row_order, :]
samples_shuffled = sample(
    linear_outcome_model(df_shuffled[:, fields], df_shuffled.y),
    NUTS(0.65),
    3000,
)
# Tabulate posterior mean and standard deviation next to the true parameter values.
#
# `samples` is summarised with `summarize(samples, mean, std)`; the summary is
# converted to a DataFrame, its `parameters` column is renamed to `parameter`, and
# the result is inner-joined on `parameter` with the global `parameters` table.
function compare_samples(samples)
    posterior_summary = DataFrame(summarize(samples, mean, std))
    keyed_summary = rename(posterior_summary, :parameters => :parameter)
    return innerjoin(keyed_summary, parameters, on=:parameter)
end
# Posterior summary for the fit on the rows in their original order.
comparison_original = compare_samples(samples_original)
# Recorded output of `comparison_original`; the posterior means are close to the
# true values in the `value` column.
# 6×4 DataFrame
# Row │ parameter mean std value
# │ Symbol Float64 Float64 Real
#─────┼─────────────────────────────────────────
# 1 │ β_0 6.97052 0.0266058 7
# 2 │ β_1 0.0417781 0.0130792 0.05
# 3 │ β_2 0.101241 0.00227608 0.1
# 4 │ β_3 0.170157 0.012559 0.15
# 5 │ β_4 0.203193 0.00201488 0.2
# 6 │ σ 1.99352 0.00446498 2
# Posterior summary for the fit on the shuffled rows.
comparison_shuffled = compare_samples(samples_shuffled)
# Recorded output of `comparison_shuffled` from the run that exposed the problem:
# every slope is ≈ 0 while β_0 ≈ 8.70 and σ ≈ 2.12. This is the signature of an
# intercept-only fit, consistent with the predictors and the outcome no longer
# being aligned row by row.
# 6×4 DataFrame
# Row │ parameter mean std value
# │ Symbol Float64 Float64 Real
# ─────┼───────────────────────────────────────────
# 1 │ β_0 8.70263 0.0270448 7
# 2 │ β_1 -0.00431323 0.0133114 0.05
# 3 │ β_2 -0.00108792 0.00232939 0.1
# 4 │ β_3 0.0106373 0.0134052 0.15
# 5 │ β_4 -0.00122068 0.00215966 0.2
# 6 │ σ 2.11683 0.0047302 2