I have a million-observation regression problem with one outcome and several predictors, including a time variable. I would like to fit the over-time variation with a Gaussian process. I tried using AbstractGPs with DynamicHMC, but it clearly won't scale to that size: even on just 400 observations it took 8 minutes (code below), so a million observations is far out of reach.
I see there are a lot of options I haven’t tried: AdvancedHMC, EllipticalSliceSampling, VariationalInference, TemporalGPs, and probably others.
Rather than blindly trying them all, I’d like to ask for some guidance: what would you suggest I do here?
using AbstractGPs
using Distributions
using StatsFuns
using DynamicHMC
using LogDensityProblems
using Random
Random.seed!(1234)
# Synthetic 1-D regression problem: a noisy sine wave observed at n
# evenly spaced points on [-5, 5].
const n = 400
const x_train = collect(range(-5.0, 5.0; length=n))
const y_train = randn(n) .+ sin.(2 .* x_train);
"""
    gp_loglikelihood(x, y)

Return a closure `params -> logpdf` giving the GP marginal log-likelihood
of observations `(x, y)` as a function of the unconstrained kernel
parameters. `softplus` maps the unconstrained parameters to the positive
variance and inverse length-scale; observation noise variance is fixed
at 0.1.
"""
function gp_loglikelihood(x, y)
    function loglikelihood(params)
        # Positive-constrained hyperparameters.
        variance = softplus(params[1])
        inv_lengthscale = softplus(params[2])
        kernel = variance * (Matern52Kernel() ∘ ScaleTransform(inv_lengthscale))
        # Finite-dimensional projection of the GP prior at `x`, with noise 0.1.
        fx = GP(kernel)(x, 0.1)
        return logpdf(fx, y)
    end
    return loglikelihood
end
"""
    gp_posterior(x, y, p)

Build the GP posterior conditioned on `(x, y)` for unconstrained kernel
parameters `p`, using the same kernel parameterization and noise
variance (0.1) as `gp_loglikelihood`.
"""
function gp_posterior(x, y, p)
    variance = softplus(p[1])
    inv_lengthscale = softplus(p[2])
    kernel = variance * (Matern52Kernel() ∘ ScaleTransform(inv_lengthscale))
    prior = GP(kernel)
    return posterior(prior(x, 0.1), y)
end
# Likelihood closure specialized on the training data; its type is used
# below to hook into the LogDensityProblems interface.
const loglik_train = gp_loglikelihood(x_train, y_train)

# Standard-normal prior on each unconstrained kernel parameter.
# NOTE: the original `MvNormal(2, 1)` used the deprecated
# `MvNormal(dim, σ)` constructor (removed in recent Distributions.jl);
# for an isotropic standard normal the logpdf is exactly the sum of
# independent `Normal()` logpdfs, which also works for any length.
logprior(params) = sum(x -> logpdf(Normal(), x), params)

# Sampler configuration.
const n_samples = 2_000
const n_adapts = 1_000
# Unnormalized log joint density: prior plus marginal likelihood.
# Dispatching on `typeof(loglik_train)` makes the closure itself a
# valid LogDensityProblems target.
function LogDensityProblems.logdensity(ℓ::typeof(loglik_train), params)
    return logprior(params) + ℓ(params)
end
# The parameter space is two-dimensional: one unconstrained variance
# parameter and one unconstrained inverse length-scale parameter.
LogDensityProblems.dimension(::typeof(loglik_train)) = 2
# `loglik_train` only evaluates the log-density itself (order 0); it does
# not provide derivatives. Gradients are supplied separately by wrapping
# it in `ADgradient(:ForwardDiff, ...)` before sampling.
function LogDensityProblems.capabilities(::Type{<:typeof(loglik_train)})
    return LogDensityProblems.LogDensityOrder{0}()
end
let
    # Run NUTS with warmup; gradients of the log joint are obtained via
    # ForwardDiff through the ADgradient wrapper.
    # NOTE: `Random.GLOBAL_RNG` is deprecated — `Random.default_rng()`
    # is the supported way to get the task-local default RNG.
    @time mcmc_result = mcmc_with_warmup(
        Random.default_rng(),
        ADgradient(:ForwardDiff, loglik_train),
        n_samples,
    )
end # ~481 s for n = 400