Choice of Kernel Function (MultiOutput Gaussian Process)

Hi!

I am using Gaussian Processes in my research, but I lack some theoretical knowledge about multi-output kernels.
For the moment, I am using the JuliaGaussianProcesses ecosystem.

My training data are maps associated with a few parameters. The maps are pretty large (between 1,000 and 10,000 points each), so I apply a dimensionality reduction technique (PCA/ICA or KernelPCA, whichever reconstructs the test set best) and then try to predict the resulting scores with a GP.

FYI, here is the KernelPCA code (MultivariateStats.jl), so that you know what I call a ‘score’:

using MultivariateStats, LinearAlgebra

# γ is the RBF bandwidth of the KernelPCA kernel; β regularizes the inverse (pre-image) map
M = fit(KernelPCA, ytrain_raw;
    inverse=true, β=1e-10, maxoutdim=n_out, solver=:eig, maxiter=10_000,
    kernel=(x, y) -> exp(-norm(x - y)^2 / γ));
ytrain = predict(M)  # projections of the training maps: the so-called scores
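
(And for completeness, this is how I go back from scores to a full map, since inverse=true enables the pre-image map; ypred here stands for any n_out × n matrix of scores, e.g. GP predictions:)

map_rec = reconstruct(M, ypred)  # approximate pre-image: scores back to map space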

And after that, it is the usual thing: build a GP and optimize the hyperparameters with Zygote (mainly a copy-paste of the docs).

It is easy with one-dimensional outputs, but here I have absolutely no idea which MO kernel I should use: IndependentMOKernel, IntrinsicCoregionMOKernel, … And even within a given kernel, I don’t know which parameters should be optimized and which should be precomputed.
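
For instance, if IntrinsicCoregionMOKernel is the way to go, I imagine the kernel builder would look something like the sketch below (the parameterization of the mixing matrix B through a free factor A is just my guess at keeping it positive definite and trainable; n_out is the number of retained scores):

θ_icm = (lSE = positive([1.0, 1.0, 1.0]),  # ARD lengthscales, as before
    A = randn(n_out, n_out),               # unconstrained factor of the mixing matrix
    σnugget = positive(exp(-1.0)),)

function kernel_icm(θ)
    k = SEKernel() ∘ ARDTransform(1 ./ θ.lSE)
    B = θ.A * θ.A' + 1e-6 * I              # B = AAᵀ + εI, symmetric positive definite
    return IntrinsicCoregionMOKernel(; kernel=k, B=B)
end

Should B be learned jointly with the lengthscales like this, or is it better to fix it beforehand (e.g. from the empirical covariance of the training scores)?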

PS: I tried the spectral_mixture_kernel, which seems great when the data structure is unknown, but for obscure reasons Zygote crashes (with a huge, unreadable error message), and such a kernel is not (yet?) implemented for multi-output.

For now, I am simply using an IndependentMOKernel wrapping an SEKernel, with a different lengthscale for each of the three input parameters (plus noise on the training data).

Build and optimize:
using AbstractGPs, KernelFunctions, ParameterHandling, Optim, Zygote

# flatten the multi-output data into the "isotopic" format expected by the MO kernels
x, y = prepare_isotopic_multi_output_data(ColVecs(xtrain), ColVecs(ytrain))

θ_init = (lSE = positive([1.0, 1.0, 1.0]),  # ARD lengthscales, one per input parameter
    σ = positive(exp(-1.0)),                # signal standard deviation
    σnugget = positive(exp(-1.0)),)         # observation-noise standard deviation

function kernel_(θ)
    k = SEKernel() ∘ ARDTransform(1 ./ θ.lSE)  # squared-exponential kernel with ARD
    return θ.σ^2 * IndependentMOKernel(k)      # same kernel for every output, no correlations
end;

function build_gp_prior(θ)
    return GP(kernel_(θ))
end;

function build_finite_gp(θ)
    f = build_gp_prior(θ)
    return f(x, θ.σnugget^2)
end;

function build_posterior_gp(θ)
    fx = build_finite_gp(θ)
    return posterior(fx, y)
end;

function loss(θ)
    fx = build_finite_gp(θ)
    lml = logpdf(fx, y)
    return -lml
end
loss(ParameterHandling.value(θ_init))  # sanity check of the objective at the initial values

default_optimizer = Optim.LBFGS(;
    alphaguess=Optim.LineSearches.InitialStatic(; scaled=true),
    linesearch=Optim.LineSearches.BackTracking()
)

function optimize_loss(loss, θ_init_; optimizer=default_optimizer, maxiter=1_000)
    options = Optim.Options(; iterations=maxiter, show_trace=false)

    θ_flat_init, unflatten = ParameterHandling.value_flatten(θ_init_)
    loss_packed = loss ∘ unflatten

    # https://julianlsolvers.github.io/Optim.jl/stable/#user/tipsandtricks/#avoid-repeating-computations
    function fg!(F, G, x)
        if F !== nothing && G !== nothing
            val, grad = Zygote.withgradient(loss_packed, x)
            G .= only(grad)
            return val
        elseif G !== nothing
            grad = Zygote.gradient(loss_packed, x)
            G .= only(grad)
            return nothing
        elseif F !== nothing
            return loss_packed(x)
        end
    end

    result = Optim.optimize(Optim.only_fg!(fg!), θ_flat_init, optimizer, options; inplace=false)

    return unflatten(result.minimizer), result
end

@time θ_opt, min_obj = let
    best_θ_opt = nothing
    best_min_obj = Inf

    for _ in 1:1 # increase the range to restart from several random initial conditions
        θ_init_ = fθ_init() # random initial values
        # try
            θ_opt, opt_result = optimize_loss(loss, θ_init_)
            if Optim.converged(opt_result)
                min_obj = opt_result.minimum
                @info "Optimization converged" θ_opt min_obj
                if min_obj < best_min_obj
                    best_θ_opt = θ_opt
                    best_min_obj = min_obj
                end
            else
                @warn "Optimization did not converge"
            end
        # catch e
        #     @warn "Optimization failed - error"
        #     println(e)
        # end
    end
    best_θ_opt, best_min_obj
end
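
After that, I rebuild the posterior with the optimized parameters and predict the scores at the test parameters, roughly like the sketch below (x_test is hypothetical here, a matrix with one column per test parameter set; if I read the KernelFunctions docs right, MOInput groups all test points for output 1 first, then output 2, and so on):

post = build_posterior_gp(θ_opt)                             # posterior with optimized hyperparameters
x_test_mo = KernelFunctions.MOInput(ColVecs(x_test), n_out)  # test inputs in multi-output format
μ = mean(post(x_test_mo))                                    # flattened predictive means
scores_pred = reshape(μ, :, n_out)'                          # n_out × n_test predicted scores

which I then feed to reconstruct(M, …) as above to get back full maps.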

But the predictive performance is not that good, even for the first PCA scores (the last ones are mostly noise, so I don’t expect to predict them well, but the first ones are important).
Do you have any ideas or advice on the choice of kernel function to get better predictions?

Thank you in advance! :slight_smile: