How to speed up these functions?

This is the current status:

n = 1000
t0 = rand(1:200, n)
t1 = rand(1:200, n)
t = vcat(t0, t1)

c = similar(t)
for i in 1:2n
    rand() < 0.5 ? c[i] = t[i] : c[i] = rand(1:t[i])
end

xx0 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)]
xx1 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)]
xx = vcat(xx0, xx1)

α = 1.2
β = [-1.33, -0.006, 0.12]
γ = [0.07, 0.03, 0.5]
ρ = 11.3
ψ0 = 0.046
ψ1 = 0.040

function logℒ_fast(α, β, t, c, x)
    eα = abs(α)
    n, k = size(x)

    (n == length(t) == length(c) && length(β) == k + 1) || throw(DimensionMismatch())
    s = zero(typeof(α))
    for i in 1:n
        xb = sum(x[i, j] * β[j+1] for j in 1:k) + β[1]
        ti = t[i]
        s += (1 - (c[i] == ti)) * (log(eα) + (eα - 1) * log(ti) + xb) - ti^eα * exp(xb)
    end
    return s
end

# compute log(1 + exp(x)) without spurious overflow
softplus(x) = log1p(exp(-abs(x))) + max(x, 0)

function 𝒬_fast(γ, t, x)
    s = log(1 + zero(eltype(x)))
    n, k = size(x)
    (n == length(t) && length(γ) == k + 1) || throw(DimensionMismatch())
    for i = 1:n
        xb0 = sum(x[i, j] * γ[j] for j in 1:k)
        xb1 = xb0 + γ[end]
        s += (t[i] > 0) * xb0 + (t[i] > 1) * xb1 - softplus(xb0) - softplus(xb1)
    end
    return s
end

## removes collinear columns
function removeredundant(x)
    return hcat(unique(eachcol(x))...)
end

function 𝒞_fast(ρ, t0, t1, xx0, xx1)
    xx = removeredundant([xx0 xx1])

    y1 = log.(t1)
    y0 = log.(t0)
    b1 = xx \ y1
    b0 = xx \ y0

    s = zero(eltype(ρ))
    n, k = size(xx)
    (n == length(t0) == length(t1)) || throw(DimensionMismatch())
    for i = 1:n
        e1 = y1[i] - sum(xx[i, j] * b1[j] for j in 1:k)
        e0 = y0[i] - sum(xx[i, j] * b0[j] for j in 1:k)
        s -= (e1 * e0 - ρ)^2
    end
    return s
end

function boole_i(t, age; ll_ret=1, ul_ret=2)
    age - ll_ret <= t <= age + ul_ret
end

function 𝒮_fast(ψ0, ψ1, t0, t1; age=4*(65-45), ll_ret=1, ul_ret=2)
    n = length(t0)
    (n == length(t1)) || throw(DimensionMismatch())
    s = zero(typeof(ψ0))

    for i in 1:n
        s -= (boole_i(t0[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ0)^2 +
            (boole_i(t1[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ1)^2
    end
    return s
end

logℒ_fast(α, β, t, c, xx) +
𝒬_fast(γ, t, xx) +
𝒞_fast(ρ, t0, t1, xx0, xx1) +
𝒮_fast(ψ0, ψ1, t0, t1)

There are some allocations I didn’t know how to avoid in 𝒞_fast:

  1. I need to remove collinear columns in the matrix [xx0 xx1].
  2. I need to compute the OLS coefficients of regressing log(t1) on xx, and log(t0) on xx.

Does anyone have any suggestions on avoiding those allocations?
Does anyone have any suggestions on further optimizations? (I know I can transpose the matrices to exploit the column-major ordering of Julia).

Also, as I mentioned above, I get an error when trying to use LoopVectorization in the loops. I think that happens because @turbo does not support more than one index and I want to generalize the program to allow for a larger xx matrix. How can I avoid that error?