This is the current status:
n = 1000
t0 = rand(1:200, n)
t1 = rand(1:200, n)
t = vcat(t0, t1)
c = similar(t)
for i in 1:2n
    c[i] = rand() < 0.5 ? t[i] : rand(1:t[i])
end
xx0 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)]
xx1 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)]
xx = vcat(xx0, xx1)
α = 1.2
β = [-1.33, -0.006, 0.12]
γ = [0.07, 0.03, 0.5]
ρ = 11.3
ψ0 = 0.046
ψ1 = 0.040
function logℒ_fast(α, β, t, c, x)
    eα = abs(α)
    n, k = size(x)
    (n == length(t) == length(c) && length(β) == k + 1) || throw(DimensionMismatch())
    s = zero(typeof(α))
    for i in 1:n
        xb = sum(x[i, j] * β[j+1] for j in 1:k) + β[1]
        ti = t[i]
        s += (1 - (c[i] == ti)) * (log(eα) + (eα - 1) * log(ti) + xb) - ti^eα * exp(xb)
    end
    return s
end
# compute log(1 + exp(x)) without spurious overflow
softplus(x) = log1p(exp(-abs(x))) + max(x, 0)
function 𝒬_fast(γ, t, x)
    s = log(1 + zero(eltype(x)))
    n, k = size(x)
    (n == length(t) && length(γ) == k + 1) || throw(DimensionMismatch())
    for i = 1:n
        xb0 = sum(x[i, j] * γ[j] for j in 1:k)
        xb1 = xb0 + γ[end]
        s += (t[i] > 0) * xb0 + (t[i] > 1) * xb1 - softplus(xb0) - softplus(xb1)
    end
    return s
end
## removes duplicated columns (exact duplicates only; it does not catch more general collinearity)
function removeredundant(x)
    return hcat(unique(eachcol(x))...)
end
function 𝒞_fast(ρ, t0, t1, xx0, xx1)
    xx = removeredundant([xx0 xx1])
    y1 = log.(t1)
    y0 = log.(t0)
    b1 = xx \ y1
    b0 = xx \ y0
    s = zero(eltype(ρ))
    n, k = size(xx)
    (n == length(t0) == length(t1)) || throw(DimensionMismatch())
    for i = 1:n
        e1 = y1[i] - sum(xx[i, j] * b1[j] for j in 1:k)
        e0 = y0[i] - sum(xx[i, j] * b0[j] for j in 1:k)
        s -= (e1 * e0 - ρ)^2
    end
    return s
end
function boole_i(t, age; ll_ret=1, ul_ret=2)
    age - ll_ret <= t <= age + ul_ret
end
function 𝒮_fast(ψ0, ψ1, t0, t1; age=4*(65-45), ll_ret=1, ul_ret=2)
    n = length(t0)
    (n == length(t1)) || throw(DimensionMismatch())
    s = zero(typeof(ψ0))
    for i in 1:n
        s -= (boole_i(t0[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ0)^2 +
            (boole_i(t1[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ1)^2
    end
    return s
end
logℒ_fast(α, β, t, c, xx) +
𝒬_fast(γ, t, xx) +
𝒞_fast(ρ, t0, t1, xx0, xx1) +
𝒮_fast(ψ0, ψ1, t0, t1)
There are some allocations I didn’t know how to avoid in 𝒞_fast:
- I need to remove collinear columns in the matrix [xx0 xx1].
- I need to compute the OLS coefficients of regressing log(t1) on xx, and log(t0) on xx.
Does anyone have any suggestions on avoiding those allocations?
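For concreteness, here is a variant I sketched but have not benchmarked (𝒞_sketch is just a throwaway name): it factors xx once with qr and reuses the factorization for both right-hand sides, but it still allocates y0, y1 and both coefficient vectors, so I suspect it only removes part of the cost.

using LinearAlgebra  # for qr

# Hypothetical variant of 𝒞_fast (uses removeredundant from above): factor xx
# once and reuse the factorization for both right-hand sides. It still
# allocates y0, y1, b0, b1, so it only saves one of the two factorizations.
function 𝒞_sketch(ρ, t0, t1, xx0, xx1)
    xx = removeredundant([xx0 xx1])
    y1 = log.(t1)
    y0 = log.(t0)
    F = qr(xx)   # shared QR factorization
    b1 = F \ y1  # least-squares coefficients of log(t1) on xx
    b0 = F \ y0  # least-squares coefficients of log(t0) on xx
    n, k = size(xx)
    (n == length(t0) == length(t1)) || throw(DimensionMismatch())
    s = zero(typeof(ρ))
    for i = 1:n
        e1 = y1[i] - sum(xx[i, j] * b1[j] for j in 1:k)
        e0 = y0[i] - sum(xx[i, j] * b0[j] for j in 1:k)
        s -= (e1 * e0 - ρ)^2
    end
    return s
end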
Does anyone have any suggestions for further optimizations? (I know I can transpose the matrices to exploit Julia's column-major memory layout.)
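For reference, this is the kind of rewrite I mean, shown only for logℒ_fast (logℒ_fast_t is a made-up name, and the caller would pass permutedims(xx) instead of xx):

# Sketch of a transposed version of logℒ_fast: xt is the k × n transpose of x,
# so the inner loop over j walks down a column, which is contiguous under
# Julia's column-major layout.
function logℒ_fast_t(α, β, t, c, xt)
    eα = abs(α)
    k, n = size(xt)
    (n == length(t) == length(c) && length(β) == k + 1) || throw(DimensionMismatch())
    s = zero(typeof(α))
    for i in 1:n
        xb = β[1]
        for j in 1:k
            xb += xt[j, i] * β[j+1]  # column-i entries are adjacent in memory
        end
        ti = t[i]
        s += (1 - (c[i] == ti)) * (log(eα) + (eα - 1) * log(ti) + xb) - ti^eα * exp(xb)
    end
    return s
end
# called as logℒ_fast_t(α, β, t, c, permutedims(xx))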
Also, as I mentioned above, I get an error when trying to use LoopVectorization in the loops. I think that happens because @turbo does not support more than one loop index, and since I want to generalize the program to allow for a larger xx matrix I cannot simply write out the columns by hand. How can I avoid that error?
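For concreteness, here is the nested-loop shape I would like @turbo to handle, written for 𝒬_fast without the macro so that it runs as plain Julia (𝒬_loop is a throwaway name; whether @turbo accepts a body that indexes x with both i and j is exactly what I am unsure about):

# Nested-loop version of 𝒬_fast's accumulation (relies on the softplus defined
# above), with the generator replaced by an explicit inner loop over the
# columns of x.
function 𝒬_loop(γ, t, x)
    n, k = size(x)
    (n == length(t) && length(γ) == k + 1) || throw(DimensionMismatch())
    s = zero(eltype(x))
    for i in 1:n
        xb0 = zero(eltype(x))
        for j in 1:k
            xb0 += x[i, j] * γ[j]
        end
        xb1 = xb0 + γ[end]
        s += (t[i] > 0) * xb0 + (t[i] > 1) * xb1 - softplus(xb0) - softplus(xb1)
    end
    return s
end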