This is the current status:
# --- Simulation setup (top-level script; unseeded RNG, so nondeterministic) ---
n = 1000
# Two samples of n integer event times each, uniform on 1:200.
t0 = rand(1:200, n)
t1 = rand(1:200, n)
# Stacked event times for the pooled sample of 2n observations.
t = vcat(t0, t1)
# Censoring times: with probability 1/2 the observation is uncensored
# (c[i] == t[i]); otherwise censored at a uniform draw from 1:t[i].
c = similar(t)
for i in 1:2n
rand() < 0.5 ? c[i] = t[i] : c[i] = rand(1:t[i])
end
# Covariate matrices, n × 2 each: magnitude uniform on (0, 100) with a
# random sign per entry.
xx0 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)]
xx1 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)]
# Pooled 2n × 2 covariate matrix matching `t` and `c`.
xx = vcat(xx0, xx1)
# Parameter values at which the objective below is evaluated.
α = 1.2
β = [-1.33, -0.006, 0.12]
γ = [0.07, 0.03, 0.5]
ρ = 11.3
ψ0 = 0.046
ψ1 = 0.040
"""
    logℒ_fast(α, β, t, c, x)

Weibull-type censored log-likelihood.

For each row `i` the linear index is `xb = β[1] + sum(x[i, j] * β[j+1])`.
An observation is uncensored when `c[i] != t[i]` and then contributes
`log|α| + (|α| - 1) * log(t[i]) + xb`; every observation contributes the
survival term `-t[i]^|α| * exp(xb)`.

# Arguments
- `α`: shape parameter (its absolute value is used).
- `β`: coefficients of length `size(x, 2) + 1`; `β[1]` is the intercept.
- `t`, `c`: event and censoring times, one per row of `x`.
- `x`: `n × k` covariate matrix.

# Throws
- `DimensionMismatch` if `t`, `c`, or `β` are inconsistent with `size(x)`.
"""
function logℒ_fast(α, β, t, c, x)
    eα = abs(α)
    n, k = size(x)
    (n == length(t) == length(c) && length(β) == k + 1) || throw(DimensionMismatch())
    # Hoist the loop-invariant log(|α|) out of the hot loop.
    logeα = log(eα)
    # Accumulator type stable w.r.t. all argument types, not just typeof(α).
    s = zero(promote_type(typeof(eα), eltype(β), eltype(x)))
    for i in 1:n
        xb = sum(x[i, j] * β[j+1] for j in 1:k) + β[1]
        ti = t[i]
        # `c[i] != ti` is the uncensored indicator; a Bool multiplies as 0/1.
        s += (c[i] != ti) * (logeα + (eα - 1) * log(ti) + xb) - ti^eα * exp(xb)
    end
    return s
end
"""
    softplus(x)

Numerically stable `log(1 + exp(x))`: for large `x` the naive form overflows,
so use the identity `log1p(exp(x)) = max(x, 0) + log1p(exp(-|x|))`.
"""
softplus(x) = max(x, zero(x)) + log1p(exp(-abs(x)))
"""
    𝒬_fast(γ, t, x)

Logit-type objective over two threshold events per observation.

For each row `i`, `xb0 = sum(x[i, j] * γ[j])` over the `k` columns and
`xb1 = xb0 + γ[end]`; the contribution is
`(t[i] > 0) * xb0 + (t[i] > 1) * xb1 - softplus(xb0) - softplus(xb1)`.

# Throws
- `DimensionMismatch` if `length(t) != size(x, 1)` or
  `length(γ) != size(x, 2) + 1`.
"""
function 𝒬_fast(γ, t, x)
    n, k = size(x)
    # Validate before any work so bad inputs fail fast.
    (n == length(t) && length(γ) == k + 1) || throw(DimensionMismatch())
    # Type-stable floating-point zero accumulator; the original
    # `log(1 + zero(eltype(x)))` was an obscure way to write the same value
    # and ignored eltype(γ).
    s = zero(float(promote_type(eltype(x), eltype(γ))))
    for i in 1:n
        xb0 = sum(x[i, j] * γ[j] for j in 1:k)
        xb1 = xb0 + γ[end]
        s += (t[i] > 0) * xb0 + (t[i] > 1) * xb1 - softplus(xb0) - softplus(xb1)
    end
    return s
end
"""
    removeredundant(x)

Return `x` with exactly duplicated columns removed, keeping the first
occurrence of each distinct column in its original order.

NOTE(review): despite the original "removes collinear columns" comment, this
only drops *identical* columns — it does not detect general linear dependence.
A rank-revealing factorization (e.g. pivoted QR) would be needed for that.
"""
function removeredundant(x)
    # Select by column index instead of `hcat(unique(eachcol(x))...)`:
    # avoids splatting a potentially long column list and always returns a
    # matrix, even when only one distinct column remains.
    keep = unique(i -> view(x, :, i), axes(x, 2))
    return x[:, keep]
end
"""
    𝒞_fast(ρ, t0, t1, xx0, xx1)

Penalty on the products of OLS residuals.

Regress `log(t1)` and `log(t0)` on the deduplicated design `[xx0 xx1]` and
return minus the sum of squared deviations of the per-observation residual
product from `ρ`.

# Throws
- `DimensionMismatch` if `t0`, `t1`, and the rows of `xx0` disagree in length.
"""
function 𝒞_fast(ρ, t0, t1, xx0, xx1)
    # Fail fast: the original only checked dimensions *after* the two
    # least-squares solves below had already run.
    (size(xx0, 1) == length(t0) == length(t1)) || throw(DimensionMismatch())
    xx = removeredundant([xx0 xx1])
    y1 = log.(t1)
    y0 = log.(t0)
    # OLS coefficients via the least-squares solve `\` (never inv(X'X)X'y).
    b1 = xx \ y1
    b0 = xx \ y0
    s = zero(ρ)
    n, k = size(xx)
    for i in 1:n
        # Residuals of the two regressions at observation i.
        e1 = y1[i] - sum(xx[i, j] * b1[j] for j in 1:k)
        e0 = y0[i] - sum(xx[i, j] * b0[j] for j in 1:k)
        s -= (e1 * e0 - ρ)^2
    end
    return s
end
"""
    boole_i(t, age; ll_ret=1, ul_ret=2)

`Bool` indicator for `t` lying inside the retirement window
`[age - ll_ret, age + ul_ret]`.
"""
function boole_i(t, age; ll_ret=1, ul_ret=2)
    lower = age - ll_ret
    upper = age + ul_ret
    return lower <= t && t <= upper
end
"""
    𝒮_fast(ψ0, ψ1, t0, t1; age=4*(65-45), ll_ret=1, ul_ret=2)

Squared-error penalty matching the retirement-window indicators of `t0` and
`t1` (via `boole_i`) against the target probabilities `ψ0` and `ψ1`.

# Throws
- `DimensionMismatch` if `t0` and `t1` differ in length.
"""
function 𝒮_fast(ψ0, ψ1, t0, t1; age=4*(65-45), ll_ret=1, ul_ret=2)
    m = length(t0)
    m == length(t1) || throw(DimensionMismatch())
    acc = zero(typeof(ψ0))
    for j in 1:m
        # Deviations of the 0/1 window indicators from their targets.
        d0 = boole_i(t0[j], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ0
        d1 = boole_i(t1[j], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ1
        acc -= d0^2 + d1^2
    end
    return acc
end
# Total objective: the sum of the four criterion components evaluated at the
# globals defined above. The value of this top-level expression is the result.
logℒ_fast(α, β, t, c, xx) +
𝒬_fast(γ, t, xx) +
𝒞_fast(ρ, t0, t1, xx0, xx1) +
𝒮_fast(ψ0, ψ1, t0, t1)
There are some allocations I didn't know how to avoid in `𝒞_fast`:

- I need to remove collinear columns from the matrix `[xx0 xx1]`.
- I need to compute the OLS coefficients of regressing `log(t1)` on `xx`, and `log(t0)` on `xx`.

Does anyone have any suggestions on avoiding those allocations?
Does anyone have any suggestions on further optimizations? (I know I can transpose the matrices to exploit the column-major ordering of Julia).
Also, as I mentioned above, I get an error when trying to use `LoopVectorization` in the loops. I think that happens because `@turbo` does not support more than one index, and I want to generalize the program to allow for a larger `xx` matrix. How can I avoid that error?