There is an issue for `end`

support. There also seems to be another problem that I’m fixing at the moment.

@amrods

This is what I’m getting:

```
julia> @btime logℒ_fast($α, $β, $t, $c, $xx) +
𝒬_fast($γ, $t, $xx) +
𝒞_fast($ρ, $t0, $t1, $xx0, $xx1) +
𝒮_fast($ψ0, $ψ1, $t0, $t1)
1.140 ms (18092 allocations: 704.41 KiB)
-9.978867771099854e8
julia> @btime logℒ_fast_turbo($α, $β, $t, $c, $xx) +
𝒬_fast_turbo($γ, $t, $xx) +
𝒞_fast_turbo($ρ, $t0, $t1, $xx0, $xx1) +
𝒮_fast_simd($ψ0, $ψ1, $t0, $t1)
121.907 μs (148 allocations: 299.58 KiB)
-9.978867771099855e8
```

All those allocations, especially with the original versions, suggest you can probably still optimize them by a lot. But I just wanted to show `@turbo`

on three of them.

##
Code

```
using LoopVectorization, BenchmarkTools
n = 1000;
t0 = rand(1:200, n);
t1 = rand(1:200, n);
t = vcat(t0, t1);
c = similar(t);
for i in 1:2n
rand() < 0.5 ? c[i] = t[i] : c[i] = rand(1:t[i])
end
xx0 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)];
xx1 = 100 .* [rand(n).*rand([-1, 1], n) rand(n).*rand([-1, 1], n)];
xx = vcat(xx0, xx1);
α = 1.2;
β = [-1.33, -0.006, 0.12];
γ = [0.07, 0.03, 0.5];
ρ = 11.3;
ψ0 = 0.046;
ψ1 = 0.040;
function logℒ_fast(α, β, t, c, x)
eα = abs(α)
n, k = size(x)
(n == length(t) == length(c) && length(β) == k + 1) || throw(DimensionMismatch())
s = zero(typeof(α))
@inbounds for i in 1:n
xb = sum(@inbounds(x[i, j] * β[j+1]) for j in 1:k) + β[1]
ti = t[i]
s += (1 - (c[i] == ti)) * (log(eα) + (eα - 1) * log(ti) + xb) - ti^eα * exp(xb)
end
return s
end
# compute log(1 + exp(x)) without spurious overflow
softplus(x) = log1p(exp(-abs(x))) + max(x, 0)
function 𝒬_fast(γ, t, x)
s = log(1 + zero(eltype(x)))
n, k = size(x)
(n == length(t) && length(γ) == k + 1) || throw(DimensionMismatch())
@inbounds for i = 1:n
xb0 = sum(@inbounds(x[i, j] * γ[j]) for j in 1:k)
xb1 = xb0 + γ[end]
s += (t[i] > 0) * xb0 + (t[i] > 1) * xb1 - softplus(xb0) - softplus(xb1)
end
return s
end
## removes collinear columns
function removeredundant(x)
return hcat(unique(eachcol(x))...)
end
function 𝒞_fast(ρ, t0, t1, xx0, xx1)
xx = removeredundant([xx0 xx1])
y1 = log.(t1)
y0 = log.(t0)
b1 = xx \ y1
b0 = xx \ y0
s = zero(eltype(ρ))
n, k = size(xx)
(n == length(t0) == length(t1)) || throw(DimensionMismatch())
@inbounds for i = 1:n
e1 = y1[i] - sum(@inbounds(xx[i, j] * b1[j]) for j in 1:k)
e0 = y0[i] - sum(@inbounds(xx[i, j] * b0[j]) for j in 1:k)
s -= (e1 * e0 - ρ)^2
end
return s
end
function boole_i(t, age; ll_ret=1, ul_ret=2)
age - ll_ret <= t <= age + ul_ret
end
function 𝒮_fast(ψ0, ψ1, t0, t1; age=4*(65-45), ll_ret=1, ul_ret=2)
n = length(t0)
(n == length(t1)) || throw(DimensionMismatch())
s = zero(typeof(ψ0))
for i in 1:n
s -= (boole_i(t0[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ0)^2 +
(boole_i(t1[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ1)^2
end
return s
end
logℒ_fast(α, β, t, c, xx) +
𝒬_fast(γ, t, xx) +
𝒞_fast(ρ, t0, t1, xx0, xx1) +
𝒮_fast(ψ0, ψ1, t0, t1)
function logℒ_fast_turbo(α, β, t, c, x)
eα = abs(α)
n, k = size(x)
(n == length(t) == length(c) && length(β) == k + 1) || throw(DimensionMismatch())
s = zero(typeof(α))
@turbo for i in 1:n
xb0 = 0.0
for j in 1:k
xb0 += x[i,j] * β[j+1]
end
xb = xb0 + β[1]
ti = t[i]
s += (1 - (c[i] == ti)) * (log(eα) + (eα - 1) * log(ti) + xb) - ti^eα * exp(xb)
end
return s
end
function 𝒬_fast_turbo(γ, t, x)
s = log(1 + zero(eltype(x)))
n, k = size(x)
(n == length(t) && length(γ) == k + 1) || throw(DimensionMismatch())
@turbo for i = 1:n
xb0 = 0.0
for j in 1:k
xb0 += x[i, j] * γ[j]
end
xb1 = xb0 + γ[k+1]
s += (t[i] > 0) * xb0 + (t[i] > 1) * xb1 - softplus(xb0) - softplus(xb1)
end
return s
end
function 𝒞_fast_turbo(ρ, t0, t1, xx0, xx1)
xx = removeredundant([xx0 xx1])
y1 = log.(t1)
y0 = log.(t0)
b1 = xx \ y1
b0 = xx \ y0
s = zero(eltype(ρ))
n, k = size(xx)
(n == length(t0) == length(t1)) || throw(DimensionMismatch())
@turbo for i = 1:n
e1acc = 0.0
e0acc = 0.0
for j in 1:k
e1acc += xx[i, j] * b1[j]
e0acc += xx[i, j] * b0[j]
end
e1 = y1[i] - e1acc
e0 = y0[i] - e0acc
s -= (e1 * e0 - ρ)^2
end
return s
end
function 𝒮_fast_simd(ψ0, ψ1, t0, t1; age=4*(65-45), ll_ret=1, ul_ret=2)
n = length(t0)
(n == length(t1)) || throw(DimensionMismatch())
s = zero(typeof(ψ0))
@inbounds @simd for i in 1:n
s -= (boole_i(t0[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ0)^2 +
(boole_i(t1[i], age; ll_ret=ll_ret, ul_ret=ul_ret) - ψ1)^2
end
return s
end
@btime logℒ_fast($α, $β, $t, $c, $xx) +
𝒬_fast($γ, $t, $xx) +
𝒞_fast($ρ, $t0, $t1, $xx0, $xx1) +
𝒮_fast($ψ0, $ψ1, $t0, $t1)
@btime logℒ_fast_turbo($α, $β, $t, $c, $xx) +
𝒬_fast_turbo($γ, $t, $xx) +
𝒞_fast_turbo($ρ, $t0, $t1, $xx0, $xx1) +
𝒮_fast_simd($ψ0, $ψ1, $t0, $t1)
```