I am trying to optimize some matrix multiplications inside a Turing model (a Kalman filter) but since I am depending on autodiff I can’t pre-allocate matrices and mutate things. How does one go about optimizing this type of code?

Below is a working snippet of the (unoptimized) model code and the code used to generate flame graphs. I have also commented the lines where the profiler collects the most samples, which reveal a bottle neck in `kalman_update`

.

```
using Turing, Random, LinearAlgebra
D = 35
x0 = randn(D) # init state
P0 = diagm(ones(D)) # init state covariance
F = diagm(ones(D)) # transition
Q = 0.1*diagm(ones(D)) # transition covariance
H = ones(1, D) # observation
R = 0.1*diagm(ones(1)) # observation covariance
xs = Vector{Array{Float64}}(undef, 10)
xs[1] = x0
for i in 2:length(xs)
xs[i] = F*xs[i-1]
end
ys = Matrix([only(H*xs[i] .+ randn(1)*R) for i in eachindex(xs)]')
function kalman_predict(x, P, H, F, Q, R)
x = F*x
P = F*P*F' + Q # 9% samples here
y = H*x
S = H*P*H' + R
x, P, y, S
end
function kalman_update(x, P, r, S, H, R)
K = P*H'/S
x = x + K*r
I_KH = I - K*H
P = I_KH*P*I_KH' + K*R*K' # 88% samples here
y = H*x
S = H*P*H' + R
x, P, y, S
end
@model function kalman_model(ys, H, F, Q)
obs_dim, N = size(ys)
latent_dim = size(H, 2)
x₀ ~ MvNormal(zeros(latent_dim), I)
ϵ ~ filldist(Gamma(1, 1), latent_dim)
σ ~ filldist(Gamma(1, 1), obs_dim)
P = diagm(ϵ.^2)
R = diagm(σ.^2)
x = x₀
for t in 1:N
x, P, y, S = kalman_predict(x, P, H, F, Q, R) # 9% samples in kalman_predict
if t <= N
r = ys[:,t] - y
x, P, y, S = kalman_update(x, P, r, S, H, R) # 90% samples in kalman_update
Turing.@addlogprob! - 0.5 * sum(logdet(S) + r'*inv(S)*r)
end
end
end
Random.seed!(12345)
model = kalman_model(ys, H, F, Q,)
function benchmark_model(model, n)
for i in 1:n
sample(model, NUTS(), 1)
end
end
@profview benchmark_model(model, 1)
@profview benchmark_model(model, 10)
```