My code attached is long because I couldn’t figure out which part should be improved. I guess the bottleneck is function rls_multi which implements a recursive OLS. Note that this function was translated from C++ (armadillo) code in R package exuber:

I have read the performance tips thoroughly. But it is still very slow. Running the same task in R takes less than 10 seconds, but running the following Julia code takes 3 minutes. Any suggestion would be greatly appreciated.

```
using LinearAlgebra
using Statistics
using Random
function default_win_size(obs::Int64)
r0 = 0.01 + 1.8 / sqrt(obs)
return floor(Int64, r0 * obs)
end
function get_reg(y::AbstractVector{Float64}, lag::Int64, null_hyp=false) #where {F<:Float64}
T = length(y)
lag_y = @view(y[lag+1:T-1])
dy=diff(y)
dy01 = @view(dy[lag+1:end]) # @view(y[lag+2:end]) - @view(y[lag+1:end-1])
start_col=ifelse(null_hyp, 1, 2)
reg = Matrix{Float64}(undef, size(dy01,1), start_col + lag)
reg[:, start_col] .=1
if null_hyp == false
reg[:,1] =lag_y
end
@inbounds @simd for i in 1:lag
reg[:, i+start_col] = @view(dy[lag+1-i:end-i])
end
return reg, dy01
end
function rls_multi(y::AbstractVector{Float64}, x::Matrix{Float64}, badf::Vector{Float64}, bsadf::Vector{Float64}, min_win::Int64) #where {F<:Float64}
obs = size(y,1)
x_width = size(x, 2) # the number of independent variables
t_x = x'
temp_t=Vector{Float64}(undef, length(bsadf))
local g,b # g is a square matrix of size x_width, b is a column vector of size x_width
@inbounds for r2 in min_win:obs
@inbounds for r1 in r2-min_win+1:-1:1
win_size = r2 - r1 + 1
tsx = @view t_x[:, r1:r2]
sx = tsx'
sy = @view y[r1:r2]
if r1 == r2 - min_win + 1
g = inv(tsx * sx)
b = g * (tsx * sy)
else
t_new_x = @view tsx[:, [1]]
new_x = @view sx[[1], :]
gx = g * t_new_x
the_factor = 1 / (1 + (new_x*gx)[1])
g -= the_factor * (gx * gx')
b -= g * t_new_x * (dot(b, t_new_x) - sy[1])
end
res = sy - sx*b
sqres = (res'res)[1]
vares::Float64 = sqres / (win_size - x_width)
#sb_1::Float64 = sqrt(vares * diag(g)[1])
sb_1::Float64 = sqrt(vares * g[1,1])
temp_t[r1] = (b[1]) / sb_1
if r1==1
badf[r2-min_win+1] = temp_t[r1]
end
end
bsadf[r2-min_win+1] = maximum(@view temp_t[1:r2-min_win+1])#[1]
end
end
function rls_gsadf(y::AbstractVector{Float64}, adflag::Int64, min_win::Int64)
obs_ori = length(y)
obs = obs_ori - 1 - adflag # length(dy01)
max_start::Int64 = obs_ori - adflag - min_win #max_starting
x, dy01=get_reg(y, adflag)
badf=Vector{Float64}(undef, max_start)
bsadf=Vector{Float64}(undef, max_start)
if adflag == 0
rls_zero(dy01, x[:,1], max_start, min_win, obs)
else
rls_multi(dy01, x, badf, bsadf, min_win)
end
return (badf, bsadf)
end
function CV_PSY(T::Int, nboot::Int, min_window::Int, adflag::Int)
# MC simulation to get critical values
min_window=ifelse(min_window==0, default_win_size(T), min_window)
qe = [0.90 0.95 0.99]
dim = T - adflag - min_window
MPSY = Matrix{Float64}(undef, dim, nboot)
tbr = Matrix{Float64}(undef, dim, length(qe))
seed = 123
Random.seed!(seed)
e = randn(T, nboot)
a = T^(-1)
y = cumsum(e .+ a, dims=1)
@inbounds @simd for iter ∈ 1:nboot
if mod(iter, 100) == 0
println(iter)
end
badf, _ =rls_gsadf(@view(y[:,iter]), adflag, min_window)
MPSY[:, iter] = accumulate(max, badf)
end
@inbounds @simd for i in 1:dim
@inbounds for j in eachindex(qe)
tbr[i, j] = quantile(@view(MPSY[i, :]), qe[j])
end
end
return (tbr)
end
start_time = now()
tbr=CV_PSY(300, 1000, 0, 1)
println("Elapsed time: ", now() - start_time)
```