I have the following two functions for linear regression.

```
function ordinary_least_squares_regression(x,y;lambda=0,weights=0)
if weights == 0
X = x' * x + lambda*I
return X\(x'y)
else
w = spdiagm(weights)
return ((x'*w*x + lambda*I))\(x'*w*y)
end
end
```

```
function ordinary_least_squares_regression!(x,y,transpose_buffer,vector_buffer,matrix_buffer;lambda=0,weights=0)
if typeof(weights) != Int
for i in 1:size(x)[2]
for j in 1:size(x)[1]
#Creates x' w matrix
transpose_buffer[i,j] = x[j,i] * weights[j]
end
end
else
#Stores x'
transpose!(transpose_buffer,x)
end
#Calculates x'y
mul!(vector_buffer,transpose_buffer,y)
#Calculates x'x
mul!(matrix_buffer,transpose_buffer,x)
if lambda != 0
for i in 1:size(matrix_buffer)[1]
#x'x + lambda I
matrix_buffer[i,i] += lambda
end
end
return matrix_buffer\vector_buffer
end
```

If I benchmark the two using the following buffers

```
x = randn(500000,8)
y = randn(500000)
mb = zeros(8,8)
vb = zeros(8)
tb = zeros(8,500000)
b = zeros(8)
w = collect(1:500000)
b1 = ordinary_least_squares_regression(x,y,weights=w)
b2 = ordinary_least_squares_regression!(x,y,tb,vb,mb,weights=w)
@btime for i in 1:20
b1 = ordinary_least_squares_regression($x,$y,weights=$w)
end
@btime for i in 1:20
b2 = ordinary_least_squares_regression!($x,$y,$tb,$vb,$mb,weights=$w)
end
```

i.e., weighted linear regression, I get ` 887.315 ms (660 allocations: 1.42 GiB) 559.459 ms (60 allocations: 16.25 KiB)`

; in-place linear regression is slightly faster. However, if you to the calculations with weights = 0 (i.e. no weights), then the result is ` 101.898 ms (120 allocations: 41.25 KiB) 405.156 ms (60 allocations: 16.25 KiB)`

.

Question: why on earth is my in-place linear regression solver so much slower? Using a Cholesky decomposition doesn’t reduce allocations or memory usage, and the memory allocations are unavoidable as I’m solving a matrix equation. The two results agree to `1e-19`

with or without weights, so I’m not doing anything wrong algorithmically.

(I have implemented other linear regression algorithms, such as coordinate descent and gradient descent, so for now I am solely interested in the relative slowness of this in-place implementation of the direct matrix solver)