This is the fastest I can come up with on the CPU (before I go to sleep). You might be able to squeeze more out of it on a GPU.

On my computer, my version is **4x faster** than the fastest version in your blog yet.

But the idea is that you can do multi-threading. And you avoid memory allocating in `gen_data`

by preallocating matrices and just re-using them.

But for multi-threading, you are better off creating multiple copies of the data so your threads can work concurrently and don’t have to share data (leading to false-sharing).

Also, you can save time by not going through with `DataFrames`

and just create the matrix directly.

Also, if you use GPU you might want to turn off multi-threading.

```
function gen_data2!(X, y, x1_dmean, x2_dmean)
## total time periods in the the panel = 500
tt = 500
# x1 and x2 covariates
x1_A = 1 .+ rand(std_normal, tt)
x1_B = 1/4 .+ rand(std_normal, tt)
x2_A = 1 .+ x1_A .+ rand(std_normal, tt)
x2_B = 1 .+ x1_B .+ rand(std_normal, tt)
# outcomes (notice different slope coefs for x2_A and x2_B)
y_A = x1_A .+ 1*x2_A + rand(std_normal, tt)
y_B = x1_B .+ 2*x2_B + rand(std_normal, tt)
x1 = vcat(x1_A, x1_B)
x2 = vcat(x2_A, x2_B)
X[:, 2] .= vcat(fill(0, length(x1_A)), fill(1, length(x1_B)))
X[:, 3] .= x1
X[:, 4] .= x2
X[:, 5] .= (x1 .* x2)
y .= vcat(y_A, y_B)
x1_dmean .= vcat(x1_A .- mean(x1_A), x1_B .- mean(x1_B))
x2_dmean .= vcat(x2_A .- mean(x2_A), x2_B .- mean(x2_B))
end
function coefs_lm_formula4(X, y, x1_dmean, x2_dmean)
mod_level = fastfit(X, y)
@inbounds X[:, 5] .= x1_dmean .* x2_dmean
mod_dmean = fastfit(X, y)
(mod_level[end], mod_dmean[end])
end
function run_simulations5(nsim)
sims = zeros(nsim, 2);
X = [zeros(Float64, 1000, 5) for _ in 1:nthreads()]
y = [zeros(Float64, 1000) for _ in 1:nthreads()]
x1_dmean = [zeros(Float64, 1000) for _ in 1:nthreads()]
x2_dmean = [zeros(Float64, 1000) for _ in 1:nthreads()]
# set constant
for i in 1:nthreads()
X[i][:, 1] .= 1
end
@threads for i in 1:nsim
gen_data2!(X[threadid()], y[threadid()], x1_dmean[threadid()], x2_dmean[threadid()])
@inbounds sims[i, 1], sims[i, 2] = coefs_lm_formula4(X[threadid()], y[threadid()], x1_dmean[threadid()], x2_dmean[threadid()])
end
sims
end
```