I’m trying to take use of mulithreading to parallelize an internal loop, but while it works (correct answer and I can see CPU being used), there is a serious performance hit:

```
function update(x, Δt)
return x + 0.5 * Δt * x
end
function compute_nothreaded(x0_vals, Δt, nΔt)
x_vals = copy(x0_vals);
nx = length(x0_vals);
for n in 1:nΔt
@. x_vals = update(x_vals, Δt);
end
return x_vals
end
function compute_threaded(x0_vals, Δt, nΔt)
x_vals = copy(x0_vals);
nx = length(x0_vals);
for n in 1:nΔt
Threads.@threads for j in 1:nx
x_vals[j] = update(x_vals[j], Δt)
end
end
return x_vals
end
Random.seed!(100);
x0 = randn(10);
x₀ = [1.0];
Δt = 0.5;
nΔt = 10^2;
f1(x) = sin(x)
f2(x) = x^2
# compute_threaded(x0, Δt, nΔt, (f1,))
@btime compute_threaded($x0, $Δt, $nΔt)
@btime compute_threaded($x0, $Δt, $nΔt)
@btime compute_threaded($x0, $Δt, $nΔt)
4.299 ms (4230 allocations: 319.81 KiB)
4.178 ms (4224 allocations: 319.62 KiB)
4.309 ms (4227 allocations: 319.72 KiB)
@btime compute_nothreaded($x0, $Δt, $nΔt)
@btime compute_nothreaded($x0, $Δt, $nΔt)
@btime compute_nothreaded($x0, $Δt, $nΔt)
928.654 ns (1 allocation: 160 bytes)
882.420 ns (1 allocation: 160 bytes)
931.420 ns (1 allocation: 160 bytes)
```

This is the proptotype for a problem with a much more expensive `update`

function.