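With 32 threads (each ratio printed below is the median serial time divided by the median threaded time, for n = 10^1 through 10^8):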

```
using Base.Threads, BenchmarkTools, Statistics
"""
    f(x)

Return `sin(x) + cos(x)`; this is the per-element workload summed by the benchmarks.
"""
function f(x)
    s = sin(x)
    c = cos(x)
    return s + c
end
"""
    serial(n)

Single-threaded baseline: accumulate `f(k)` for `k = 1, 2, ..., n` in increasing
order and return the total as a `Float64` (`0.0` when `n < 1`).
"""
function serial(n)
    total = 0.0
    for k in 1:n
        total = total + f(k)
    end
    return total
end
"""
    threads(n)

Multi-threaded counterpart of `serial(n)`: split `1:n` into `nthreads()` contiguous
chunks, sum `f` over each chunk with `local_sum` inside a `@threads` loop, and
return the sum of the per-chunk partial results.
"""
function threads(n)
    nchunks = nthreads()
    # One slot per chunk; every slot is written exactly once before `sum` reads it.
    res_vec = Vector{Float64}(undef, nchunks)
    # Select the chunk with the loop index `i`, not `threadid()`: the id of the
    # thread running an iteration is not guaranteed to equal `i` (non-static
    # scheduling, task migration), so using it could sum some chunks twice and
    # skip others.
    @threads for i in 1:nchunks
        res_vec[i] = local_sum(i, n, nchunks)
    end
    return sum(res_vec)
end
"""
    local_sum(id, n, nthread)

Sum `f(x)` over chunk number `id` (1-based) of the range `1:n` when that range is
divided into `nthread` contiguous chunks.

Chunk `id` covers `1 + div(n * (id - 1), nthread)` through `div(n * id, nthread)`,
so consecutive chunks tile `1:n` without gaps or overlap. An empty chunk yields `0.0`.
"""
function local_sum(id, n, nthread)
    first_x = div(n * (id - 1), nthread) + 1
    last_x = div(n * id, nthread)
    partial = 0.0
    for x in first_x:last_x
        partial = partial + f(x)
    end
    return partial
end
# Benchmark driver: for n = 10^1, ..., 10^N, time `serial(n)` against `threads(n)`
# and print the speed-up ratio (median serial time / median threaded time).
N = 8
# Column 1 holds the serial results, column 2 the threaded results.
# `undef` constructors: the dims-only form `Array{T}(N, 2)` was removed in Julia 1.0.
trial = Matrix{BenchmarkTools.Trial}(undef, N, 2)
times = Matrix{Float64}(undef, N, 2)
ratios = Vector{Float64}(undef, N)
for i = 1:N
    n = 10^i
    # Sanity check: both implementations must agree before they are timed.
    @assert isapprox(serial(n), threads(n))
    # `$n` interpolates the value so the benchmark does not time a global lookup.
    trial[i,1] = @benchmark serial($n)
    trial[i,2] = @benchmark threads($n)
    # `median` comes from the Statistics standard library.
    times[i,1] = median(trial[i,1].times)
    times[i,2] = median(trial[i,2].times)
    # A ratio above 1 means the threaded version was faster.
    ratios[i] = times[i,1] / times[i,2]
end
println(ratios)
```

```
[0.0346708, 0.350999, 2.9688, 12.7841, 16.0687, 17.1916, 16.1807, 18.0311]
```

This is much better than what I got for parallelism:

```
julia> println(ratios)
[0.000225118, 0.00054258, 0.00585587, 0.0602492, 0.639873, 4.08709, 12.308, 16.3199]
```