Functions that produce the same output but use different multithreading settings have very different performance. Why does this happen? Are there any rules to conform to when using multithreading?

Below are code snippets for the data generation process and two methods of calculating the loglikelihood function. My question focuses on the latter two, which use different multithreading settings and show very different performance.

When I run method 1, I get very high CPU usage (80%+) but worse performance. When I run method 2, I get lower CPU usage (40%) and better performance. Is CPU usage not proportional to performance when multithreading?

# Data Generation Process

```
using Random, Distributions
using LogExpFunctions, FastGaussQuadrature
# NOTE(review): `2023 - 10 - 8` is integer arithmetic (= 2005), not a date literal.
# Random.seed! returns the (now seeded) global RNG, so `seed` aliases that RNG
# and is passed explicitly to rand/randn below for reproducibility.
const seed = Random.seed!(2023 - 10 - 8)
const LT = 20  # number of time periods (horizon)
# Per-period stopping probability, rising with t; used in simulateY as the
# chance of exogenous termination at period t.
pifun(t) = logistic(t - LT / 2)
t = 1:LT
pit = pifun.(t)
pit[end] = 1  # force termination in the final period
Inds = 5000   # number of simulated individuals
mR = 3.0      # log-mean of the individual r draws
sigmaR = 1.0  # log-sd of the individual r draws
mC = -1.0     # log-mean of the individual c draws
sigmaC = 1.0  # log-sd of the individual c draws
y1 = randn(seed, Inds)
y2 = randn(seed, Inds)
r = exp.(mR .+ sigmaR .* y1)  # lognormal draws: exp(mR + sigmaR * N(0,1))
c = exp.(mC .+ sigmaC .* y2)  # lognormal draws: exp(mC + sigmaC * N(0,1))
"""
    calcEMAX(ri, ci, pit = pit)

Backward recursion for the expected continuation values over the horizon
implied by `pit`. `EV[end]` stays zero; earlier periods satisfy
`EV[t] = log1pexp(pit[t+1] * ri - ci + (1 - pit[t+1]) * EV[t+1])`.
Returns a `Vector{T}` of length `length(pit)`.
"""
function calcEMAX(ri::T, ci::T, pit = pit) where {T <: Real}
    # Was hard-wired to the global `LT`; deriving the horizon from `pit`
    # keeps the function self-contained and correct for any horizon.
    # (Identical behavior here, since `length(pit) == LT`.)
    EV = zeros(T, length(pit))
    for t in (length(pit) - 1):-1:1
        EV[t] = log1pexp(pit[t + 1] * ri - ci + (1 - pit[t + 1]) * EV[t + 1])
    end
    return EV
end
"""
    simulateY(r = r, c = c, pit = pit)

Simulate one 0/1 choice sequence per individual. Each period either terminates
exogenously with probability `pit[t]` (sequence ends, nothing recorded for that
period), records a 1 and stops, or records a 0 and continues. Individuals whose
sequence is empty are dropped. RNG call order is unchanged, so the same `seed`
reproduces the same data.
"""
function simulateY(r = r, c = c, pit = pit)
    Y = Vector{Vector{Int64}}()
    for i in 1:Inds
        t = 1
        yi = Int64[]
        EMAX = calcEMAX(r[i], c[i])
        while t <= LT
            if pit[t] > rand(seed)  # exogenous termination
                break
            else
                # probability of recording a 1, given continuation value EMAX[t]
                p1prob = 1 / (1 + exp(pit[t] * r[i] - c[i] + (1 - pit[t]) * EMAX[t]))
                if p1prob > rand(seed)
                    push!(yi, 1)
                    break
                else
                    push!(yi, 0)
                end
            end
            t = t + 1
        end
        # `yi != []` allocated a fresh empty array every iteration just to
        # compare against; `isempty` is the idiom and allocation-free.
        if !isempty(yi)
            push!(Y, yi)
        end
    end
    return Y
end
Y = simulateY(r, c)  # draw the observed choice histories once, under the fixed seed
```

# Calculate the likelihood: Method 1

```
"""
    indll(mR, sigmaR, mC, sigmaC, yi, pit = pit, nodes = nodes, weights = weights)

Individual loglikelihood contribution, integrating over the random effects with
a tensor-product Gauss-Hermite rule (`nodes`, `weights`), via `logsumexp`.

PERFORMANCE NOTE(review): this version spawns `length(nodes)^2` (here 100)
tasks per call, and `nll` calls it once per individual — hundreds of thousands
of tiny tasks in total. Each task does microseconds of work, so scheduling
overhead dominates: CPU usage is high but most of it is task management, not
useful computation. Parallelize the outer (per-individual) loop instead, as
Method 2 does.
"""
function indll(mR::T, sigmaR::T, mC::T, sigmaC::T, yi, pit = pit, nodes = nodes, weights = weights) where {T <: Real}
    nq = length(nodes)
    indll = Vector{T}(undef, nq * nq)
    s2 = sqrt(2)  # hoisted: was recomputed twice per (k1, k2) pair
    @sync for k1 in eachindex(nodes)
        for k2 in eachindex(nodes)
            Threads.@spawn begin
                r = exp(mR + s2 * sigmaR * nodes[k1])
                c = exp(mC + s2 * sigmaC * nodes[k2])
                EMAX = calcEMAX(r, c)
                indllnode = zero(T)
                for t in eachindex(yi)
                    u0 = pit[t] * r - c + (1 - pit[t]) * EMAX[t]
                    if yi[t] == 0
                        indllnode += u0 - log1pexp(u0)
                    else
                        indllnode += -log1pexp(u0)
                    end
                end
                # BUG FIX: was `(k1 - 1) * 10 + k2`. The hard-coded 10 silently
                # corrupts the result whenever gausshermite is called with an
                # order other than 10; use the actual rule size.
                indll[(k1 - 1) * nq + k2] = indllnode + log(weights[k1] * weights[k2] / pi)
            end
        end
    end
    return logsumexp(indll)
end
"""
    nll(mR, sigmaR, mC, sigmaC, Y = Y, pit = pit, nodes = nodes, weights = weights)

Negative total loglikelihood: the (negated) sum of `indll` over every
individual history in `Y`. Runs the per-individual loop sequentially;
any parallelism happens inside `indll`.
"""
function nll(mR::T, sigmaR::T, mC::T, sigmaC::T, Y = Y, pit = pit, nodes = nodes, weights = weights) where {T <: Real}
    total = zero(T)
    for yi in Y
        total += indll(mR, sigmaR, mC, sigmaC, yi)
    end
    return -total
end
```

Performance

```
nodes, weights = gausshermite(10)  # 10-point Gauss-Hermite quadrature rule
using BenchmarkTools
@btime nll(zeros(4)...)  # evaluate at mR = sigmaR = mC = sigmaC = 0.0
# 804.050 ms (4977257 allocations: 476.75 MiB)
# 36692.967318
```

# Calculate the likelihood: Method 2

```
"""
    indll(mR, sigmaR, mC, sigmaC, yi, pit = pit, nodes = nodes, weights = weights)

Individual loglikelihood contribution, integrating over the random effects with
a tensor-product Gauss-Hermite rule (`nodes`, `weights`), via `logsumexp`.
Purely sequential; parallelism belongs in the caller's per-individual loop.
"""
function indll(mR::T, sigmaR::T, mC::T, sigmaC::T, yi, pit = pit, nodes = nodes, weights = weights) where {T <: Real}
    nq = length(nodes)
    # Preallocate the nq^2 result slots up front; the original grew the
    # vector with push!, reallocating repeatedly in a hot function.
    indll = Vector{T}(undef, nq * nq)
    s2 = sqrt(2)  # hoisted: was recomputed twice per (k1, k2) pair
    idx = 0
    for k1 in eachindex(nodes), k2 in eachindex(nodes)
        r = exp(mR + s2 * sigmaR * nodes[k1])
        c = exp(mC + s2 * sigmaC * nodes[k2])
        EMAX = calcEMAX(r, c)
        indllnode = zero(T)
        for t in eachindex(yi)
            u0 = pit[t] * r - c + (1 - pit[t]) * EMAX[t]
            if yi[t] == 0
                indllnode += u0 - log1pexp(u0)
            else
                indllnode += -log1pexp(u0)
            end
        end
        idx += 1
        indll[idx] = indllnode + log(weights[k1] * weights[k2] / pi)
    end
    return logsumexp(indll)
end
"""
    nll(mR, sigmaR, mC, sigmaC, Y = Y, pit = pit, nodes = nodes, weights = weights)

Negative total loglikelihood, parallelized over individuals — coarse-grained
parallelism (one sizeable chunk of work per task) is why this method beats
Method 1 despite lower CPU usage.
"""
function nll(mR::T, sigmaR::T, mC::T, sigmaC::T, Y = Y, pit = pit, nodes = nodes, weights = weights) where {T <: Real}
    # BUG FIX: the original accumulated into `ll[Threads.threadid()]`. With the
    # default dynamic scheduler a task may migrate between threads mid-iteration,
    # so the read-modify-write on `ll[threadid()]` can race and drop updates.
    # Writing each iteration's result to its own slot needs no synchronization.
    partial = Vector{T}(undef, length(Y))
    Threads.@threads for i in eachindex(Y)
        partial[i] = indll(mR, sigmaR, mC, sigmaC, Y[i])
    end
    return -sum(partial)
end
```

Performance

```
@btime nll(zeros(4)...)  # same evaluation point as Method 1; results match
# 42.054 ms (1595245 allocations: 132.62 MiB)
# 36692.967318
```