I am trying to speed up this code. The maximization is very time consuming. Perhaps I am also not doing the allocations efficiently. I have a core with 64 threads, but if I use multithread I incur in an error (the result changes)

```
using Random
function myfun()
rng = MersenneTwister(1234)
T1 = 20
T2 = 10
S = 100
N = 300
L = 1000
superset1 = Vector{Array{Float64,2}}(undef,N)
superset2 = Vector{Vector{Int}}(undef,N)
for n = 1:N
superset1[n] = zeros(10,L)
superset2[n] = rand(1:S,10)
end
Tmat = ones(T2,T1)
par = rand(T2)
p = rand(S)
x = zeros(T2,T1,N,L)
v = rand(S,N)
u = rand(T1,N)
shock1 = rand(S,L)
shock2 = rand(N,L)
maxS = Array{Matrix{CartesianIndex{2}},2}(undef,T2,N)
final_mat = zeros(T2,T1,S,N)
final_mat2 = zeros(T2,T1,S,N)
@time for n = 1:N
for t2=1:T2
@inbounds superset1[n] .= @views( par[t2]*v[superset2[n],n] .+ shock1[superset2[n],:] )
@inbounds maxS[t2,n] = argmax(superset1[n],dims=1)
@inbounds x[t2,:,n,:] .= @views( u[:,n] .+ superset1[n][maxS[t2,n]] .+ shock2[n,:]' ) # η*α[n]
end
end
@time for t2=1:T2 for t1=1:T1
indic = Tmat[t2,t1]/L
for l in 1:L #Threads.@threads
den = sum(exp.(x[t2,t1,:,l]))
for n = 1:N
sInd = superset2[n][maxS[t2,n][l][1]]
@inbounds final_mat[t2,t1,sInd,n] += exp(x[t2,t1,n,l])/den*indic
@inbounds final_mat2[t2,t1,sInd,n] += exp(x[t2,t1,n,l])/den*indic*p[sInd]
@inbounds final_mat2[t2,t1,5,n] += exp(x[t2,t1,n,l])/den*indic*(1-p[sInd])
end
end
end end
return sum(final_mat)
end
result = myfun()
```