Speeding up maximization

I am trying to speed up this code. The maximization is very time consuming, and perhaps I am also not handling the allocations efficiently. I have a machine with 64 threads, but if I use multithreading I run into a problem: the result changes between runs.

``````
using Random

function myfun()

rng = MersenneTwister(1234)

T1 = 20
T2 = 10
S = 100
N = 300
L = 1000
superset1 = Vector{Array{Float64,2}}(undef,N)
superset2 = Vector{Vector{Int}}(undef,N)
for n = 1:N
superset1[n] = zeros(10,L)
superset2[n] = rand(1:S,10)
end
Tmat = ones(T2,T1)
par = rand(T2)
p = rand(S)
x = zeros(T2,T1,N,L)
v = rand(S,N)
u = rand(T1,N)
shock1 = rand(S,L)
shock2 = rand(N,L)
maxS = Array{Matrix{CartesianIndex{2}},2}(undef,T2,N)
final_mat = zeros(T2,T1,S,N)
final_mat2 = zeros(T2,T1,S,N)

@time for n = 1:N
for t2=1:T2
@inbounds superset1[n] .= @views( par[t2]*v[superset2[n],n] .+ shock1[superset2[n],:] )
@inbounds maxS[t2,n]    = argmax(superset1[n],dims=1)
@inbounds x[t2,:,n,:]  .= @views( u[:,n] .+ superset1[n][maxS[t2,n]] .+ shock2[n,:]'  ) # η*α[n]
end
end
@time for t2=1:T2 for t1=1:T1
indic = Tmat[t2,t1]/L
for l = 1:L
den = sum(exp.(x[t2,t1,:,l]))
for n = 1:N
sInd = superset2[n][maxS[t2,n][l][1]]
@inbounds final_mat[t2,t1,sInd,n]  += exp(x[t2,t1,n,l])/den*indic
@inbounds final_mat2[t2,t1,sInd,n] += exp(x[t2,t1,n,l])/den*indic*p[sInd]
@inbounds final_mat2[t2,t1,5,n]    += exp(x[t2,t1,n,l])/den*indic*(1-p[sInd])
end
end
end end

return sum(final_mat)
end

result = myfun()
``````

Indeed, it allocates a lot:

``````
using Random

rng = MersenneTwister(1234)

T1 = 20
T2 = 10
S = 100
N = 300
L = 1000
superset1 = Vector{Array{Float64,2}}(undef,N)
superset2 = Vector{Vector{Int}}(undef,N)
for n = 1:N
superset1[n] = zeros(10,L)
superset2[n] = rand(1:S,10)
end
Tmat = ones(T2,T1)
par = rand(T2)
p = rand(S)
x = zeros(T2,T1,N,L)
v = rand(S,N)
u = rand(T1,N)
shock1 = rand(S,L)
shock2 = rand(N,L)
maxS = Array{Matrix{CartesianIndex{2}},2}(undef,T2,N)
final_mat = zeros(T2,T1,S,N)
final_mat2 = zeros(T2,T1,S,N)

function myfun!(T1,T2,S,N,L,superset1,superset2,Tmat,par,p,x,v,u,shock1,shock2,maxS,final_mat,final_mat2)
for n = 1:N
for t2=1:T2
@inbounds superset1[n] .= @views( par[t2]*v[superset2[n],n] .+ shock1[superset2[n],:] )
@inbounds maxS[t2,n]    = argmax(superset1[n],dims=1)
@inbounds x[t2,:,n,:]  .= @views( u[:,n] .+ superset1[n][maxS[t2,n]] .+ shock2[n,:]'  ) # η*α[n]
end
end
for t2=1:T2 for t1=1:T1
indic = Tmat[t2,t1]/L
for l = 1:L
den = sum(exp.(x[t2,t1,:,l]))
for n = 1:N
sInd = superset2[n][maxS[t2,n][l][1]]
@inbounds final_mat[t2,t1,sInd,n]  += exp(x[t2,t1,n,l])/den*indic
@inbounds final_mat2[t2,t1,sInd,n] += exp(x[t2,t1,n,l])/den*indic*p[sInd]
@inbounds final_mat2[t2,t1,5,n]    += exp(x[t2,t1,n,l])/den*indic*(1-p[sInd])
end
end
end end
return sum(final_mat)
end

result = myfun!(T1,T2,S,N,L,superset1,superset2,Tmat,par,p,x,v,u,shock1,shock2,maxS,final_mat,final_mat2)

using BenchmarkTools
@btime myfun!(T1,T2,S,N,L,superset1,superset2,Tmat,par,p,x,v,u,shock1,shock2,maxS,final_mat,final_mat2)
#   8.614 s (424001 allocations: 1.02 GiB)
``````

That might be solvable, though.

While the views avoid allocating the slices, I think the result of the operation will still be a new array, here and in the other similar operations.

I suggest applying `@views` to the whole function to simplify these lines (and removing `@inbounds` until the allocations are solved). Then be careful with the broadcasting: prefer the `@.` macro, or expand the operation into explicit loops to see more clearly what is going on.
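For instance, the difference looks like this in a toy example (the names `a`, `b`, and `buf` are just illustrative, not from the code above): the first version materializes a fresh result array on every call even when the inputs are views, while the second writes into a preallocated buffer and allocates nothing.

``````
using BenchmarkTools

a   = rand(10, 1000)
b   = rand(10)
buf = similar(a)

# Allocates a new 10×1000 result on every call, even with @views on the inputs
f_alloc(a, b) = @views a[:, :] .+ b

# Writes into the preallocated buffer: no per-call allocation
function f_inplace!(buf, a, b)
    @. buf = a + b        # equivalent to buf .= a .+ b
    return buf
end

@btime f_alloc($a, $b);           # allocates the result array each call
@btime f_inplace!($buf, $a, $b);  # 0 allocations
``````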

The following code is about 5× faster and allocates roughly half as much as the original. The main issue with the original code is that many of the nd-arrays are accessed in a cache-unfriendly memory order, and at random indices most of the time. We can remedy this with `permutedims`, which stores the array in the memory layout we want without changing its values. We also memoize where possible to avoid recomputing values unnecessarily, and to keep allocations low we always decorate slices with `@views` to avoid copying data. There are minor changes as well: `findmax` instead of `argmax`, since the former returns both values and indices, and `superset1` needn’t be an array of arrays. Enjoy!

``````
function myfun()
Random.seed!(1234)
T1 = 20
T2 = 10
S  = 100
N  = 300
L  = 1000
superset1  = zeros(10, L)             # array of arrays not needed
superset2  = [rand(1:S,10) for n=1:N] # simplify creation
Tmat       = ones(T2,T1)
par        = rand(T2)
p          = rand(S)
x          = zeros(T2,T1,N,L)
v          = rand(S,N)
u          = rand(T1,N)
shock1     = rand(S,L)
shock2     = rand(N,L)
maxS       = Matrix{Matrix{CartesianIndex{2}}}(undef,T2,N)
final_mat  = zeros(T2,T1,S,N)
final_mat2 = zeros(T2,T1,S,N)

@time for n = 1:N
for t2 = 1:T2
superset1       .= @views par[t2] .* v[superset2[n],n] .+ shock1[superset2[n],:]
valS, maxS[t2,n] = findmax(superset1,dims=1)
x[t2,:,n,:]     .= @views u[:,n] .+ valS .+ shock2[n,:]'   # η*α[n]
end
end

@time begin
sInd3 = [superset2[n][maxS[t2,n][l][1]] for n=1:N, t2=1:T2, l=1:L] # memoize
prm_x = permutedims(x, (3,1,2,4)) # adjust memory layout for faster access
map!(exp, prm_x, prm_x)           # inplace calculation of exp
for l = 1:L
for t1 = 1:T1
for t2 = 1:T2
indic = Tmat[t2,t1]/L
den = sum(view(prm_x,:,t2,t1,l))
for n = 1:N
sInd = sInd3[n,t2,l]  # superset2[n][maxS[t2,n][l][1]]
final_mat[t2,t1,sInd,n]  += prm_x[n,t2,t1,l]/den*indic
final_mat2[t2,t1,sInd,n] += prm_x[n,t2,t1,l]/den*indic*p[sInd]
final_mat2[t2,t1,5,n]    += prm_x[n,t2,t1,l]/den*indic*(1-p[sInd])
end
end
end
end
end
return sum(final_mat)
end

result = myfun()
#   0.587365 seconds (449.38 k allocations: 90.829 MiB, 30.63% compilation time)
#   1.656068 seconds (5 allocations: 480.652 MiB, 0.49% gc time)
# result = 200.0

# Before:
# 0.668699 seconds (474.13 k allocations: 91.983 MiB, 3.50% gc time, 34.53% compilation time)
# 8.631730 seconds (400.00 k allocations: 976.562 MiB, 2.30% gc time)
``````
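As a footnote to the memory-layout point: Julia arrays are column-major, so loops are fastest when the innermost loop runs over the first index. A tiny, self-contained illustration (the sizes are arbitrary, not the thread’s data):

``````
A = rand(300, 10_000)

# Inner loop over the first index: contiguous, cache-friendly access
function sum_first_fastest(A)
    s = 0.0
    for j in axes(A, 2), i in axes(A, 1)   # i (rows) varies fastest
        s += A[i, j]
    end
    return s
end

# Inner loop over the second index: strided access, much slower for large A
function sum_last_fastest(A)
    s = 0.0
    for i in axes(A, 1), j in axes(A, 2)   # j (columns) varies fastest
        s += A[i, j]
    end
    return s
end
``````

This is the same reason `permutedims(x, (3,1,2,4))` pays off above: after the permutation, the `n` index that the innermost loop runs over is the first, contiguous dimension.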

That was very helpful. It still feels strange to me that computing sInd3 entirely outside the loop is more efficient than computing each entry when it is needed. I modified the example (to make it the actual one) and now the allocations jumped to 20M again. The only things I changed are another `argmax` inside the second big for loop, and the definition of `prob` outside it. Any idea where the problem is here? I also wonder whether anything can be parallelized, especially if L = 100,000.

``````
using Random
using LinearAlgebra

###

function myfun()
Random.seed!(1234)
T1 = 20
T2 = 10
S  = 100
N  = 300
L  = 10000
superset1  = zeros(10, L)             # array of arrays not needed
superset2  = [rand(1:S,10) for n=1:N] # simplify creation
Tmat       = ones(T2,T1)
par        = rand(T2)
p          = rand(S)
x          = zeros(T2,T1,N,L)
v          = rand(S,N)
u          = rand(T1,N)
shock1     = rand(S,L)
shock2     = rand(N,L)
maxS       = Matrix{Matrix{CartesianIndex{2}}}(undef,T2,N)
final_mat  = zeros(T2,T1,S,N)
final_mat2 = zeros(T2,T1,S,N)

@time for n = 1:N
for t2 = 1:T2
superset1       .= @views par[t2] .* v[superset2[n],n] .+ shock1[superset2[n],:]
valS, maxS[t2,n] = findmax(superset1,dims=1)
x[t2,:,n,:]     .= @views u[:,n] .+ valS .+ shock2[n,:]'   # η*α[n]
end
end

@time begin
sInd3 = [superset2[n][maxS[t2,n][l][1]] for n=1:N, t2=1:T2, l=1:L] # memoize
prob = [ vec( sum( superset2[n].==sInd3[n,t2,:]',dims=2 )/L ) for n=1:N, t2=1:T2 ];
prm_x = permutedims(x, (3,1,2,4)) # adjust memory layout for faster access
#map!(exp, prm_x, prm_x)           # inplace calculation of exp
for l in 1:L
for t1 = 1:T1
for t2 = 1:T2
indic = Tmat[t2,t1]/L
nInd = argmax(view(prm_x,:,t2,t1,l))
final_mat[t2,t1,superset2[nInd],nInd]  += prob[nInd,t2]*indic
final_mat2[t2,t1,superset2[nInd],nInd] += prob[nInd,t2].*p[superset2[nInd]]*indic
final_mat2[t2,t1,5,nInd]               += indic*dot( (1 .-p[superset2[nInd]]) , prob[nInd,t2] )
end
end
end
end
return sum(final_mat)
end

result = myfun()
``````
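Regarding parallelization (and the earlier issue of the result changing under multithreading): the writes into final_mat and final_mat2 race if several threads update them at once. One way around that, sketched below under the assumption that the per-l contributions are independent, is to give each chunk of l values its own accumulator and sum the accumulators afterwards. The name `accumulate_threaded!` and the chunking scheme are illustrative, not from the thread.

``````
using Base.Threads, LinearAlgebra

# Illustrative sketch: thread the l loop with one accumulator per chunk so that
# no two tasks ever write to the same array (concurrent writes to a shared
# final_mat are what make the result change from run to run).
function accumulate_threaded!(final_mat, final_mat2, prm_x, prob, superset2, p, Tmat)
    T2, T1, _, _ = size(final_mat)
    L  = size(prm_x, 4)
    nc = nthreads()
    bufs1 = [zero(final_mat)  for _ in 1:nc]      # per-chunk accumulators
    bufs2 = [zero(final_mat2) for _ in 1:nc]
    @threads for c in 1:nc
        b1, b2 = bufs1[c], bufs2[c]
        for l in c:nc:L                           # chunk c handles every nc-th l
            for t1 in 1:T1, t2 in 1:T2
                indic = Tmat[t2, t1] / L
                nInd  = argmax(view(prm_x, :, t2, t1, l))
                sIdx  = superset2[nInd]
                b1[t2, t1, sIdx, nInd] .+= prob[nInd, t2] .* indic
                b2[t2, t1, sIdx, nInd] .+= prob[nInd, t2] .* p[sIdx] .* indic
                b2[t2, t1, 5, nInd]     += indic * dot(1 .- p[sIdx], prob[nInd, t2])
            end
        end
    end
    for c in 1:nc                                 # single-threaded, deterministic reduction
        final_mat  .+= bufs1[c]
        final_mat2 .+= bufs2[c]
    end
    return final_mat, final_mat2
end
``````

Chunking by `l in c:nc:L` instead of indexing buffers with `threadid()` keeps each buffer owned by exactly one task regardless of how the scheduler assigns tasks; up to floating-point rounding from the changed summation order, the result should match the serial loop.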