Speeding up maximization

I am trying to speed up this code. The maximization is very time consuming. Perhaps I am also not doing the allocations efficiently. I have a core with 64 threads, but if I use multithread I incur in an error (the result changes)


using Random

function myfun()

    rng = MersenneTwister(1234)

    T1 = 20
    T2 = 10
    S = 100
    N = 300
    L = 1000
    superset1 = Vector{Array{Float64,2}}(undef,N)
    superset2 = Vector{Vector{Int}}(undef,N)
    for n = 1:N
        superset1[n] = zeros(10,L)
        superset2[n] = rand(1:S,10)
    end
    Tmat = ones(T2,T1)
    par = rand(T2)
    p = rand(S)
    x = zeros(T2,T1,N,L)
    v = rand(S,N)
    u = rand(T1,N)
    shock1 = rand(S,L)
    shock2 = rand(N,L)
    maxS = Array{Matrix{CartesianIndex{2}},2}(undef,T2,N)
    final_mat = zeros(T2,T1,S,N)
    final_mat2 = zeros(T2,T1,S,N)

    @time for n = 1:N 
        for t2=1:T2
            @inbounds superset1[n] .= @views( par[t2]*v[superset2[n],n] .+ shock1[superset2[n],:] )
            @inbounds maxS[t2,n]    = argmax(superset1[n],dims=1)
            @inbounds x[t2,:,n,:]  .= @views( u[:,n] .+ superset1[n][maxS[t2,n]] .+ shock2[n,:]'  ) # η*α[n]
        end 
    end
    @time for t2=1:T2 for t1=1:T1
            indic = Tmat[t2,t1]/L
            for l in 1:L #Threads.@threads 
                den = sum(exp.(x[t2,t1,:,l]))
                for n = 1:N
                    sInd = superset2[n][maxS[t2,n][l][1]]
                    @inbounds final_mat[t2,t1,sInd,n]  += exp(x[t2,t1,n,l])/den*indic
                    @inbounds final_mat2[t2,t1,sInd,n] += exp(x[t2,t1,n,l])/den*indic*p[sInd]
                    @inbounds final_mat2[t2,t1,5,n]    += exp(x[t2,t1,n,l])/den*indic*(1-p[sInd])
                end
            end
    end end

    return sum(final_mat)
end

result = myfun()

Indeed, it allocates a lot :


using Random

rng = MersenneTwister(1234)

T1 = 20
T2 = 10
S = 100
N = 300
L = 1000
superset1 = Vector{Array{Float64,2}}(undef,N)
superset2 = Vector{Vector{Int}}(undef,N)
for n = 1:N
    superset1[n] = zeros(10,L)
    superset2[n] = rand(1:S,10)
end
Tmat = ones(T2,T1)
par = rand(T2)
p = rand(S)
x = zeros(T2,T1,N,L)
v = rand(S,N)
u = rand(T1,N)
shock1 = rand(S,L)
shock2 = rand(N,L)
maxS = Array{Matrix{CartesianIndex{2}},2}(undef,T2,N)
final_mat = zeros(T2,T1,S,N)
final_mat2 = zeros(T2,T1,S,N)

function myfun!(T1,T2,S,N,L,superset1,superset2,Tmat,par,p,x,v,u,shock1,shock2,maxS,final_mat,final_mat2)
    for n = 1:N 
        for t2=1:T2
            @inbounds superset1[n] .= @views( par[t2]*v[superset2[n],n] .+ shock1[superset2[n],:] )
            @inbounds maxS[t2,n]    = argmax(superset1[n],dims=1)
            @inbounds x[t2,:,n,:]  .= @views( u[:,n] .+ superset1[n][maxS[t2,n]] .+ shock2[n,:]'  ) # η*α[n]
        end 
    end
    for t2=1:T2 for t1=1:T1
            indic = Tmat[t2,t1]/L
            for l in 1:L #Threads.@threads 
                den = sum(exp.(x[t2,t1,:,l]))
                for n = 1:N
                    sInd = superset2[n][maxS[t2,n][l][1]]
                    @inbounds final_mat[t2,t1,sInd,n]  += exp(x[t2,t1,n,l])/den*indic
                    @inbounds final_mat2[t2,t1,sInd,n] += exp(x[t2,t1,n,l])/den*indic*p[sInd]
                    @inbounds final_mat2[t2,t1,5,n]    += exp(x[t2,t1,n,l])/den*indic*(1-p[sInd])
                end
            end
    end end
    return sum(final_mat)
end

result = myfun!(T1,T2,S,N,L,superset1,superset2,Tmat,par,p,x,v,u,shock1,shock2,maxS,final_mat,final_mat2)

using BenchmarkTools
@btime myfun!(T1,T2,S,N,L,superset1,superset2,Tmat,par,p,x,v,u,shock1,shock2,maxS,final_mat,final_mat2)
#   8.614 s (424001 allocations: 1.02 GiB)

might be solvable though.

While the views will avoid allocations on the slices, I think the result of the operation will be a new array, here and in the other similar operations.

I suggest using @views in the whole function to simplify these lines (remove inbounds until these allocations are not solved). Then, be careful with the broadcasting, use preferentially the @. macro, or expand the operation into loops to see more explicitly what is going on.

The following code is 5X faster with as low as 50% allocations as the original code. The main issue with the original code is that many nd-arrays are accessed in an incorrect memory order and at random indices most of the time. We can remedy this using permutedims to get the array stored in the memory layout we want without changing its values. Also, we use memoization where possible to avoid recalculation of values unnecessarily. And to keep allocations low, always decorate slices with @views to avoid data copying. Minor changes are also made like using findmax instead of argmax, as the former computes both values and indices and superset1 needn’t be an array of arrays. Enjoy!

function myfun()
    Random.seed!(1234)
    T1 = 20
    T2 = 10
    S  = 100
    N  = 300
    L  = 1000
    superset1  = zeros(10, L)             # array of arrays not needed
    superset2  = [rand(1:S,10) for n=1:N] # simplify creation
    Tmat       = ones(T2,T1)
    par        = rand(T2)
    p          = rand(S)
    x          = zeros(T2,T1,N,L)
    v          = rand(S,N)
    u          = rand(T1,N)
    shock1     = rand(S,L)
    shock2     = rand(N,L)
    maxS       = Matrix{Matrix{CartesianIndex{2}}}(undef,T2,N)
    final_mat  = zeros(T2,T1,S,N)
    final_mat2 = zeros(T2,T1,S,N)

    @time for n = 1:N 
        for t2 = 1:T2
            superset1       .= @views par[t2] .* v[superset2[n],n] .+ shock1[superset2[n],:] 
            valS, maxS[t2,n] = findmax(superset1,dims=1)
            x[t2,:,n,:]     .= @views u[:,n] .+ valS .+ shock2[n,:]'   # η*α[n]
        end
    end

    @time begin 
    sInd3 = [superset2[n][maxS[t2,n][l][1]] for n=1:N, t2=1:T2, l=1:L] # memoize
    prm_x = permutedims(x, (3,1,2,4)) # adjust memory layout for faster access
    map!(exp, prm_x, prm_x)           # inplace calculation of exp
    for l in 1:L # Threads.@threads
        for t1 = 1:T1
            for t2 = 1:T2
                indic = Tmat[t2,t1]/L
                den = sum(view(prm_x,:,t2,t1,l))
                for n = 1:N
                    sInd = sInd3[n,t2,l]  # superset2[n][maxS[t2,n][l][1]]
                    final_mat[t2,t1,sInd,n]  += prm_x[n,t2,t1,l]/den*indic  
                    final_mat2[t2,t1,sInd,n] += prm_x[n,t2,t1,l]/den*indic*p[sInd]
                    final_mat2[t2,t1,5,n]    += prm_x[n,t2,t1,l]/den*indic*(1-p[sInd])
                end
            end
        end
    end
    end
    return sum(final_mat)
end

result = myfun()
#   0.587365 seconds (449.38 k allocations: 90.829 MiB, 30.63% compilation time)
#   1.656068 seconds (5 allocations: 480.652 MiB, 0.49% gc time)
# result = 200.0

# Before:
# 0.668699 seconds (474.13 k allocations: 91.983 MiB, 3.50% gc time, 34.53% compilation time)
# 8.631730 seconds (400.00 k allocations: 976.562 MiB, 2.30% gc time)

That was very helpful. I guess it feels strange that sInd3 done all outside the loop is more efficient than just calling it when needed. I modified the example - to make it the actual one - and now the allocations jumped to 20M again. The only thing I changed is having another max inside the second big for loop (and the definition of “prob”) outside. Any idea where the problem is here? I wonder whether there is anything that can be parallelized, especially if L = 100,000.

using Random
using LinearAlgebra

###

function myfun()
    Random.seed!(1234)
    T1 = 20
    T2 = 10
    S  = 100
    N  = 300
    L  = 10000
    superset1  = zeros(10, L)             # array of arrays not needed
    superset2  = [rand(1:S,10) for n=1:N] # simplify creation
    Tmat       = ones(T2,T1)
    par        = rand(T2)
    p          = rand(S)
    x          = zeros(T2,T1,N,L)
    v          = rand(S,N)
    u          = rand(T1,N)
    shock1     = rand(S,L)
    shock2     = rand(N,L)
    maxS       = Matrix{Matrix{CartesianIndex{2}}}(undef,T2,N)
    final_mat  = zeros(T2,T1,S,N)
    final_mat2 = zeros(T2,T1,S,N)

    @time for n = 1:N 
        for t2 = 1:T2
            superset1       .= @views par[t2] .* v[superset2[n],n] .+ shock1[superset2[n],:] 
            valS, maxS[t2,n] = findmax(superset1,dims=1)
            x[t2,:,n,:]     .= @views u[:,n] .+ valS .+ shock2[n,:]'   # η*α[n]
        end
    end

    @time begin 
    sInd3 = [superset2[n][maxS[t2,n][l][1]] for n=1:N, t2=1:T2, l=1:L] # memoize
    prob = [ vec( sum( superset2[n].==sInd3[n,t2,:]',dims=2 )/L ) for n=1:N, t2=1:T2 ];
    prm_x = permutedims(x, (3,1,2,4)) # adjust memory layout for faster access
    #map!(exp, prm_x, prm_x)           # inplace calculation of exp
    for l in 1:L 
        for t1 = 1:T1
            for t2 = 1:T2
                indic = Tmat[t2,t1]/L
                nInd = argmax(view(prm_x,:,t2,t1,l))
                final_mat[t2,t1,superset2[nInd],nInd]  += prob[nInd,t2]*indic  
                final_mat2[t2,t1,superset2[nInd],nInd] += prob[nInd,t2].*p[superset2[nInd]]*indic
                final_mat2[t2,t1,5,nInd]               += indic*dot( (1 .-p[superset2[nInd]]) , prob[nInd,t2] )
            end
        end
    end
    end
    return sum(final_mat)
end

result = myfun()