 # Need help with performance in a very large loop

Hello, I hope you guys are safe and healthy in these unprecedented times. I am trying to improve the performance of a loop, in terms of both time and memory usage, that attempts to find a maximum among a number of alternatives that grows exponentially with parameter K. I cannot parallelize this because it is part of an outer loop that is already being parallelized.

I would be super happy if I could push this code to perform with a value of K around 14 or 15 within a reasonable time. I guess it sounds heroic but I am sure that my code is very very far away from being efficient. Any help will be very much appreciated.

``````
using LinearAlgebra
using BitOperations, Profile

J = 1000
T = 200
R=125*rand(J,T)
Π = 100*rand(J,T)
D = rand(,J,1)
S = 20*rand(J,T)
τ = 199
K = 8
Ind = vec(sort(rand((1:J),K,1),dims=1))
p = 0.8
SeqShocks = (rand(J,T).>p).+1
Jᵢ = Ind[1:K-1]
Dᵢ = zeros(J)

matchrow(a,B) = findfirst(i->all(j->a[j] == B[i,j],1:size(B,2)),1:size(B,1))

@views function pshockprob(sequences,prob)
(KK,K)=size(sequences)
proba = zeros(Float32,KK)
for i=1:KK
proba[i]=prob^(sum(sequences[i,:]))*(1-prob)^(K-sum(sequences[i,:]))
end
return proba
end

@views function shocks2ind(sequences,shockpath)
(K,T)=size(shockpath)
inds = zeros(T)
shocks=copy(shockpath)
replace!(shocks, 2=>0)
for i=1:T
inds[i]=matchrow(shocks[:,i]',sequences)
end
return convert(Array{Int32}, inds)
end

@views function auxfun(Ind,K,Choices,option)
if option == 1
return 0.5*sum(Choices), 0
else
return 0.6*sum(Choices), K
end
end

@views  function fun_inside(τ,Jᵢ,Ind,R,Π,S,SeqShocks,Dᵢ,K,J,T,option;Φ=2,prob=0.8)
#println("Backward Induction with Dependent Countries")
β = 0.9
R = R[Ind,:]
Π = Π[Ind,1]
S = S[Ind,1]
UnexpShocks = SeqShocks[Ind,:]
Indices =  ones(Int32,2^K,2^K,T)
v = Array{Float32}(undef,2^K,2^K)
Dstar = zeros(Int8,K,1)
Dstar[:,1]=Dᵢ[Ind]
ChoiceSet = Array{Union{Nothing, Int8}}(nothing,2^K,K)
ShockSet = Array{Union{Nothing, Int8}}(nothing,2^K,K)
indexdiff = findall(in(setdiff(Ind,Jᵢ)),Ind)
Dub = ones(J,T)
let int_s = fill(false,K) #pre-allocate output
for j=0:2^K-1
@. int_s = bget(j,0:K-1)
ChoiceSet[j+1,:]=int_s
ShockSet[j+1,:]=int_s
end
end
combindex = Int[ Dub[setdiff(Ind,Jᵢ),τ] == ChoiceSet[i,indexdiff] for i=1:size(ChoiceSet,1) ]
pstatevec=pshockprob(ShockSet,prob)
indshocks=shocks2ind(ShockSet,UnexpShocks)
v_previous = zeros(Float32,2^K)
temp = zeros(Float32, K, 2^K)
Choice = Float32.(ChoiceSet)
Choice = Choice.*ShockSet
if option == 1
x1 = Array{Float32}(undef,2^K)
x2 = Array{Float32}(undef,2^K)

for j=1:2^K
tempeg = auxfun(Ind,K,ChoiceSet[j,:],option)
x1[j]=tempeg
x2[j]=tempeg
end
elseif option == 2
x1 = Array{Float32}(undef,2^K)
x2 = Array{Float32}(undef,2^K)
for j=1:2^K
tempeg =auxfun(Ind,K,ChoiceSet[j,:],option)
x1[j]=tempeg
x2[j]=tempeg
end
end
if option == 1
temp1WC = Array{Float32}(undef,2^K,2^K)
Vlbs = rand(2^K,2^K)
mul!(v_previous, Vlbs, pstatevec)
v_previous .*= β
@fastmath @inbounds for t=0:T-τ
if R[:,T-t]!=0
temp1 = Array{Float32}(undef,2^K,2^K)
temp2 = Array{Float32}(undef,2^K,2^K)
vc = Array{Float32}(undef,2^K,2^K)
temp1 .= v_previous .+ x1
vcWC = Array{Float32}(undef,2^K,2^K)
temp1WC .= v_previous .+ x2
for  i=1:2^K
for iii=1:2^K
for j=1:K
temp[j,iii] = exp(Φ*ChoiceSet[i,j])*R[j,T-t]-Π[j]+S[j]*ChoiceSet[i,j]-(1-ShockSet[iii,j])*99999999999999999999999999999999999999
end
end
mul!(temp2, Choice, temp)
for iii=1:2^K
best = -Inf
bestWC = -Inf
tempind = 1
tempindWC = 1
for ii=1:2^K
if t == T-τ
if combindex[ii] == 1 && indshocks[τ] == iii
vc[ii,iii] = temp2[ii,iii]+temp1[ii]
vcWC[ii,iii] = temp2[ii,iii]+temp1WC[ii]
if vc[ii,iii] > best
best = vc[ii,iii]
tempind = ii
end
if vcWC[ii,iii] > bestWC
bestWC = vcWC[ii,iii]
tempindWC = ii
end
end
else
vc[ii,iii] = temp2[ii,iii]+temp1[ii]
vcWC[ii,iii] = temp2[ii,iii]+temp1WC[ii]
if vc[ii,iii] > best
best = vc[ii,iii]
tempind = ii
end
if vcWC[ii,iii] > bestWC
bestWC = vcWC[ii,iii]
tempindWC = ii
end
end
end
Indices[i,iii,T-t] = tempind
v[i,iii]= bestWC
end
end
mul!(v_previous, v, pstatevec)
v_previous .*= β
end
end
elseif option == 2
temp1PC = Array{Float32}(undef,2^K,2^K)
Vubs = rand(2^K,2^K)
mul!(v_previous, Vubs, pstatevec)
v_previous .*= β
@fastmath @inbounds for t=0:T-τ
if R[:,T-t]!=0
temp1 = Array{Float32}(undef,2^K,2^K)
temp2 = Array{Float32}(undef,2^K,2^K)
vc = Array{Float32}(undef,2^K,2^K)
vcPC = Array{Float32}(undef,2^K,2^K)
temp1 .= v_previous .+ x1
temp1PC  .= v_previous  .+ x2
for  i=1:2^K
for iii=1:2^K
for j=1:K
temp[j,iii] = exp(Φ*ChoiceSet[i,j])*R[j,T-t]-Π[j]+S[j]*ChoiceSet[i,j]-(1-ShockSet[iii,j])*99999999999999999999999999999999999999
end
end
mul!(temp2, Choice, temp)
for iii=1:2^K
best = -Inf
bestPC = -Inf
tempind = 1
tempindPC = 1
for ii=1:2^K
if t == T-τ
if combindex[ii] == 1 && indshocks[τ] == iii
vc[ii,iii] = temp2[ii,iii]+temp1[ii]
vcPC[ii,iii] = temp2[ii,iii]+temp1PC[ii]
if vc[ii,iii] > best
best = vc[ii,iii]
tempind = ii
end
if vcPC[ii,iii] > bestPC
bestPC = vcPC[ii,iii]
tempindPC = ii
end
end
else
vc[ii,iii] = temp2[ii,iii]+temp1[ii]
vcPC[ii,iii] = temp2[ii,iii]+temp1PC[ii]
if vc[ii,iii] > best
best = vc[ii,iii]
tempind = ii
end
if vcPC[ii,iii] > bestPC
bestPC = vcPC[ii,iii]
tempindPC = ii
end
end
end
Indices[i,iii,T-t] = tempind
v[i,iii]= bestPC
end
end
mul!(v_previous, v, pstatevec)
v_previous .*= β
end
end
end
Dpast = matchrow(ones(K,1),ChoiceSet)
Dstar = ChoiceSet[Indices[Dpast,indshocks[τ],τ],:]
return Dstar
end

@views  function fun_outside(τ,Jᵢ,Ind,R,Π,S,SeqShocks,Dᵢ,K,J,T;Φ=2,prob=0.8)
D1 =  fun_inside(τ,Jᵢ,Ind,R,Π,S,SeqShocks,Dᵢ,K,J,T,1)
D2 =  fun_inside(τ,Jᵢ,Ind,R,Π,S,SeqShocks,Dᵢ,K,J,T,2)
return D1, D2
end

#fun_outside(τ,Jᵢ,Ind,R,Π,S,SeqShocks,Dᵢ,K,J,T)
TEST= @time fun_outside(τ,Jᵢ,Ind,R,Π,S,SeqShocks,Dᵢ,K,J,T)

``````

EDIT: This MWE has been edited to include the comments made by @jlapeyre and answer some questions posed by @bcmichael

Have you profiled? That’ll help you identify where to focus. Further, have you seen the general performance tips?

I haven’t profiled. The reason is I have a very hard time understanding the output of the Profiler. And, I believe I am implementing the performance tips. At least most of them. However, I do not know if there are potential gains with additional memory pre-allocation.

Check out ProfileView.jl or if you’re using Juno or JuliaPro, look at its profiler. You could also try explicitly annotating your code with TimerOutputs.jl to get a coarser (but simpler) idea of where your time is being spent.

It’s a bit much to ask for a volunteer to optimize 200+ lines of code for you. This will help you narrow it down to the chunkiest portion.

1 Like

You are making the most common performance mistake: non-constant globals. Fixing this may or may not help. It would be worth spending some time re-reading the performance tips and comparing each one to your code.

You can get more help if you can trim some extraneous things from your code. Ideally, it should be as concise as possible. For example, are all of the `using`'ed packages necessary ?

Fully qualifying functions, eg `APackage.afunction(x, y,...)` makes it easier to help you.

I also suggest you OwnTime.jl, because in my opinion its output in REPL can be a little more clear.

3 Likes

My recollection from looking at this code when you asked about it before was that the runtime was largely dominated by the matrix multiplication. You’re multiplying 2^K x 2^K matrices and you do so 2^K times. That means your code is O(2^4K). You’re already using BLAS to do the matrix multiplication, so it’s going to be very difficult to really make that go faster. Your options are probably down to fiddling with BLAS (ie trying to use MKL vs OpenBLAS or messing with the number of BLAS threads to see if it can go a little faster) or throwing more hardware at it.

I hate to be negative, but you probably shouldn’t expect any miracles here. On my machine a single 2^14 x 2^14 Float32 matrix multiplication took a bit less than a minute, and your code would do 2^14 of them in each iteration of your loop over t. That would take my computer like a week and a half.

1 Like

Can mul! be beaten by implementing LoopVectorization somehow?

Hi, thanks! One question, which global is non-constant?

Maybe? I don’t really know for sure, but I doubt it. Certainly not to the degree that you need to make what you’re asking for feasible. Matrix multiplication is an operation that has had a lot of optimization work put into it. For this to run in a reasonable time frame with K as large as 14 or 15, I’m pretty sure you’re going to need to speed it up by many orders of magnitude. You’re not going to find a different implementation of matrix multiplication that does that.

Because of how your problem scales, increasing K leads to gigantic increases in the run time. The only approach I can think of that will help lift the limit of what K you can handle with this algorithm is more/faster hardware. You probably could get to higher K relatively easily by using a GPU to do the bulk of the computation, but I doubt that would get you up to 14 or 15.

Ok! Perhaps the K equals 14 or 15 was a very ambitious goal. Anyway, do you have any further ideas on how to speed up the code for K = 9/10 (your help has already increased the performance of the code heroically)?

One thing that confuses me a little bit is that the Juno Profiler suggests that the chunks of code that take the most time with K = 9 is the following and not the mul! (see picture)

``````else
vc[ii,iii] = temp2[ii,iii]+temp1[ii]
vcWC[ii,iii] = temp2[ii,iii]+temp1WC[ii]
if vc[ii,iii] > best
best = vc[ii,iii]
tempind = ii
end
if vcWC[ii,iii] > bestWC
bestWC = vcWC[ii,iii]
tempindWC = ii
end
end

``````

``````    for i=1:KK
proba[i]=prob^(sum(sequences[i,:]))*(1-prob)^(K-sum(sequences[i,:]))
end
``````

You’re doing a lot of extra work in those summations, recalculating numbers you’ve already computed.
Also, why make these arrays `Matrix{Union{Nothing,Int8}}` instead of just `Matrix{Int8}`?

Hi! Thanks for the reply. You are totally right that when I switch from option 1 to option 2 I have a bunch of things that I compute twice. I am working towards that right now.

It appears that I slightly misremembered/misread. The multiplication is actually a 2^KxK matrix times a Kx2^K matrix, which isn’t as bad as I thought, although it will still give you big problems if you tried to go up to 14 or 15.

Try replacing `J = 1000` with `const J = 1000`. Do this for every variable at the top. If you get a message `WARNING: redefinition of constant...` rewrite the code so that the global is indeed constant.

Note that they’re explicitly passing these globals as same-named arguments to inner functions, so this isn’t actually using those non-const globals — this is a red herring.

4 Likes

Yeah. Like I said it may or may not help. In fact, probably not. I can see that they are effectively copied to local variables. Unless I missed one. I still think its best to always avoid non-constant globals. Especially in a help-with-performance thread on this forum.

Thanks! It is a very nice visualization. After running the profiler, it seems that the following code is the one that is taking a big amount of time

``````else
vc[ii,iii] = temp2[ii,iii]+temp1[ii]
vcWC[ii,iii] = temp2[ii,iii]+temp1WC[ii]
if vc[ii,iii] > best
best = vc[ii,iii]
tempind = ii
end
if vcWC[ii,iii] > bestWC
bestWC = vcWC[ii,iii]
tempindWC = ii
end
end

``````

I do not want to keep bothering you. However, I found out something a little bit puzzling. First, I re-wrote fun_inside so that the two “options” are done in the same loop, thus recycling some computations at the expense of having extra arrays stored:

``````@views  function fun_insidev2(τ,Jᵢ,Ind,R,Π,S,SeqShocks,Dᵢ,K,J,T;Φ=2,prob=0.8)
#println("Backward Induction with Dependent Countries")
β = 0.9
R = R[Ind,:]
Π = Π[Ind,1]
S = S[Ind,1]
UnexpShocks = SeqShocks[Ind,:]
IndicesLB =  ones(Int32,2^K,2^K,T)
IndicesUB =  ones(Int32,2^K,2^K,T)
vLB = Array{Float32}(undef,2^K,2^K)
vUB = Array{Float32}(undef,2^K,2^K)
DstarLB = zeros(Int8,K,1)
DstarLB[:,1]=Dᵢ[Ind]
DstarUB = copy(DstarLB)
ChoiceSet = Array{Int8}(undef,2^K,K)
ShockSet = Array{Int8}(undef,2^K,K)
indexdiff = findall(in(setdiff(Ind,Jᵢ)),Ind)
Dub = ones(J,T)
let int_s = fill(false,K) #pre-allocate output
for j=0:2^K-1
@. int_s = bget(j,0:K-1)
ChoiceSet[j+1,:]=int_s
ShockSet[j+1,:]=int_s
end
end
combindex = Int[ Dub[setdiff(Ind,Jᵢ),τ] == ChoiceSet[i,indexdiff] for i=1:size(ChoiceSet,1) ]
pstatevec=pshockprob(ShockSet,prob)
indshocks=shocks2ind(ShockSet,UnexpShocks)
v_previousWC = zeros(Float32,2^K)
v_previousBC = zeros(Float32,2^K)
temp = zeros(Float32, K, 2^K)
temp1LB = Array{Float32}(undef,2^K,2^K)
temp1UB = similar(temp1LB)
temp2 = similar(temp1LB)
Choice = Float16.(ChoiceSet)
Choice = Choice.*ShockSet
temp1WC = Array{Float32}(undef,2^K,2^K)
Vlbs = rand(2^K,2^K)
mul!(v_previousWC, Vlbs, pstatevec)
v_previousWC .*= β

temp1PC = Array{Float32}(undef,2^K,2^K)
Vubs = rand(2^K,2^K)
mul!(v_previousBC, Vubs, pstatevec)
v_previousBC .*= β
@fastmath @inbounds for t=0:T-τ
if R[:,T-t]!=0
vcLB = Array{Float32}(undef,2^K,2^K)
vcWC = Array{Float32}(undef,2^K,2^K)
temp1LB .= v_previousWC .+ 10
temp1WC .= v_previousWC .+20

vcUB = Array{Float32}(undef,2^K,2^K)
vcPC = Array{Float32}(undef,2^K,2^K)
temp1UB .= v_previousBC .+ 1
temp1PC  .= v_previousBC .+ 1

for  i=1:2^K
for iii=1:2^K
for j=1:K
temp[j,iii] = exp(Φ*ChoiceSet[i,j])*R[j,T-t]-Π[j]+S[j]*ChoiceSet[i,j]-(1-ShockSet[iii,j])*99999999999999999999999999999999999999
end
end
#println(temp)
mul!(temp2, Choice, temp)
#println(temp2)
for iii=1:2^K
bestLB = -Inf
bestWC = -Inf
tempindLB = 1
tempindWC = 1
bestUB = -Inf
bestPC = -Inf
tempindUB = 1
tempindPC = 1
for ii=1:2^K
if t == T-τ
if combindex[ii] == 1 && indshocks[τ] == iii
vcLB[ii,iii] = temp2[ii,iii]+temp1LB[ii]
vcWC[ii,iii] = temp2[ii,iii]+temp1WC[ii]
if vcLB[ii,iii] > bestLB
bestLB = vcLB[ii,iii]
tempindLB = ii
end
if vcWC[ii,iii] > bestWC
bestWC = vcWC[ii,iii]
tempindWC = ii
end
vcUB[ii,iii] = temp2[ii,iii]+temp1UB[ii]
vcPC[ii,iii] = temp2[ii,iii]+temp1PC[ii]
if vcUB[ii,iii] > bestUB
bestUB = vcUB[ii,iii]
tempindUB = ii
end
if vcPC[ii,iii] > bestPC
bestPC = vcPC[ii,iii]
tempindPC = ii
end
end
else
vcLB[ii,iii] = temp2[ii,iii]+temp1LB[ii]
vcWC[ii,iii] = temp2[ii,iii]+temp1WC[ii]
if vcLB[ii,iii] > bestLB
bestLB = vcLB[ii,iii]
tempindLB = ii
end
if vcWC[ii,iii] > bestWC
bestWC = vcWC[ii,iii]
tempindWC = ii
end
vcUB[ii,iii] = temp2[ii,iii]+temp1UB[ii]
vcPC[ii,iii] = temp2[ii,iii]+temp1PC[ii]
if vcUB[ii,iii] > bestUB
bestUB = vcUB[ii,iii]
tempindUB = ii
end
if vcPC[ii,iii] > bestPC
bestPC = vcPC[ii,iii]
tempindPC = ii
end
end
end
IndicesLB[i,iii,T-t] = tempindLB
vLB[i,iii]= bestWC
IndicesUB[i,iii,T-t] = tempindUB
vUB[i,iii]= bestPC
end
end
mul!(v_previousWC, vLB, pstatevec)
v_previousWC .*= β
mul!(v_previousBC, vUB, pstatevec)
v_previousBC .*= β
end
end
Dpast = matchrow(ones(K,1),ChoiceSet)
DstarLB = ChoiceSet[IndicesLB[Dpast,indshocks[τ],τ],:]
DstarUB = ChoiceSet[IndicesUB[Dpast,indshocks[τ],τ],:]
return DstarLB, DstarUB
end
``````

This is slower that the original version of fun_inside which is puzzling because it uses less allocations and less memory. But what I find a even more puzzling is that when I use the profiler in this new function

``````mul!(temp2, Choice, temp)
``````

This is weird because this is an operation that can be done only once for both “options”.

Okay I looked at this again and I have a couple of comments/suggestions.

First of all, correct me if I am wrong, but the only difference I can see between option 1 and 2 is that you use some different variable names (WC vs PC) and add a different constant to temp1/temp1WC(or temp1PC). Copying your whole inner loop for those small differences is probably not the best way to handle that.

Second, there are a few variables where you allocate a large matrix that you don’t really need. For example you only ever handle one entry in vc/vcWC at a time and only use the values in one iteration of the inner loop. You don’t need to store the values and keep them around. You don’t need temp1 to be 2D if you only ever access one column of it.

Third, as far as I can tell temp1WC always differs from temp1 by the same constant (10). This means that vcWC also differs from vc by that constant, and tempindWC is always the same as tempind. In short the code that the profiler is pointing at is doing twice as much work as it needs to.