Help Improving Performance of a Loop

Hello!
I am trying to make some code more efficient, and I have found a huge bottleneck in the following function of this MWE:

using Random, BenchmarkTools

J = 250
T = 100


Mat1 = rand(J,J)
vec1 = rand(J)

Mat2 = rand(J,J)
vec2 = rand(J)

Mat3 = rand(J,J,T)
vec3 = rand(J,T)



@views function f1(X,Y,Z,params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
    (J,T) = size(X)
    (γoᴱ,γgᴱ,ψgᴱ,κgᴱ,γlᴱ,ψlᴱ,κlᴱ,γaᴱ,ψaᴱ,κaᴱ)=params
    eⱼX = zeros(Float32,J,T)
    eⱼY = zeros(Float32,J,T)
    eⱼZ = zeros(Float32,J,T)
    @fastmath @inbounds for t=1:T, l=1:J, j=1:J
        if l != j
            ejl = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[j])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[j])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[j,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            elj = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[l])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[l])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[l,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            eⱼX[j,t] = eⱼX[j,t] + X[l,t]*(ejl+elj)
            eⱼY[j,t] = eⱼY[j,t] + Y[l,t]*(ejl+elj)
            eⱼZ[j,t] = eⱼZ[j,t] + Z[l,t]*(ejl+elj)
        end
    end
    return eⱼX, eⱼY, eⱼZ
end

params = [0.0,
22.504883,
0.00010256348,
0.0021472168,
28.549805,
1.4460449,
23.164062,
29.42871,
1.4924316,
79.72656]

TEST = @benchmark f1(zeros(Int8,J,60),ones(Int8,J,60),ones(Int8,J,60),params,Mat1,Mat2,vec2,vec1,vec3,Mat3)

The benchmark results are:

BenchmarkTools.Trial: 
  memory estimate:  220.58 KiB
  allocs estimate:  10
  --------------
  minimum time:     198.501 ms (0.00% GC)
  median time:      199.499 ms (0.00% GC)
  mean time:        199.653 ms (0.00% GC)
  maximum time:     202.778 ms (0.00% GC)
  --------------
  samples:          26
  evals/sample:     1

The parameter values in params are meaningless. However, I do not think they should affect performance in any way.

I would really appreciate any recommendation to make this faster!

I am not completely sure the results are the same, so you should check that first (they are certainly not identical, but I don’t know whether the differences matter or whether the cause is something unrelated to the benchmark). What I did was split the loop nest into its three levels and apply LoopVectorization’s @avx macro to the innermost loop. That seems to speed up execution ~7x (I also removed the @views, @inbounds and @fastmath annotations):

With your code (second execution):

julia> @time f1(zeros(Int8,J,60),ones(Int8,J,60),ones(Int8,J,60),params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
  0.207526 seconds (10 allocations: 220.578 KiB)
(Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], Float32[12179.851 12341.136 … 12276.678 12244.565; 12507.148 12552.595 … 12462.509 12452.235; … ; 12946.373 12772.485 … 12852.6045 12584.991; 12812.645 12605.404 … 12843.521 12823.51], Float32[12179.851 12341.136 … 12276.678 12244.565; 12507.148 12552.595 … 12462.509 12452.235; … ; 12946.373 12772.485 … 12852.6045 12584.991; 12812.645 12605.404 … 12843.521 12823.51])

With this code (second run in all cases):

Code
using Random

J = 250
T = 100


Mat1 = rand(J,J)
vec1 = rand(J)

Mat2 = rand(J,J)
vec2 = rand(J)

Mat3 = rand(J,J,T)
vec3 = rand(J,T)


using LoopVectorization

function f1(X,Y,Z,params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
    (J,T) = size(X)
    (γoᴱ,γgᴱ,ψgᴱ,κgᴱ,γlᴱ,ψlᴱ,κlᴱ,γaᴱ,ψaᴱ,κaᴱ)=params
    eⱼX = zeros(Float32,J,T)
    eⱼY = zeros(Float32,J,T)
    eⱼZ = zeros(Float32,J,T)
    for t=1:T
      for l=1:J
        @avx for j=1:J
            ejl = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[j])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[j])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[j,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            elj = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[l])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[l])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[l,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            eⱼX[j,t] = eⱼX[j,t] + X[l,t]*(ejl+elj)
            eⱼY[j,t] = eⱼY[j,t] + Y[l,t]*(ejl+elj)
            eⱼZ[j,t] = eⱼZ[j,t] + Z[l,t]*(ejl+elj)
        end
      end 
    end
    return eⱼX, eⱼY, eⱼZ
end

params = [0.0,
22.504883,
0.00010256348,
0.0021472168,
28.549805,
1.4460449,
23.164062,
29.42871,
1.4924316,
79.72656]

@time f1(zeros(Int8,J,60),ones(Int8,J,60),ones(Int8,J,60),params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
@time f1(zeros(Int8,J,60),ones(Int8,J,60),ones(Int8,J,60),params,Mat1,Mat2,vec2,vec1,vec3,Mat3)



julia> @time f1(zeros(Int8,J,60),ones(Int8,J,60),ones(Int8,J,60),params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
  0.028289 seconds (10 allocations: 220.578 KiB)
(Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], Float32[12634.557 12446.863 … 12334.467 12162.421; 12453.994 12371.172 … 12615.109 12296.466; … ; 12301.93 12348.444 … 12477.65 12510.166; 12712.252 12737.326 … 12705.662 12569.258], Float32[12634.557 12446.863 … 12334.467 12162.421; 12453.994 12371.172 … 12615.109 12296.466; … ; 12301.93 12348.444 … 12477.65 12510.166; 12712.252 12737.326 … 12705.662 12569.258])

By the way, since you are using Float32 arrays inside, if you initialize everything as Float32, you get an additional ~2x speedup:

 0.012065 seconds (10 allocations: 220.578 KiB)
(Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], Float32[12742.045 12706.508 … 12620.173 12605.99; 12646.944 12537.022 … 12411.814 12757.828; … ; 12568.096 12390.755 … 12645.672 12821.599; 13024.046 13046.444 … 12973.952 13356.733], Float32[12742.045 12706.508 … 12620.173 12605.99; 12646.944 12537.022 … 12411.814 12757.828; … ; 12568.096 12390.755 … 12645.672 12821.599; 13024.046 13046.444 … 12973.952 13356.733])

Code
using Random

J = 250
T = 100

Mat1 = rand(Float32,J,J)
vec1 = rand(Float32,J)

Mat2 = rand(Float32,J,J)
vec2 = rand(Float32,J)

Mat3 = rand(Float32,J,J,T)
vec3 = rand(Float32,J,T)

using LoopVectorization

function f1(X,Y,Z,params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
    (J,T) = size(X)
    (γoᴱ,γgᴱ,ψgᴱ,κgᴱ,γlᴱ,ψlᴱ,κlᴱ,γaᴱ,ψaᴱ,κaᴱ)=params
    eⱼX = zeros(Float32,J,T)
    eⱼY = zeros(Float32,J,T)
    eⱼZ = zeros(Float32,J,T)
    for t=1:T
      for l=1:J
        @avx for j=1:J
            ejl = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[j])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[j])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[j,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            elj = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[l])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[l])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[l,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            eⱼX[j,t] = eⱼX[j,t] + X[l,t]*(ejl+elj)
            eⱼY[j,t] = eⱼY[j,t] + Y[l,t]*(ejl+elj)
            eⱼZ[j,t] = eⱼZ[j,t] + Z[l,t]*(ejl+elj)
        end
      end 
    end
    return eⱼX, eⱼY, eⱼZ
end

params = Float32[0.0,
22.504883,
0.00010256348,
0.0021472168,
28.549805,
1.4460449,
23.164062,
29.42871,
1.4924316,
79.72656]

@time f1(zeros(Int8,J,60),ones(Int8,J,60),ones(Int8,J,60),params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
@time f1(zeros(Int8,J,60),ones(Int8,J,60),ones(Int8,J,60),params,Mat1,Mat2,vec2,vec1,vec3,Mat3)




Hi @lmiq, as usual your advice is most useful. The reason the two results do not coincide is that there is an if statement in the inner loop. I expressed that condition as a Boolean factor, (l!=j), so that it works with @avx.


Oh, yes, I forgot I had removed that. Did it work? (How did you solve the @avx problem?)

Yes, both methods yield the same result; see the code below:

function f2(X,Y,Z,params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
    (J,T) = size(X)
    (γoᴱ,γgᴱ,ψgᴱ,κgᴱ,γlᴱ,ψlᴱ,κlᴱ,γaᴱ,ψaᴱ,κaᴱ)=params
    eⱼX = zeros(Float32,J,T)
    eⱼY = zeros(Float32,J,T)
    eⱼZ = zeros(Float32,J,T)
    for t=1:T
      for l=1:J
        @avx for j=1:J
            ejl = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[j])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[j])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[j,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            elj = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[l])*exp(Mat1[j,l]*(-κgᴱ))+γlᴱ*(1+ψlᴱ*vec2[l])*exp(Mat2[j,l]*(-κlᴱ))+γaᴱ*(1+ψaᴱ*vec3[l,t])*exp(Mat3[j,l,t]*(-κaᴱ))
            eⱼX[j,t] = eⱼX[j,t] + X[l,t]*(ejl+elj)*(l!=j)
            eⱼY[j,t] = eⱼY[j,t] + Y[l,t]*(ejl+elj)*(l!=j)
            eⱼZ[j,t] = eⱼZ[j,t] + Z[l,t]*(ejl+elj)*(l!=j)
        end
      end
    end
    return eⱼX, eⱼY, eⱼZ
end
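
A quick sanity check (a sketch; f1 here is the original branching version from the first post, with the same inputs used in the benchmarks):

X = zeros(Int8,J,60); Y = ones(Int8,J,60); Z = ones(Int8,J,60)
out1 = f1(X,Y,Z,params,Mat1,Mat2,vec2,vec1,vec3,Mat3)  # with the if statement
out2 = f2(X,Y,Z,params,Mat1,Mat2,vec2,vec1,vec3,Mat3)  # with the (l!=j) mask
all(a ≈ b for (a,b) in zip(out1, out2))  # true, up to floating-point rounding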

Nice, smart way of dealing with that. I didn’t think of that before.


Is it possible to use LoopVectorization when the function f1 is being parallelized? I am getting a very weird error:

On worker 2:
LoadError: MethodError: no method matching instruction!(::LoopVectorization.LoopSet, ::typeof(Base.maybeview))
Closest candidates are:
  instruction!(::LoopVectorization.LoopSet, !Matched::Symbol) at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:563
  instruction!(::LoopVectorization.LoopSet, !Matched::Expr) at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:555
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:209
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:207 [inlined]
add_operation! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:584
add_parent! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:61
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:238
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:207 [inlined]
add_operation! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:584
add_parent! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:61
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:238
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:207 [inlined]
add_operation! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:584
add_parent! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:61
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:238
add_compute! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\add_compute.jl:207 [inlined]
add_operation! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:584
push! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:655
add_block! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:436 [inlined]
add_loop! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\graphs.jl:539
copyto! at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\constructors.jl:6 [inlined]
LoopSet at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\constructors.jl:52
LoopSet at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\constructors.jl:56
@avx at C:\Users\vincenzi\.julia\packages\LoopVectorization\lmGAZ\src\constructors.jl:137
eval at .\boot.jl:330
#105 at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.3\Distributed\src\process_messages.jl:290
run_work_thunk at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.3\Distributed\src\process_messages.jl:79
run_work_thunk at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.3\Distributed\src\process_messages.jl:88
#98 at .\task.jl:333
in expression starting at C:\Users\vincenzi\Documents\vincenzi\10-Estimation_Revenues_Inside.jl:956

...and 40 more exception(s).

sync_end(::Array{Any,1}) at task.jl:300
macro expansion at task.jl:319 [inlined]
remotecall_eval(::Module, ::Array{Int64,1}, ::Expr) at macros.jl:217
top-level scope at macros.jl:201

Experts can probably give you more specific advice. Indeed, putting the two macros on the same loop returns errors. However, splitting them like this, for example, works:

julia> using LoopVectorization

julia> function inner_sum(x)
         s = 0.
         @avx for i in 1:length(x)
           s += x[i]
         end
         s
       end
       function mysum(x)
         s = zeros(Threads.nthreads())
         Threads.@threads for i in 1:size(x,2)
           y = @view x[:,i]
           s[Threads.threadid()] += inner_sum(y)
         end
         sum(s)
       end
mysum (generic function with 1 method)

julia> function naivesum(x)
         s = 0.
         for j in 1:size(x,2)
            for i in 1:size(x,1)
               s += x[i,j]
            end 
         end
         s
       end
naivesum (generic function with 1 method)

julia> using BenchmarkTools

julia> x = rand(1000,1000);

julia> @btime mysum($x)
  104.049 μs (22 allocations: 3.11 KiB)
499788.61955476226

julia> @btime naivesum($x)
  1.029 ms (0 allocations: 0 bytes)
499788.6195547561

julia> Threads.nthreads()
4


Edit (again): if I try to implement that in a single function I get an error, apparently related to the fact that s is now a vector, but I don’t know why:

julia> function naivesum(x)
         s = zeros(Threads.nthreads())
         Threads.@threads for i in 1:size(x,1)
            @avx for j in 1:size(x,2)
               s[Threads.threadid()] += x[i,j]
            end 
         end
         sum(s)
       end
naivesum (generic function with 1 method)

julia> @btime naivesum($x)
ERROR: TaskFailedException:
UndefVarError: ##indexpr#440 not defined
Stacktrace:
 [1] macro expansion at ./REPL[29]:4 [inlined]
 [2] (::var"#145#threadsfor_fun#17"{Array{Float64,2},Array{Float64,1},UnitRange{Int64}})(::Bool) at ./threadingconstructs.jl:81
 [3] (::var"#145#threadsfor_fun#17"{Array{Float64,2},Array{Float64,1},UnitRange{Int64}})() at ./threadingconstructs.jl:48

Yeah, I do not know why, but my problem was caused by having the @views macro wrapping the whole function.

If my advice is worth anything here, I would suggest not wrapping a whole function like that in any of these macros; one can use views exactly at the points where they are needed, and that makes the code much clearer. Also, be more conservative in using @fastmath, @inbounds, etc. Most of the time these flags do not add a performance gain that is worth the bugs you may be missing. I would leave them for the very, very last moment, when you are absolutely sure your code is very good in all other aspects.

(One can also run Julia with julia --check-bounds=no to skip bounds checking entirely in the final production run, if that is really important.)
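
For example, a minimal sketch of using views only at the points they are needed, instead of wrapping the whole function (colsums is a hypothetical example, not from this thread):

function colsums(A)
    s = zeros(eltype(A), size(A,2))
    for i in 1:size(A,2)
        col = @view A[:,i]  # a view only where the slice is needed; no copy
        s[i] = sum(col)
    end
    return s
end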


Hi! As a side note, I get a ~19x speedup by computing the elementwise exponentials in a separate loop (and using LoopVectorization.jl).

My code:

function f1(X,Y,Z,params,Mat1,Mat2,vec2,vec1,vec3,Mat3)
    (J,T) = size(X)
    (γoᴱ,γgᴱ,ψgᴱ,κgᴱ,γlᴱ,ψlᴱ,κlᴱ,γaᴱ,ψaᴱ,κaᴱ)=params
    eⱼX = zeros(Float32,J,T)
    eⱼY = zeros(Float32,J,T)
    eⱼZ = zeros(Float32,J,T)
    
    em1 = zeros(Float32, J, J)
    em2 = zeros(Float32, J, J)
    
    for l=1:J
        @avx for j=1:J
            em1[j, l] = exp(Mat1[j, l]*(-κgᴱ))
            em2[j, l] = exp(Mat2[j, l]*(-κlᴱ))
        end
    end
        
    for t=1:T, l=1:J
        @avx for j=1:J
            em3 = exp(Mat3[j, l, t]*(-κaᴱ))

            ejl = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[j])*em1[j,l]+γlᴱ*(1+ψlᴱ*vec2[j])*em2[j,l]+γaᴱ*(1+ψaᴱ*vec3[j,t])*em3
            elj = γoᴱ+γgᴱ*(1+ψgᴱ*vec1[l])*em1[j,l]+γlᴱ*(1+ψlᴱ*vec2[l])*em2[j,l]+γaᴱ*(1+ψaᴱ*vec3[l,t])*em3
            eⱼX[j,t] += X[l,t]*(ejl+elj)*(l!=j)
            eⱼY[j,t] += Y[l,t]*(ejl+elj)*(l!=j)
            eⱼZ[j,t] += Z[l,t]*(ejl+elj)*(l!=j)
        end
    end
        
    return eⱼX, eⱼY, eⱼZ
end

A1 = zeros(Int8,J,60)
A2 = ones(Int8,J,60)
A3 = ones(Int8,J,60)

TEST = @benchmark f1($A1,$A2,$A3,$params,$Mat1,$Mat2,$vec2,$vec1,$vec3,$Mat3)

I go from

  memory estimate:  176.11 KiB
  allocs estimate:  6
  --------------
  minimum time:     232.081 ms (0.00% GC)
  median time:      236.289 ms (0.00% GC)
  mean time:        237.704 ms (0.00% GC)
  maximum time:     258.714 ms (0.00% GC)
  --------------
  samples:          22
  evals/sample:     1

to

  memory estimate:  664.64 KiB
  allocs estimate:  10
  --------------
  minimum time:     12.698 ms (0.00% GC)
  median time:      12.988 ms (0.00% GC)
  mean time:        14.293 ms (0.10% GC)
  maximum time:     27.887 ms (0.00% GC)
  --------------
  samples:          350
  evals/sample:     1

It seems that exponentiation is very costly, and you don’t want to compute the same exponentials several times inside the loop.
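
One rough way to see this (a hypothetical micro-benchmark; exact timings depend on the machine and Julia version):

using BenchmarkTools
x = rand(Float32, 250, 250);
@btime exp.($x);     # elementwise exp over the matrix
@btime $x .* 2.0f0;  # a plain multiply over the same data, for scale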



Note that exp is about 4x faster in Julia 1.6, but this is still probably worthwhile.


Thanks! To build intuition: why do you think it is better to do it in a loop rather than to broadcast it, also outside of the loop?

@avx em1 .= exp.(Mat1.*(-κgᴱ)) should be the same speed.
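
Applied to the precomputation above, that would be (a sketch, assuming em1, em2, Mat1, Mat2 and the κ parameters from the earlier post):

@avx em1 .= exp.(Mat1 .* (-κgᴱ))
@avx em2 .= exp.(Mat2 .* (-κlᴱ))

LoopVectorization’s @avx also accepts broadcast expressions, so this keeps the vectorized exp while avoiding the explicit double loop.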


I didn’t play around much with that, but I thought broadcasting twice would take longer than a single loop.

It probably depends on the case, but that can indeed happen:

julia> x = zeros(1_000_000); y = zeros(1_000_000); z = zeros(1_000_000); w = zeros(1_000_000);

julia> function myloop!(x,y,z,w)
         @avx for i in 1:length(x)
           z[i] = x[i]*y[i]
           w[i] = x[i]+y[i]
         end
       end
myloop! (generic function with 1 method)

julia> @btime myloop!($x,$y,$z,$w);
  1.564 ms (0 allocations: 0 bytes)

julia> function mybroadcasts!(x,y,z,w)
          @avx z .= x .* y
          @avx w .= x .+ y
       end
mybroadcasts! (generic function with 1 method)

julia> @btime mybroadcasts!($x,$y,$z,$w);
  2.411 ms (0 allocations: 0 bytes)

(The @avx macro does not play a significant role here, probably because the loops are too simple and get vectorized anyway.)