# How to optimize these nested loops?

The code is:

```julia
nE = 2
eMax = 3.0
epsilon = linspace(0, eMax, nE)
```

```julia
function transition_momentum()
    D = zeros(Complex128, nE, nE)

    x, p = xpgrid(nx, dx, dp)
    for jE in 1:nE
        for iE in 1:nE
            for jx in 1:nx
                D[iE, jE] += sum( cis.( x * √(2*epsilon[iE]) + x[jx] * √(2*epsilon[jE]) ) .* (x + x[jx]) .* conj.(@view u0[1:nx, jx]) )
            end
        end
        println("jE=", jE)
    end
    save("data/d.jld", "D", D)
end

@time @allocated transition_momentum()
```


Here, `u0 :: Array{Complex128, 2}`.
This function costs a lot of memory when nE is large. Can I optimize this code? How?

You have a lot of variables of unknown (to the compiler) type.
Assuming xpgrid is type stable, this should be better:

```julia
nE = 2
eMax = 3.0
epsilon = linspace(0, eMax, nE)

function transition_momentum(nE, nx, dx, dp, eMax, epsilon, u0)
    D = zeros(Complex{Float64}, nE, nE) # changed because Complex128 will soon be deprecated

    root_2eps = @. √(2epsilon)
    x, p = xpgrid(nx, dx, dp) # what is this?
    buffer = similar(x, Complex{Float64})
    @inbounds for jE in 1:nE
        for iE in 1:nE, jx in 1:nx
            @. buffer = cis( x * root_2eps[iE] + x[jx] * root_2eps[jE] ) * (x + x[jx]) * conj(@view u0[1:nx, jx])
            D[iE, jE] += sum( buffer )
        end
        println("jE=", jE)
    end
    save("data/d.jld", "D", D)
end

@time @allocated transition_momentum(nE, nx, dx, dp, eMax, epsilon, load("data/gs.jld")["u"])
```


Basically, I just made the global variables arguments to the function.
If you really want to load u0 inside your function, then annotate it:

```julia
u0 = (load("data/gs.jld")["u"])::Array{Complex128, 2}
```
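The advice about passing globals as arguments can be illustrated with a toy example (the names here are hypothetical, not from the thread):

```julia
# Toy illustration (hypothetical names): a non-constant global has no
# fixed type as far as the compiler is concerned, so `f_global` cannot
# be fully optimized; passing the same data as an argument lets the
# compiler specialize on its concrete type.
a = collect(1.0:1000.0)

f_global() = sum(x^2 for x in a)   # `a` is an untyped global here
f_arg(v)   = sum(x^2 for x in v)   # `v` has a concrete type here

@assert f_global() == f_arg(a)
# `@code_warntype f_global()` shows `Any`s; `@code_warntype f_arg(a)` does not.
```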


Sorry, I forgot to give the implementation of the function xpgrid:

```julia
function xpgrid(nx::Int, dx::Float64, dp::Float64)
    x = Array((-nx/2+1 : nx/2) * dx)
    p = Array((-nx/2+1 : nx/2) * dp)
    p = circshift(p, round(Int, nx/2))
    x, p
end
```
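For a small nx, the grids this returns look like the following (my own illustration, not from the thread; note that the circshift rotates p by half its length):

```julia
function xpgrid(nx::Int, dx::Float64, dp::Float64)
    x = Array((-nx/2+1 : nx/2) * dx)
    p = Array((-nx/2+1 : nx/2) * dp)
    p = circshift(p, round(Int, nx/2))
    x, p
end

x, p = xpgrid(4, 0.5, 0.25)
@assert x == [-0.5, 0.0, 0.5, 1.0]
# circshift([-0.25, 0.0, 0.25, 0.5], 2) rotates the vector by two slots:
@assert p == [0.25, 0.5, -0.25, 0.0]
```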


I added the rest of the missing variables to the function transition_momentum, so my version should be type stable. Use @code_warntype to be sure.
But you should post enough code (e.g., what are nx, dx, and dp?). And we can use something like:

```julia
u0 = @. randn(nx,nx) + im*randn(nx,nx)
```


for u0 so we don’t have to bother with the file.

Thank you. The code is faster now.
The @time result was:

```
8.406818 seconds (4.80 M allocations: 2.957 GiB, 6.25% gc time)
```

and now it is:

```
6.706108 seconds (3.13 M allocations: 1.903 GiB, 6.72% gc time)
```

without the root_2eps strategy.
I got an error with that strategy:

```
ERROR: LoadError: UndefVarError: @.√ not defined
Stacktrace:
 include(::String) at .\sysimg.jl:14
 process_options(::Base.JLOptions) at .\client.jl:305
 _start() at .\client.jl:371
```


The versioninfo is:

```
Julia Version 0.6.2
Commit d386e40c17* (2017-12-13 18:08 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz
  WORD_SIZE: 64
  BLAS: mkl_rt
  LAPACK: mkl_rt
  LIBM: libopenlibm
  LLVM: libLLVM-3.9.1 (ORCJIT, haswell)
```

```julia
const nx = 2^12
const dx = 0.2
const dp = 2pi / (nx * dx)
```

and u0 is read from a file using the JLD package.

It should be fine even without root_2eps:

```
julia> x = 1:20;

julia> @. x + √(2x)
20-element Array{Float64,1}:
  3.44949
  4.44949
  5.44949
  6.44949
  7.44949
  8.44949
  9.44949
 10.4495
 11.4495
 12.4495
 13.4495
 14.4495
 15.4495
 16.4495
 17.4495
 18.4495
 19.4495
 20.4495
 21.4495
 22.4495

julia> versioninfo()
Julia Version 0.6.2
Commit d386e40* (2017-12-13 18:08 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: AMD Ryzen Threadripper 1950X 16-Core Processor
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT NO_LAPACK NO_LAPACKE ZEN)
  LAPACK: libopenblas
  LIBM: libopenlibm
  LLVM: libLLVM-3.9.1 (ORCJIT, generic)
```
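A likely explanation for the `@.√ not defined` error above (an assumption, since the failing line wasn't posted): without a space after `@.`, the parser reads `@.√` as the name of a single macro, which does not exist. With a space it parses as intended:

```julia
epsilon = [0.0, 1.5, 3.0]

# `@.√(2epsilon)` (no space) would be parsed as a macro literally
# named `@.√`; a space after `@.` fixes the parse:
root_2eps = @. √(2epsilon)

@assert root_2eps ≈ sqrt.(2 .* epsilon)
```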


Yes, I saw u0 is read from a file.
But I would recommend adding a line like mine that randomly generates it to the code you posted, so everyone reading the thread can easily copy and paste your code to test it.

Because I couldn’t, I just looked at it and guessed, without actually being able to / bothering to test anything.

EDIT:

```julia
nE = 2
eMax = 3.0
epsilon = linspace(0, eMax, nE)
const nx = 2^12
const dx = 0.2
const dp = 2pi / (nx * dx)
u0 = @. randn(nx,nx) + im*randn(nx,nx)

function xpgrid(nx::Int, dx::Float64, dp::Float64)
    x = Array((-nx/2+1 : nx/2) * dx)
    p = Array((-nx/2+1 : nx/2) * dp)
    p = circshift(p, round(Int, nx/2))
    x, p
end

function transition_momentum(nE, nx, dx, dp, eMax, epsilon, u0)
    D = zeros(Complex{Float64}, nE, nE) # Complex128 will soon be deprecated

    root_2eps = @. √(2epsilon)
    x, p = xpgrid(nx, dx, dp)
    @inbounds for jE in 1:nE
        for iE in 1:nE, jx in 1:nx
            temp = zero(Complex{Float64})
            for i ∈ eachindex(x)
                temp += cis( x[i] * root_2eps[iE] + x[jx] * root_2eps[jE] ) * (x[i] + x[jx]) * conj(u0[i, jx])
            end
            D[iE, jE] += temp
        end
        println("jE=", jE)
    end
    #save("data/d.jld", "D", D)
    D
end

@time transition_momentum(nE, nx, dx, dp, eMax, epsilon, u0)
@time transition_momentum(nE, nx, dx, dp, eMax, epsilon, u0)
```


The second run results in:

```
julia> @time transition_momentum(nE, nx, dx, dp, eMax, epsilon, u0)
jE=1
jE=2
  1.586719 seconds (31 allocations: 97.219 KiB)
2×2 Array{Complex{Float64},2}:
  5.33832e5-1.51294e6im   1.93597e6+2.96262e5im
 -5.82017e5-1.36504e6im  -4.47849e5-2.37888e5im
```


```julia
function transition_momentum()
    nE = 2
    eMax = 3.0
    epsilon = linspace(0, eMax, nE)

    D = zeros(Complex128, nE, nE)

    x, p = xpgrid(nx, dx, dp)
    buffer = zeros(Complex128, nx)
    # root_2eps = @. √(2epsilon)
    @inbounds for jE in 1:nE
        for iE in 1:nE, jx in 1:nx
            buffer .= cis.( x * √(2*epsilon[iE]) + x[jx] * √(2*epsilon[jE]) ) .* (x + x[jx]) .* conj.(@view u0[1:nx, jx])
            # @. buffer = cis( x * √(2*epsilon[iE]) + x[jx] * √(2*epsilon[jE]) ) * (x + x[jx]) * conj(@view u0[1:nx, jx])
            D[iE, jE] += sum( buffer )
        end
        println("jE=", jE)
    end
    save("data/d.jld", "D", D)
    nothing
end
```


It’s faster now, and the @code_warntype result seems fine. However, the memory allocation is still huge, especially when nE is larger. For instance, when nE=8, the allocation is 24.47 GiB.

Is it possible to reduce the memory allocation further?
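One likely reason the allocations stay large in the version above (my reading, not stated in the thread): even though `buffer` is reused, the sub-expressions `x * √(2*epsilon[iE])` and `(x + x[jx])` are not fused into the dotted assignment, so every inner iteration still builds fresh temporary arrays. Fully fusing the right-hand side avoids them; a small sketch in current Julia syntax, with made-up constants `c` and `x0`:

```julia
x  = randn(256)
u  = randn(256)
buffer = zeros(Complex{Float64}, 256)
c, x0 = 1.3, 0.7

# Un-fused: `x * c` and `x .+ x0` each allocate a temporary array
# on every evaluation, even though `buffer` itself is reused.
buffer .= cis.(x * c) .* (x .+ x0) .* u
unfused = copy(buffer)

# Fully fused: `@.` dots every operation, so the whole right-hand
# side is computed element by element with no temporaries.
@. buffer = cis(x * c) * (x + x0) * u

@assert buffer ≈ unfused
```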

Please give an example that doesn’t require reading any external files. I want to be able to copy-paste it into the terminal and have it just run.


On my last version (where I added calls to randn so we don’t have to load files), I get:

```
julia> @time transition_momentum(8, nx, dx, dp, eMax, epsilon, u0)
jE=1
jE=2
jE=3
jE=4
jE=5
jE=6
jE=7
jE=8
 30.421795 seconds (80 allocations: 99.969 KiB)
8×8 Array{Complex{Float64},2}:
  5.33832e5-1.51294e6im  -1.54329e6+2.22141e6im  -4.31243e5-5.86691e5im  …   -1.1312e6-696146.0im    1.93597e6+2.96262e5im
  1.75447e6-1.93466e5im  -2.05835e6-4.62485e5im   2.37408e6+61961.5im       -6.39608e5-4.80046e5im   5.35422e5+9.48658e5im
 -1.17199e6+1.25909e6im    7.6453e5-5.34269e5im   2.74881e6-1.39307e5im     -1.61246e6+6.512e5im     3.62134e5-1.92652e5im
  -852106.0+1.43851e6im     88059.1-1.72902e6im   1.51782e6-7.26979e5im        36996.0-1.0321e6im    7.45749e5+7.68406e5im
 -3.16811e5-1.29502e5im  -1.41417e6+4.29387e5im  -3.35235e6+5.73937e5im      -2.8774e6-9.57062e5im   1.83652e6+9.30584e5im
  -4.3543e5-1.91258e6im     2.776e6+1.91286e6im  -1.66534e6+7.29206e5im  …   2.33819e6-6.04518e5im  -1.34943e6-1.27409e6im
 -2.45719e6+5.22639e5im  -8.39155e5-8.91657e5im   6.29999e5+6.64297e5im     -1.83033e6-1.63293e6im   7.74408e5-1.33113e6im
 -5.82017e5-1.36504e6im     37593.4+1.6048e6im   -8.53199e5-248828.0im       1.56588e6-1.75479e6im  -4.47849e5-2.37888e5im
```


Less than 100 KiB allocated with nE=8.

Thank you very much, that is a big improvement for my code.

Sorry, I should’ve given an example without the external-file requirement. The revised version given by @Elrod solved my problem.

No problem.

Watch out for all the places where you’re allocating a new array. Using @view like you did helps, but often the easiest way is to be explicit with a for loop instead of a broadcast.
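As a sketch of that last point (my own example, not from the thread): the broadcast version materializes an intermediate array before summing, while an explicit loop accumulates a scalar and allocates nothing per element:

```julia
x = randn(1024)

# Broadcast: `cis.(v) .* v` materializes a temporary array, then sums it.
f_broadcast(v) = sum(cis.(v) .* v)

# Explicit loop: accumulate into a scalar, no per-element allocation.
function f_loop(v)
    s = zero(Complex{Float64})
    @inbounds for vi in v
        s += cis(vi) * vi
    end
    s
end

@assert f_broadcast(x) ≈ f_loop(x)
```

In current Julia, `sum(vi -> cis(vi) * vi, x)` achieves the same allocation-free result without writing the loop by hand.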