Hot loop updating array elements with product over other array

How can one improve performance of the following MWE (using 1.0.0)?

function bg_sim_mwe(nB=100,nSamples=100)

    e = 0.9   # ellipticity-like factor weighting the imaginary part
    p = 0.8   # mixing weight between 1 and the phase factor
    nt = 100  # number of time points

    t = range(0,stop=5.0,length=nt)

    # Per-element complex factor evaluated at phase ph.
    V(ph) = 1.0 - p + p*(cos(ph) + 1im*e*sin(ph))
    phi = zeros(nt)

    # ComplexF64, not the abstract `Complex`: a concretely typed array keeps
    # prod/+= below type-stable and allocation-free.
    Vtotal = zeros(ComplexF64,nt)

    for a = 1:nSamples

        om = randn(nB)  # this replaces an actual calculation

        for it = 1:nt
            # NOTE(review): this broadcast requires nB == nt (or nB == 1),
            # since phi has length nt while om has length nB.
            phi .= om.*t[it]
            Vtotal[it] += prod(V,phi)
        end

    end

    # Return the accumulated result instead of discarding it.
    return Vtotal
end

This runs as follows:

julia> @btime bg_sim_mwe()
  18.681 ms (30301 allocations: 1.09 MiB)

In my actual code, nB=10_000 and nSamples=1_000_000.

Here is my current state of insight:

  1. @inbounds in front of the for statement make no difference.
  2. @fastmath in front of the for statement saves 1/3 of the allocations, but does not affect performance beyond a few percent.
  3. julia --track-allocation=user indicates that 960k of allocations happen in the line with the +=. I assume these must occur inside of prod. Since the outer and the inner loop each run 100 times, this means that this line appears to require 96k of allocations. That seems a lot for a 100-element complex vector that’s created on the fly and reduced via prod. Maybe I don’t understand what is going on here.
  4. @code_warntype bg_sim_mwe(100,100) shows some red for the line with the +=: ::Complex and ::Complex{_1} where _1. I don’t understand what to do about this. Vtotal is pre-allocated as a complex-valued array.

Any help is appreciated!

(Edit: Did wrong calculation of allocations.)

Complex is an abstract type, use e.g. ComplexF64.

You can replace the separate calls to sin and cos with one call to s, c = sincos(ph).

Ah, thank you, that makes sense! Replacing Complex by ComplexF64 improved performance somewhat from

julia> @btime bg_sim_mwe()
  18.524 ms (30202 allocations: 1.01 MiB)

to

julia> @btime bg_sim_mwe()
  17.862 ms (102 allocations: 90.14 KiB)

The number of allocations is now really small, so now it appears the number crunching is the bottleneck.
How can I incorporate the `sincos` trick?

# Phase factor computed with a single sincos call; note that sincos(ph)
# returns (sin, cos) in that order.
V(ph) = begin
    s, c = sincos(ph)
    # NOTE(review): 0.8 is the value of `p` hard-coded inline, and `e` must
    # come from the enclosing scope — this snippet is not standalone.
    1.0 - 0.8 + 0.8*(c + 1im*e*s)
end

for example.

Using sincos improves runtime by a few percent:

julia> @btime bg_sim_mwe()
  17.310 ms (102 allocations: 90.14 KiB)

I notice you also replaced p and e by their values. This improves runtime by 1-2%

julia> @btime bg_sim_mwe()
  17.075 ms (102 allocations: 90.14 KiB)

but I don’t think I can do this in my production code.

Oh, sorry, that was just a thing I tested before. Don’t do that :slight_smile:

Since sin and cos are your main bottleneck, you can use a library that is optimized for computing these.

One of those is Yeppp.jl and using my upgrade for new Julia versions (https://github.com/JuliaMath/Yeppp.jl/pull/38) I get

function bg_sim_mwe(nB=100,nSamples=100)
    e = 0.9   # weight of the imaginary part
    p = 0.8   # mixing weight
    nt = 100  # number of time points

    t = range(0,stop=5.0,length=nt)

    # Per-element factor; cos/sin are supplied precomputed (by Yeppp below).
    V(c, s) = 1.0 - p + p*(c + 1im*e*s)
    phi = zeros(nt)

    Vtotal = zeros(ComplexF64,nt)
    _cos = zeros(nt)  # preallocated cosine buffer
    _sin = zeros(nt)  # preallocated sine buffer

    for a = 1:nSamples
        om = randn(nB)  # this replaces an actual calculation
        for it = 1:nt
            phi .= om .* t[it]
            Yeppp.cos!(_cos, phi)
            Yeppp.sin!(_sin, phi)

            # Index the zip tuple instead of splatting (V(v...)); splatting
            # inside the reduction closure was the source of the ~20k
            # allocations reported for this variant.
            Vtotal[it] += prod(cs -> V(cs[1], cs[2]), zip(_cos, _sin))
        end
    end
    # Return the accumulated result instead of discarding it.
    return Vtotal
end
julia> @btime bg_sim_mwe()
  10.253 ms (20104 allocations: 716.89 KiB)

compared to before

julia> @btime bg_sim_mwe()
  21.192 ms (102 allocations: 90.14 KiB)
1 Like

Off topic: Is Yeppp 1.0 ready? Because I get this error when adding Yeppp:

(v1.0) pkg> add Yeppp
 Resolving package versions...
 Installed Yeppp ─ v0.2.0
  Updating `C:\Users\Seif\.julia\environments\v1.0\Project.toml`
  [6310b701] + Yeppp v0.2.0
  Updating `C:\Users\Seif\.julia\environments\v1.0\Manifest.toml`
  [6310b701] + Yeppp v0.2.0
  Building Yeppp → `C:\Users\Seif\.julia\packages\Yeppp\sjf1r\deps\build.log`
┌ Error: Error building `Yeppp`:
│ ERROR: LoadError: LoadError: UndefVarError: is_linux not defined
│ Stacktrace:
│  [1] top-level scope at none:0
│  [2] eval at .\boot.jl:319 [inlined]
│  [3] @static(::LineNumberNode, ::Module, ::Any) at .\osutils.jl:19
│  [4] include at .\boot.jl:317 [inlined]
│  [5] include_relative(::Module, ::String) at .\loading.jl:1038
│  [6] include(::Module, ::String) at .\sysimg.jl:29
│  [7] include(::String) at .\client.jl:388
│  [8] top-level scope at none:0
│ in expression starting at C:\Users\Seif\.julia\packages\Yeppp\sjf1r\deps\build.jl:12
│ in expression starting at C:\Users\Seif\.julia\packages\Yeppp\sjf1r\deps\build.jl:6
└ @ Pkg.Operations C:\cygwin\home\Administrator\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.0\Pkg\src\Operations.jl:1068

As I said,

So add Yeppp#kc/fixes instead.

Thank you so much, that works. Just a warning about Libdl, is it serious?

julia> using Yeppp
[ Info: Precompiling Yeppp [6310b701-1812-5374-a82f-9f6f2d54a40a]
┌ Warning: Package Yeppp does not have Libdl in its dependencies:
│ - If you have Yeppp checked out for development and have
│   added Libdl as a dependency but haven't updated your primary
│   environment's manifest file, try `Pkg.resolve()`.
│ - Otherwise you may need to report an issue with Yeppp
└ Loading Libdl into Yeppp from project dependency, future warnings for Yeppp are suppressed.

Alternatively, you could eliminate the phi array completely in your inner loop

# Devectorized inner loop: eliminates the `phi` buffer entirely and
# accumulates the product scalar-by-scalar.  Relies on `t`, `nt`, `om`,
# `nB`, `p`, `e`, and `Vtotal` from the enclosing function.
for it = 1:nt 
    # NOTE(review): this local variable named `prod` shadows Base.prod
    # inside the loop body.
    prod = one(ComplexF64)
    ti = t[it]
    @simd for j = 1:nB 
       @inbounds ph = om[j] * ti
       # sincos(ph) returns (sin, cos) in that order
       s, c = sincos(ph)
       prod *= (1 - p) + p * complex(c, e*s)
    end
    Vtotal[it] += prod
end

Would be interesting to see how this compares.

julia> @btime bg_sim_mwe()
  17.201 ms (102 allocations: 90.14 KiB)

Interestingly, I got (using new Yeppp master, Mac OSX, Julia 0.7):

julia> function f!(dest, src) Yeppp.cos!(dest, src) end
f! (generic function with 1 method)

julia> function g!(dest, src) dest .= cos.(src) end
g! (generic function with 1 method)

julia> A = rand(100); B = similar(A);

julia> @btime f!($B, $A);
  608.676 ns (0 allocations: 0 bytes)

julia> @btime g!($B, $A);
  542.580 ns (0 allocations: 0 bytes)

So Julia’s broadcasting is faster!

No, at least not on my machine (Julia 1.0 Windows 10):

julia> function f!(dest, src) Yeppp.cos!(dest, src) end
f! (generic function with 1 method)

julia> function g!(dest, src) dest .= cos.(src) end
g! (generic function with 1 method)

julia> A = rand(100); B = similar(A);

julia> @btime f!($B, $A);
  257.266 ns (0 allocations: 0 bytes)

julia> @btime g!($B, $A);
  581.227 ns (0 allocations: 0 bytes)

Interesting. Might be a good idea to bring back Vectorize.jl and hook it into the new broadcasting machinery.

Thanks everybody for the suggestions! Here are the comparisons on my computer. Surprisingly, the differences are overall small. Any other approaches?

julia> @btime bg_sim_mwe()  # original function
  18.743 ms (30202 allocations: 1.01 MiB)

julia> @btime bg_sim_mwe1()  # fixing Complex -> ComplexF64
  18.104 ms (102 allocations: 90.14 KiB)

julia> @btime bg_sim_mwe2()  # using sincos() instead of sin() and cos()
  17.420 ms (102 allocations: 90.14 KiB)

julia> @btime bg_sim_mwe3()  # using cos! and sin! from latest Yeppp#master
  20.369 ms (20104 allocations: 716.89 KiB)

julia> @btime bg_sim_mwe4()  # devectorize, use @simd, use sincos()
  17.831 ms (102 allocations: 90.14 KiB)

Here is the code for all functions:

# Original MWE, kept unmodified as the "before" baseline for the benchmark
# comparison.  Its main performance problem is `zeros(Complex,nt)`:
# `Complex` is an abstract type, so the array is not concretely typed and
# the `+=` line allocates (see bg_sim_mwe1 for the ComplexF64 fix).
function bg_sim_mwe(nB=100,nSamples=100)
    e = 0.9
    p = 0.8
    nt = 100
    t = range(0,stop=5.0,length=nt)
    # Per-element complex factor at phase ph.
    V(ph) = 1.0 - p + p*(cos(ph) + 1im*e*sin(ph))
    phi = zeros(nt)
    # `Complex` here is abstract — the source of the 30k allocations.
    Vtotal = zeros(Complex,nt)
    for a = 1:nSamples
        om = randn(nB)  # this replaces an actual calculation
        for it = 1:nt
            # NOTE(review): broadcast assumes nB == nt (or nB == 1).
            phi .= om.*t[it]
            Vtotal[it] += prod(V,phi)
        end
    end
end


function bg_sim_mwe1(nB=100,nSamples=100)
    # Same model as bg_sim_mwe, but the accumulator array is concretely
    # typed (ComplexF64), which removes almost all allocations.
    ecc = 0.9
    weight = 0.8
    nt = 100
    times = range(0,stop=5.0,length=nt)
    # Per-element complex factor at phase ph.
    V(ph) = 1.0 - weight + weight*(cos(ph) + 1im*ecc*sin(ph))
    phases = zeros(nt)
    Vtotal = zeros(ComplexF64,nt)
    for sample in 1:nSamples
        om = randn(nB)  # stand-in for the real calculation
        for it in eachindex(times)
            phases .= om .* times[it]
            Vtotal[it] += prod(V, phases)
        end
    end
end


function bg_sim_mwe2(nB=100,nSamples=100)
    # Variant of bg_sim_mwe1 that obtains both trig values with a single
    # sincos call per element; sincos(ph) returns (sin, cos) in that order.
    ecc = 0.9
    weight = 0.8
    nt = 100
    tgrid = range(0,stop=5.0,length=nt)
    function V(ph)
        s, c = sincos(ph)
        return 1.0 - weight + weight*(c + 1im*ecc*s)
    end
    buf = zeros(nt)
    Vtotal = zeros(ComplexF64,nt)
    for _ in 1:nSamples
        om = randn(nB)  # stand-in for the real calculation
        for it in eachindex(tgrid)
            buf .= om .* tgrid[it]
            Vtotal[it] += prod(V, buf)
        end
    end
end

# Yeppp variant: cos/sin computed in bulk by Yeppp.cos!/Yeppp.sin!
# (requires the third-party Yeppp package, not the standard library).
function bg_sim_mwe3(nB=100,nSamples=100)
    e = 0.9
    p = 0.8
    nt = 100
    t = range(0,stop=5.0,length=nt)
    # ph is accepted but unused; kept so the zip tuples can be splatted.
    V(ph, c, s) = 1.0 - p + p*(c + 1im*e*s)
    phi = zeros(nt)
    Vtotal = zeros(ComplexF64,nt)
    _cos = zeros(nt)  # preallocated cosine buffer
    _sin = zeros(nt)  # preallocated sine buffer
    for a = 1:nSamples
        om = randn(nB)  # this replaces an actual calculation
        for it = 1:nt
            phi .= om .* t[it]
            Yeppp.cos!(_cos, phi)
            Yeppp.sin!(_sin, phi)
            # NOTE(review): the splat `V(v...)` in this closure is the likely
            # source of the ~20k allocations reported for this variant.
            Vtotal[it] += prod(v -> V(v...), zip(phi, _cos, _sin))
        end
    end
end

function bg_sim_mwe4(nB=100,nSamples=100)
    # Devectorized variant: no temporary phase array; the inner @simd loop
    # multiplies the per-element factors into a scalar accumulator.
    e = 0.9   # weight of the imaginary part
    p = 0.8   # mixing weight
    nt = 100  # number of time points
    t = range(0,stop=5.0,length=nt)
    Vtotal = zeros(ComplexF64,nt)
    # Removed the unused `V` closure and the unused `phi` buffer from the
    # original listing — this variant computes everything inline.
    for a = 1:nSamples
        om = randn(nB)  # this replaces an actual calculation
        for it = 1:nt
            # Named `acc` (not `prod`) so Base.prod is not shadowed.
            acc = one(ComplexF64)
            ti = t[it]
            @simd for b = 1:nB
               @inbounds ph = om[b] * ti
               # sincos(ph) returns (sin, cos) in that order
               s, c = sincos(ph)
               acc *= (1 - p) + p * complex(c, e*s)
            end
            Vtotal[it] += acc
        end
    end
end