Help for improving the performance of my algorithm

mike_k · August 17, 2023, 2:25pm

I aim to develop a package for solving the standard quadratic optimization problem which maximizes (minimizes) a quadratic form over the unit simplex. That is \max x^T Q x : \sum_{i \in [1:n]} x_i = 1, x\in \mathbb{R}_+^n where Q \in \mathbb{R}^{n \times n} and symmetric. I use an algorithm called infection immunization dynamics which already works quite well. Without going into details on that algorithm, I wonder you have some tips how I can improve the performance of my function solve_StQP() in the MWE below. For n=400 I obtain

0.030732 seconds (17.69 k allocations: 16.187 MiB)

and for n=4000

1.220787 seconds (189.63 k allocations: 1.221 GiB, 4.55% gc time)

Thank you very much for your inupts. A (quite long) MWE is here:

using LinearAlgebra
using Random

# set parameters
ndims = 400             # dimensions
mtype = "PSD"           # matrix type "PSD" (positive semidefinite), "NSD" (negative semidefinite), "IND" (indefinite)
sense = :minimize       # objective sense (:minimize or :maximize)
Random.seed!(0)         # random seed


# THIS SHOULD BE IMPROVED
function solve_StQP(Q, sense::Symbol; tol::Float64=0.000001, x=ones(Float64,size(Q)[1]) ./ size(Q)[1], verbose=1, epssize=5)       
    if sense == :minimize
        Q = -Q
    end
    if verbose==1
        println("starting point: ", x)
    end
    
    #start vectors and matrices
    Q_x::Vector{Float64}    = Q * x             # used to avoid excessive number of multiplications later in loop (this vector will be updated instead)
    p::Float64              = x' * Q_x          # current payoff  x'Qx                  
    t::Vector{Float64}      = Q_x .- p          # tempvector = (e^i - x | x) for all i    

    # define other start values
    n::Int              = size(Q)[1]            # dimension of decision vector
    epsilon::Float64    = Inf                   # measures violation of Nash condition. If epsilon = 0, we have a Nash equilibrium.
    obj::Float64        = 0.0                   # objective value
    prec::Float64       = epssize * eps(Float64)# obj epsilon surrounding around zero (to avoid numerical instabilities)
    i::Int              = 0                     # index of infective pure / or co-strategy
    tempval::Float64    = 0.0                   # temp value used for identifying the largest abs(t[i]), thus, best direction i  
    delta::Float64      = 1.0                   # fraction of mutants that are infective
    niterations::Int    = 0                     # number of iterations
    num::Float64        = 0.0                   # dummy numerator
    denom::Float64      = 1.0                   # dummy denominator
    k::Float64          = 0.0                   # factor needed later
    #errorcount::Int     = 0

    #start loop
    while epsilon > tol  
        niterations += 1         
    
        # reset values
        i       = 0                                                        # index of pure / or co strategy
        tempval = 0                                                        # temp value used for identifying the largest abs(t[i]), thus, direction (or pure/co-strategy) i                                        
            
        # find most infective pure (co-)strategy i
        for j in eachindex(x)
            if (t[j] > prec) && (tempval < t[j] - prec)
                i = j
                tempval = t[i]
            elseif (t[j] < -prec) && (x[j] > prec) && (tempval < abs(t[j] - prec))
                i = j
                tempval = abs(t[i])
            else                         
                # do not update i
            end
        end
            
        if i == 0                                                       # break loop, if no strategy was found (we are already optimal)
            break
        end

        # select (infective) strategy, update delta, and inject mutants
        delta = 1.0
        if t[i] > prec                                                  # selection: pure strategy i is infective
            num = -t[i]                                                 # numerator
            denom = Q[i,i] - 2 * Q_x[i] + p                             # denominator 
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            Q_x = delta * ( Q[:,i] - Q_x ) + Q_x
            x = delta * ( LinearAlgebra.I[1:n,i] - x ) + x              # inject mutants
        elseif (t[i] < -prec) && (x[i] > prec)                          # selection: co-strategy of i is infective
            k = x[i] / ( x[i]-1 )
            num = k * (-t[i])                                           # numerator
            denom = k^2 * ( Q[i,i] - 2 * Q_x[i] + p )                   # denominator
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            Q_x = k * delta * ( Q[:,i] - Q_x ) + Q_x 
            x = delta * k * ( LinearAlgebra.I[1:n,i] -x ) +x            # inject mutants
        else                                                            # selection: population x is immune against all mutants (=nash equilibrium)
            Q_x = Q * x
        end                

        # update current payoff, and temp vector
        p = x' * Q_x                                                      # current payoff x'Qx
        t = Q_x .- p                                                      # tempvector   

        # update violation of Nash condition (used as stopping criteria).
        epsilon=0
        for i in 1:n
            epsilon = epsilon + ( min( x[i], -t[i] ) )^2 
        end

        # optional: print current payoff, and Nash condition violation
        if verbose == 1
            cur_obj = 0.0
            if sense == :maximize
                cur_obj = round(p, digits=4)
            else
                cur_obj = -round(p, digits=4)
            end
            println("obj: $cur_obj \t\tepsilon: $epsilon")
        end
    end# while

    
    # set objective
    if sense == :maximize
        obj = p
    else
        obj = -p
    end

    # round output vector
    x = round.(x, digits=6)

   # @info "# errorcount / niterations: $errorcount / $niterations"

    return obj, x
end

# DUMMY FUNCTIONS
function createSymmetricMatrix(ndims::Int, type::String="IND")
    A = LinearAlgebra.Symmetric(rand(ndims,ndims)) * 10
    if type == "PSD"
        A = A*A'
    end
    if type == "NSD"
        A = -A*A'
    end

    return LinearAlgebra.Symmetric(A)
end

function createRandomStartingPoint(Q)
    n = size(Q)[1]
    x = rand(n)
    x = x ./ sum(x)
    return x
end

function main(ndims, mtype, sense)
    Q = createSymmetricMatrix(ndims, mtype)
    xstart = createRandomStartingPoint(Q)       #optional random starting point

    # the performance of this function should be improved
    @time obj, x = solve_StQP(Q, sense; verbose=1)
    #@time obj, x = solve_StQP(Q, sense; x=xstart)      #optional with different starting point

    # print results
    println("obj: $obj")
    #println("x: $x")
    return
end
main(ndims, mtype, sense)

LaurentPlagne · August 17, 2023, 3:16pm

At first profile observation there is no obvious (perf) optimization mistake.
The profile shows (for n=4000) that the time is mostly spent at line 78 and 79.

LoopVectorization maybe interesting to test on these lines (first rewrite them as loops).

I added a @views macro at the begining of the function:

# THIS SHOULD BE IMPROVED
@views function solve_StQP(Q, sense::Symbol; tol::Float64=0.000001, x=ones(Float64,size(Q)[1]) ./ size(Q)[1], verbose=1, epssize=5)

And disabling the verbose flag it takes 435 ms on my laptop.

aegger · August 17, 2023, 3:33pm

It seems to me that you are updating the same variables with each iteration. It is my understanding that this code allocates temporary variables and therefore memory at each iteration instead of operating in-place.
Some links concerning in-place operations:
Mathematical Operations and Elementary Functions · The Julia Language,
Functions · The Julia Language and More Dots: Syntactic Loop Fusion in Julia (julialang.org)
Hope this helps!

mikmoore · August 17, 2023, 4:35pm

To echo the above, it appears you have a significant number of unnecessary allocations. These may be coming with a significant runtime cost. The linked articles on broadcasting are where you should start.

Since an earlier commenter pointed out a couple of hot lines in the profile, here is an example of reworking that part in particular:

# original
Q_x = k * delta * ( Q[:,i] - Q_x ) + Q_x 
x = delta * k * ( LinearAlgebra.I[1:n,i] -x ) +x

# option 1
Q_x .= (k * delta) .* (@view(Q[:,i]) .- Q_x) .+ Q_x
x .= (k * delta) .* ((1:n .== i) .- x) .+ x

# option 2
Q_x .= (k * delta) .* (@view(Q[:,i]) .- Q_x) .+ Q_x
x .*= 1 - (k * delta)
x[i] += (k * delta)

I wrote these without testing that they produce identical results to the original, so check them to make sure I got them right.

A key point is that they use .= to overwrite the left-hand array in-place. When they were written with =, they were creating a new array and then binding the left-hand variable to it. Other array operations (like scalar multiplication and array addition) also tend to allocate intermediate arrays (when not broadcast with .), so it’s important to get the dots everywhere they’re needed. Slices like Q[:,i] allocate new arrays, but a view via @view(Q[:,i]) or view(Q,:,i) will wrap the same memory as the original so will not allocate a new array.

There are some other places where these in-place updates should be applied (e.g., t = Q_x .- p could be t .= Q_x .- p). You can look into the @. macro to help with this. You can also replace Q_x = Q * x with mul!(Q_x, Q, x) to avoid that allocation.

Stylistically, your habit of type-declaring variables is fairly un-idiomatic. It’s a habit people bring from C/C++, but in Julia there is very rarely a performance benefit. Would it be so unreasonable to run the same computation with Float32 values? However, there are a few places in your current implementation where it saves you from what would otherwise be type instabilities and it would take some extra work to resolve those spots, so I won’t suggest you change it now.

LaurentPlagne · August 18, 2023, 3:06pm

Based on

@mikmoore’s option3
using a Dense Matrix instead of Symmetric Matrix Q=Array(Q)
using LoopVectorization

     @turbo Q_x .= (k * delta) .* (Q[:,i] .- Q_x) .+ Q_x
     @turbo x .*= 1 - (k * delta)
     x[i] += (k * delta)

I obtain 162 ms (n=4000) instead of 435 ms.

I let the OP assess the result correctness…

complete code

@views function solve_StQP(Q, sense::Symbol; tol::Float64=0.000001, x=ones(Float64,size(Q)[1]) ./ size(Q)[1], verbose=0, epssize=5)       
    verbose=0
    if sense == :minimize
        Q = -Q
    end
    if verbose==1
        println("starting pointxx: ", x)
    end
    
    #start vectors and matrices
    Q_x::Vector{Float64}    = Q * x             # used to avoid excessive number of multiplications later in loop (this vector will be updated instead)
    p::Float64              = x' * Q_x          # current payoff  x'Qx                  
    t::Vector{Float64}      = Q_x .- p          # tempvector = (e^i - x | x) for all i    

    # define other start values
    n::Int              = size(Q)[1]            # dimension of decision vector
    epsilon::Float64    = Inf                   # measures violation of Nash condition. If epsilon = 0, we have a Nash equilibrium.
    obj::Float64        = 0.0                   # objective value
    prec::Float64       = epssize * eps(Float64)# obj epsilon surrounding around zero (to avoid numerical instabilities)
    i::Int              = 0                     # index of infective pure / or co-strategy
    tempval::Float64    = 0.0                   # temp value used for identifying the largest abs(t[i]), thus, best direction i  
    delta::Float64      = 1.0                   # fraction of mutants that are infective
    niterations::Int    = 0                     # number of iterations
    num::Float64        = 0.0                   # dummy numerator
    denom::Float64      = 1.0                   # dummy denominator
    k::Float64          = 0.0                   # factor needed later
    #errorcount::Int     = 0

    #start loop
    while epsilon > tol  
        niterations += 1         
    
        # reset values
        i       = 0                                                        # index of pure / or co strategy
        tempval = 0                                                        # temp value used for identifying the largest abs(t[i]), thus, direction (or pure/co-strategy) i                                        
            
        # find most infective pure (co-)strategy i
        for j in eachindex(x)
            if (t[j] > prec) && (tempval < t[j] - prec)
                i = j
                tempval = t[i]
            elseif (t[j] < -prec) && (x[j] > prec) && (tempval < abs(t[j] - prec))
                i = j
                tempval = abs(t[i])
            else                         
                # do not update i
            end
        end
            
        if i == 0                                                       # break loop, if no strategy was found (we are already optimal)
            break
        end

        # select (infective) strategy, update delta, and inject mutants
        delta = 1.0
        if t[i] > prec                                                  # selection: pure strategy i is infective
            num = -t[i]                                                 # numerator
            denom = Q[i,i] - 2 * Q_x[i] + p                             # denominator 
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            @. Q_x = delta * ( Q[:,i] - Q_x ) + Q_x
            @. x = delta * ( LinearAlgebra.I[1:n,i] - x ) + x              # inject mutants
        elseif (t[i] < -prec) && (x[i] > prec)                          # selection: co-strategy of i is infective
            k = x[i] / ( x[i]-1 )
            num = k * (-t[i])                                           # numerator
            denom = k^2 * ( Q[i,i] - 2 * Q_x[i] + p )                   # denominator
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            @turbo Q_x .= (k * delta) .* (Q[:,i] .- Q_x) .+ Q_x
            @turbo x .*= 1 - (k * delta)
            x[i] += (k * delta)
        else                                                            # selection: population x is immune against all mutants (=nash equilibrium)
            Q_x = Q * x
        end                

        # update current payoff, and temp vector
        p = x' * Q_x                                                      # current payoff x'Qx
        t = Q_x .- p                                                      # tempvector   

        # update violation of Nash condition (used as stopping criteria).
        epsilon=0
        for i in 1:n
            epsilon = epsilon + ( min( x[i], -t[i] ) )^2 
        end

        # optional: print current payoff, and Nash condition violation
        if verbose == 1
            cur_obj = 0.0
            if sense == :maximize
                cur_obj = round(p, digits=4)
            else
                cur_obj = -round(p, digits=4)
            end
            println("obj: $cur_obj \t\tepsilon: $epsilon")
        end
    end# while

    
    # set objective
    if sense == :maximize
        obj = p
    else
        obj = -p
    end

    # round output vector
    x = round.(x, digits=6)

   # @info "# errorcount / niterations: $errorcount / $niterations"

    return obj, x
end

# DUMMY FUNCTIONS
function createSymmetricMatrix(ndims::Int, type::String="IND")
    A = LinearAlgebra.Symmetric(rand(ndims,ndims)) * 10
    if type == "PSD"
        A = A*A'
    end
    if type == "NSD"
        A = -A*A'
    end

    return LinearAlgebra.Symmetric(A)
end

function createRandomStartingPoint(Q)
    n = size(Q)[1]
    x = rand(n)
    x = x ./ sum(x)
    return x
end

function btime_main(ndims, mtype, sense)
    Qs = createSymmetricMatrix(ndims, mtype)
    xstart = createRandomStartingPoint(Qs)       #optional random starting point

    Q = Array(Qs)
    # the performance of this function should be improved
    obj,x=nothing,nothing
    @btime begin
        obj, x = solve_StQP($Q, $sense)
    end
    #@time obj, x = solve_StQP(Q, sense; x=xstart)      #optional with different starting point

    # print results
    println("obj: $obj")
    #println("x: $x")
    return
end


function main(ndims, mtype, sense)
    Qs = createSymmetricMatrix(ndims, mtype)
    xstart = createRandomStartingPoint(Qs)       #optional random starting point

    Q = Array(Qs)
    # the performance of this function should be improved
    obj, x = solve_StQP(Q, sense)
    #@time obj, x = solve_StQP(Q, sense; x=xstart)      #optional with different starting point

    # print results
    println("obj: $obj")
    #println("x: $x")
    return
end




function btime_noarg_main()
    # set parameters
    ndims = 4000             # dimensions
    mtype = "PSD"           # matrix type "PSD" (positive semidefinite), "NSD" (negative semidefinite), "IND" (indefinite)
    sense = :minimize       # objective sense (:minimize or :maximize)
    Random.seed!(0)         # random seed
    btime_main(ndims, mtype, sense)
end

function noarg_main()
    # set parameters
    ndims = 4000            # dimensions
    mtype = "PSD"           # matrix type "PSD" (positive semidefinite), "NSD" (negative semidefinite), "IND" (indefinite)
    sense = :minimize       # objective sense (:minimize or :maximize)
    Random.seed!(0)         # random seed
    main(ndims, mtype, sense)
end

mike_k · August 21, 2023, 11:31am

Let me cordially thank you all, for your valuable inputs. I was not aware that one can prevent memory allocation by using .= and @views. I achieved the best performance with a combination of the approaches suggested by @mikmoore and @LaurentPlagne. For n=5000 we have:

initial version: 1.159 s (92646 allocations: 1.84 GiB)
current version: 365.054 ms (13 allocations: 190.93 MiB)

on my old laptop.

I like the idea of supporting more data types than Float64 which is unnecessarily limiting. I read this blog regarding type stability, and understand that the optional keyword argument x=ones(Float64,...) causes trouble when set to x=ones(AbstractFloat,...). Can you please point me to other potential sources of type instabilities?

Here is the current MWE:

import LinearAlgebra: Symmetric, eigvals
import Random: seed!
import LoopVectorization: @turbo
import BenchmarkTools: @btime

# set parameters
ndims = 4000            # dimensions
mtype = "PSD"           # matrix type "PSD" (positive semidefinite), "NSD" (negative semidefinite), "IND" (indefinite)
sense = :minimize       # objective sense (:minimize or :maximize)
seed!(0)                # random seed


# current best version
@views function solveStQP_current_best_version(Q, sense::Symbol; tol::Float64=0.000001, x=ones(Float64,size(Q)[1]) ./ size(Q)[1], verbose=0, epssize=5)       
    verbose=0
    if sense == :minimize
        Q = -Q
    end
    if verbose==1
        println("starting pointxx: ", x)
    end
    
    #start vectors and matrices
    Q_x::Vector{Float64}    = Q * x             # used to avoid excessive number of multiplications later in loop (this vector will be updated instead)
    p::Float64              = x' * Q_x          # current payoff  x'Qx                  
    t::Vector{Float64}      = Q_x .- p          # tempvector = (e^i - x | x) for all i    

    # define other start values
    n::Int              = size(Q)[1]            # dimension of decision vector
    nash_error::Float64 = Inf                   # measures violation of Nash condition. If nash_error = 0, we have a Nash equilibrium.
    obj::Float64        = 0.0                   # objective value
    prec::Float64       = epssize * eps(Float64)# obj epsilon surrounding around zero (to avoid numerical instabilities)
    i::Int              = 0                     # index of infective pure / or co-strategy
    tempval::Float64    = 0.0                   # temp value used for identifying the largest abs(t[i]), thus, best direction i  
    delta::Float64      = 1.0                   # fraction of mutants that are infective
    niterations::Int    = 0                     # number of iterations
    num::Float64        = 0.0                   # dummy numerator
    denom::Float64      = 1.0                   # dummy denominator
    k::Float64          = 0.0                   # factor needed later

    #start loop
    while nash_error > tol  
        niterations += 1         
    
        # reset values
        i       = 0                                                        # index of pure / or co strategy
        tempval = 0                                                        # temp value used for identifying the largest abs(t[i]), thus, direction (or pure/co-strategy) i                                        
            
        # find most infective pure (co-)strategy i
        for j in eachindex(x)
            if (t[j] > prec) && (tempval < t[j] - prec)
                i = j
                tempval = t[i]
            elseif (t[j] < -prec) && (x[j] > prec) && (tempval < abs(t[j] - prec))
                i = j
                tempval = abs(t[i])
            else                         
                # do not update i
            end
        end
            
        if i == 0                                                       # break loop, if no strategy was found (we are already optimal)
            break
        end

        # select (infective) strategy, update delta, and inject mutants
        delta = 1.0
        if t[i] > prec                                                  # selection: pure strategy i is infective
            num = -t[i]                                                 # numerator
            denom = Q[i,i] - 2 * Q_x[i] + p                             # denominator 
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            @. Q_x = delta * ( Q[:,i] - Q_x ) + Q_x
            @. x = delta * ( (1:n .== i) - x ) + x              # inject mutants
        elseif (t[i] < -prec) && (x[i] > prec)                          # selection: co-strategy of i is infective
            k = x[i] / ( x[i]-1 )
            num = k * (-t[i])                                           # numerator
            denom = k^2 * ( Q[i,i] - 2 * Q_x[i] + p )                   # denominator
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            @turbo Q_x .= (k * delta) .* (Q[:,i] .- Q_x) .+ Q_x
            @turbo x .*= 1 - (k * delta)
            x[i] += (k * delta)
        else                                                            # selection: population x is immune against all mutants (=nash equilibrium)
            mul!(Q_x, Q, x)                                             # equivalent to Q_x = Q * x
        end                

        # update current payoff, and temp vector
        p = x' * Q_x                                                      # current payoff x'Qx
        @. t = Q_x - p                                                      # tempvector   

        # update violation of Nash condition (used as stopping criteria).
        nash_error=0
        for i in eachindex(x)
            nash_error = nash_error + ( min( x[i], -t[i] ) )^2 
        end

        # optional: print current payoff, and Nash condition violation
        if verbose == 1
            cur_obj = 0.0
            if sense == :maximize
                cur_obj = round(p, digits=4)
            else
                cur_obj = -round(p, digits=4)
            end
            println("obj: $cur_obj \t\nash_error: $nash_error")
        end
    end# while

    
    # set objective
    if sense == :maximize
        obj = p
    else
        obj = -p
    end

    # round output vector
    x = round.(x, digits=6)

    return obj, x
end

# DUMMY FUNCTIONS
function createSymmetricMatrix(ndims::Int, type::String="IND")
    A = Symmetric(rand(ndims,ndims)) * 10
    if type == "PSD"
        A = A*A'
    end
    if type == "NSD"
        A = -A*A'
    end

    return Symmetric(A)
end

function createRandomStartingPoint(Q)
    n = size(Q)[1]
    x = rand(n)
    x = x ./ sum(x)
    return x
end

# MAIN
function main(ndims, mtype, sense)
    seed!(0)
    Q = createSymmetricMatrix(ndims, mtype)

    # start function
    Q = Array(Q)
    @btime begin
       obj, x = solveStQP_current_best_version($Q, $sense)
       println("obj: $obj")
    end

    #optional: start from different starting point xstart
    #xstart = createRandomStartingPoint(Q)    
    #@time obj, x = solveStQP_current_best_version(Q, sense; x=xstart) 

    return
end
main(ndims, mtype, sense)

LaurentPlagne · August 21, 2023, 12:52pm

Hi, glad that it helps !

As @mikmoore wrote, you don’t have to annotate the type of your variables in the function since
they can be infered by Julia.

Some care must be taken for constant values like 0,1 and Inf that you may initialize like:

# current best version
@views function solveStQP_current_best_version(Q, sense::Symbol; tol=0.000001, x=nothing, verbose=0, epssize=5)  
    T = eltype(Q) # get the type of Q matrix element
    tol = T(tol)  # !!! Note that, for some reason, the default tol is too small for FP32 
    if x===nothing
        x = ones(T,size(Q)[1]) ./ size(Q)[1]
    end
    verbose=0
    if sense == :minimize
        Q = -Q # !!! This cost some time : if Q is not to be reuse that this could be replaced by Q .* = -one(T)
    end

   ....

    delta      = one(T)                   # fraction of mutants that are infective
    nash_error = typemax(T)                   # measures violation of Nash condition. If nash_error = 0, we have a Nash equilibrium.
    prec       = epssize * eps(T)     # obj epsilon surrounding around zero (to avoid numerical instabilities)
    ....                                        # index of pure / or co strategy
   tempval = zero(T)

generic version


import LinearAlgebra: Symmetric, eigvals
using Random
import Random: seed!
import LoopVectorization: @turbo
import BenchmarkTools: @btime


# current best version
@views function solveStQP_previous_best_version(Q, sense::Symbol; tol::Float64=0.000001, x=ones(Float64,size(Q)[1]) ./ size(Q)[1], verbose=0, epssize=5)       
    verbose=0
    if sense == :minimize
        Q = -Q
    end
    if verbose==1
        println("starting pointxx: ", x)
    end
    
    #start vectors and matrices
    Q_x::Vector{Float64}    = Q * x             # used to avoid excessive number of multiplications later in loop (this vector will be updated instead)
    p::Float64              = x' * Q_x          # current payoff  x'Qx                  
    t::Vector{Float64}      = Q_x .- p          # tempvector = (e^i - x | x) for all i    

    # define other start values
    n::Int              = size(Q)[1]            # dimension of decision vector
    nash_error::Float64 = Inf                   # measures violation of Nash condition. If nash_error = 0, we have a Nash equilibrium.
    obj::Float64        = 0.0                   # objective value
    prec::Float64       = epssize * eps(Float64)# obj epsilon surrounding around zero (to avoid numerical instabilities)
    i::Int              = 0                     # index of infective pure / or co-strategy
    tempval::Float64    = 0.0                   # temp value used for identifying the largest abs(t[i]), thus, best direction i  
    delta::Float64      = 1.0                   # fraction of mutants that are infective
    niterations::Int    = 0                     # number of iterations
    num::Float64        = 0.0                   # dummy numerator
    denom::Float64      = 1.0                   # dummy denominator
    k::Float64          = 0.0                   # factor needed later

    #start loop
    while nash_error > tol  
        niterations += 1         
    
        # reset values
        i       = 0                                                        # index of pure / or co strategy
        tempval = 0                                                        # temp value used for identifying the largest abs(t[i]), thus, direction (or pure/co-strategy) i                                        
            
        # find most infective pure (co-)strategy i
        for j in eachindex(x)
            if (t[j] > prec) && (tempval < t[j] - prec)
                i = j
                tempval = t[i]
            elseif (t[j] < -prec) && (x[j] > prec) && (tempval < abs(t[j] - prec))
                i = j
                tempval = abs(t[i])
            else                         
                # do not update i
            end
        end
            
        if i == 0                                                       # break loop, if no strategy was found (we are already optimal)
            break
        end

        # select (infective) strategy, update delta, and inject mutants
        delta = 1.0
        if t[i] > prec                                                  # selection: pure strategy i is infective
            num = -t[i]                                                 # numerator
            denom = Q[i,i] - 2 * Q_x[i] + p                             # denominator 
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            @. Q_x = delta * ( Q[:,i] - Q_x ) + Q_x
            @. x = delta * ( (1:n .== i) - x ) + x              # inject mutants
        elseif (t[i] < -prec) && (x[i] > prec)                          # selection: co-strategy of i is infective
            k = x[i] / ( x[i]-1 )
            num = k * (-t[i])                                           # numerator
            denom = k^2 * ( Q[i,i] - 2 * Q_x[i] + p )                   # denominator
            if denom < -prec
                delta = min(num / denom, 1)                             # update delta
            end
            @turbo Q_x .= (k * delta) .* (Q[:,i] .- Q_x) .+ Q_x
            @turbo x .*= 1 - (k * delta)
            x[i] += (k * delta)
        else                                                            # selection: population x is immune against all mutants (=nash equilibrium)
            mul!(Q_x, Q, x)                                             # equivalent to Q_x = Q * x
        end                

        # update current payoff, and temp vector
        p = x' * Q_x                                                      # current payoff x'Qx
        @. t = Q_x - p                                                      # tempvector   

        # update violation of Nash condition (used as stopping criteria).
        nash_error=0
        for i in eachindex(x)
            nash_error = nash_error + ( min( x[i], -t[i] ) )^2 
        end

        # optional: print current payoff, and Nash condition violation
        if verbose == 1
            cur_obj = 0.0
            if sense == :maximize
                cur_obj = round(p, digits=4)
            else
                cur_obj = -round(p, digits=4)
            end
            println("obj: $cur_obj \t\nash_error: $nash_error")
        end
    end# while

    
    # set objective
    if sense == :maximize
        obj = p
    else
        obj = -p
    end

    # round output vector
    x = round.(x, digits=6)

    return obj, x
end

# current best version
@views function solveStQP_current_best_version(Q, sense::Symbol; tol=0.000001, x=nothing, verbose=0, epssize=5)  
    T = eltype(Q) # get the type of Q matrix element
    tol = T(tol)
    if x===nothing
        x = ones(T,size(Q)[1]) ./ size(Q)[1]
    end
    verbose=0
    if sense == :minimize
        Q = -Q # This cost some time : if Q is not to be reuse that this could be replaced by Q .* = -one(T)
    end
    if verbose==1
        println("starting pointxx: ", x)
    end

    #start vectors and matrices
    Q_x   = Q * x             # used to avoid excessive number of multiplications later in loop (this vector will be updated instead)
    p     = x' * Q_x          # current payoff  x'Qx                  
    t     = Q_x .- p          # tempvector = (e^i - x | x) for all i    

    # define other start values
    n            = size(Q)[1]            # dimension of decision vector
    nash_error = typemax(T)                   # measures violation of Nash condition. If nash_error = 0, we have a Nash equilibrium.
    prec       = epssize * eps(T)     # obj epsilon surrounding around zero (to avoid numerical instabilities)
    delta      = one(T)                   # fraction of mutants that are infective
    niterations    = 0                     # number of iterations

    #start loop
    while nash_error > tol  
        niterations += 1         
    
        # reset values
        i       = 0                                                        # index of pure / or co strategy
        tempval = zero(T)                                                        # temp value used for identifying the largest abs(t[i]), thus, direction (or pure/co-strategy) i                                        
            
        # find most infective pure (co-)strategy i
        for j in eachindex(x)
            if (t[j] > prec) && (tempval < t[j] - prec)
                i = j
                tempval = t[i]
            elseif @inbounds (t[j] < -prec) && (x[j] > prec) && (tempval < abs(t[j] - prec))
                i = j
                tempval = abs(t[i])
            else                         
                # do not update i
            end
        end
            
        i == 0 && break                                                       # break loop, if no strategy was found (we are already optimal)

        # select (infective) strategy, update delta, and inject mutants
        delta = one(T)
        if t[i] > prec                                                  # selection: pure strategy i is infective
            num = -t[i]                                                 # numerator
            denom = Q[i,i] - 2 * Q_x[i] + p                             # denominator 
            if denom < -prec
                delta = min(num / denom, one(T))                             # update delta
            end
            @. Q_x = delta * ( Q[:,i] - Q_x ) + Q_x
            @. x = delta * ( (1:n .== i) - x ) + x              # inject mutants
        elseif (t[i] < -prec) && (x[i] > prec)                          # selection: co-strategy of i is infective
            k = x[i] / ( x[i] - one(T) )
            num = k * (-t[i])                                           # numerator
            denom = k^2 * ( Q[i,i] - 2 * Q_x[i] + p )                   # denominator
            if denom < -prec
                delta = min(num / denom, one(T))                             # update delta
            end
            @turbo @. Q_x = (k * delta) * (Q[:,i] - Q_x) + Q_x
            @turbo @. x *= one(T) - (k * delta)
            x[i] += (k * delta)
        else                                                            # selection: population x is immune against all mutants (=nash equilibrium)
            mul!(Q_x, Q, x)                                             # equivalent to Q_x = Q * x
        end                

        # update current payoff, and temp vector
        p = x' * Q_x                                                      # current payoff x'Qx
        @. t = Q_x - p                                                      # tempvector   

        # update violation of Nash condition (used as stopping criteria).
        nash_error = zero(T) 
        for i in eachindex(x)
            nash_error = nash_error + ( min( x[i], -t[i] ) )^2 
        end

        # optional: print current payoff, and Nash condition violation
        if verbose == 1
            cur_obj = zero(T)
            if sense == :maximize
                cur_obj = round(p, digits=4)
            else
                cur_obj = -round(p, digits=4)
            end
            println("obj: $cur_obj \t\nash_error: $nash_error")
        end
    end# while

    @show niterations
    # set objective
    obj = sense == :maximize ? p : -p
    # round output vector
    x = round.(x, digits=6)

    return obj, x
end

# DUMMY FUNCTIONS
function createSymmetricMatrix(T,ndims::Int, type::String="IND")
    A = Symmetric(rand(T,ndims,ndims)) * T(10)
    if type == "PSD"
        A = A*A'
    end
    if type == "NSD"
        A = -A*A'
    end

    return Symmetric(A)
end

function createRandomStartingPoint(Q)
    n = size(Q)[1]
    x = rand(n)
    x = x ./ sum(x)
    return x
end

# MAIN
function main(ndims, mtype, sense,T)
    seed!(0)
    Q = createSymmetricMatrix(T,ndims, mtype)

    # start function
    Q = Array(Q)
    @time begin
       obj, x = solveStQP_current_best_version(Q, sense)
       println("obj: $obj")
    end

    #optional: start from different starting point xstart
    #xstart = createRandomStartingPoint(Q)    
    #@time obj, x = solveStQP_current_best_version(Q, sense; x=xstart) 

    return
end


function btime_main(ndims, mtype, sense)
    Qs = createSymmetricMatrix(ndims, mtype)
    xstart = createRandomStartingPoint(Qs)       #optional random starting point

    Q = Array(Qs)
    # the performance of this function should be improved
    obj,x=nothing,nothing
    @btime begin
        obj, x = solve_StQP($Q, $sense)
    end
    #@time obj, x = solve_StQP(Q, sense; x=xstart)      #optional with different starting point

    # print results
    println("obj: $obj")
    #println("x: $x")
    return
end



function noarg_main()
    # set parameters
    ndims = 4000            # dimensions
    mtype = "PSD"           # matrix type "PSD" (positive semidefinite), "NSD" (negative semidefinite), "IND" (indefinite)
    sense = :minimize       # objective sense (:minimize or :maximize)
    Random.seed!(0)         # random seed
    main(ndims, mtype, sense,Float64)
end