`rev_cumsum_exp!()` optimization

Another weird operation to be optimized:

function rev_cumsum_exp!(A,B,source)
    # This is equivalent to : 
    #       A .= exp(source)
    #       B .= 1 ./ reverse(cumsum(reverse(A))))
    s = zero(eltype(A))
    for j in length(source):-1:1
        A[j] = exp(source[j])
        s += A[j]
        B[j] = inv(s)
    end
    return nothing
end

# Sample data: 
using BenchmarkTools
N=10_000
source, A, B = randn(N), zeros(N), zeros(N)
@benchmark rev_cumsum_exp!(A,B,source)

This yields:

BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):   85.600 μs … 737.400 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):      96.600 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   107.616 μs ±  32.025 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅ █▂ ▇  ▆▁ ▁▄   ▅                       ▂▁   ▁▁           ▂   ▁
  █▆██▇██▇██▇██▇▅▆█▇▆▆▇█▅▅▆▅▆▄▅▅▄▅▂▅▄▅▄▅▆▅██▇▇▇███▇▆▄▆▇▅▆▅█▇█▆▆ █
  85.6 μs       Histogram: log(frequency) by time        211 μs <

 Memory estimate: 0 bytes, allocs estimate: 0.

But If i try to profile it:

N=100_000_000
source, A, B = randn(N), zeros(N), zeros(N)
@profview rev_cumsum_exp!(A,B,source)

I get:

Is there still something to be done or am I doomed ? I need both the A and B outputs to be produced..

You could turn off bounds checking; this gives a 10% speed-up on my device for your specific example.

1 Like

Maybe there is some approximation trick which can be used (e.g. using exp(x) ≈ 1+x). In any case, that would require more context on the problem.
Can you give a sample of the values which would go into source?

You are right. Here is a full example, where I highlighted as a function the sensitive operation (eating 90% of my runtime). Its a mix between the code we were looking at here and the one from the previous thread, where i ended up using your solution.

using RDatasets
using LinearAlgebra
using BenchmarkTools
using StatsBase

# The dataset: 
const colon = let
    df = dataset("survival","colon")
    df = dropmissing(df, [:Nodes, :Differ])
    
    T = Float64.(df.Time)
    Δ = Int64.(df.Status)
    X = Float64.(hcat(df.Sex, df.Age, df.Obstruct,
    df.Perfor, df.Adhere, df.Nodes,
    df.Differ, df.Extent, df.Surg,
    df.Node4, df.EType))
    (T,Δ,X)
end

struct Cox
    X::Matrix{Float64}
    T::Vector{Float64}
    Δ::Vector{Bool}
    sX::Vector{Float64}
    G::Vector{Float64}
    η::Vector{Float64}
    B::Vector{Float64}
    I::Vector{Int64}
    J::Vector{Int64}
    _A::Vector{Float64}
    _B::Vector{Float64}
    function Cox(T,Δ,X)

        # Allocate: 
        n,m = size(X)
        G, B  = zeros(m), zeros(m)
        η, _A, _B  = zeros(n), zeros(n), zeros(n)

        # Precompute a few things: 
        # This should also be optimized, its taking a lot of time right now..
        o = sortperm(T)
        To = T[o]
        Δo = Bool.(Δ[o])
        Xo = X[o,:]
        sX = X'Δ
        Ro = StatsBase.competerank(To)
        I = Ro[Δo]

        # J is a form of ranks, filtered on Δ. Did not really found a function to do it. 
        J = zeros(Int64,n)
        for j in 1:n
            if Δo[j]
                for i in 1:n
                    J[i] += (To[i] >= To[j])
                end
            end
        end
        
        # Compute hessian bounds:
        for l in 1:m # for each dimension. 
            lastj = n+1
            Mₓ = Xo[end,l]
            mₓ = Xo[end,l]
            for i in n:-1:1
                for j in lastj-1:-1:Ro[i]
                    Mₓ = max(Xo[j,l], Mₓ)
                    mₓ = min(Xo[j,l], mₓ)
                end
                lastj = Ro[i]
                B[l] += (1/4) * Δ[i] * (Mₓ-mₓ)^2
            end
        end

        # Instantiate: 
        new(Xo, To, Δo, sX, G, η, B, I, J, _A, _B)
    end
end

function most_of_my_runtime!(A,B,η,I,J)
    # This is equivalent to : 
    #   A .= exp.(η)
    #   B .= 1 ./ reverse(cumsum(reverse(A)))
    #   A .*= cumsum(B[I])[J]
    # But 40% faster for the moment.

    aₖ = bₖ = cₖ = zero(eltype(A))
    n, lastj = length(η), 0
    local j

    @inbounds for k in n:-1:1
        aₖ = exp(η[k])
        A[k] = aₖ
        bₖ += aₖ
        B[k] = bₖ
    end
    @inbounds for k in 1:n
        for outer j in lastj+1:J[k]
            cₖ += inv(B[I[j]])
        end
        lastj = j
        A[k] *= cₖ
    end
end

function update!(β, M::Cox)
    mul!(M.η, M.X, β)
    most_of_my_runtime!(M._A, M._B, M.η, M.I, M.J)
    mul!(M.G, M.X', M._A)
    M.G .= M.sX .- M.G
    β .+= M.G ./ M.B 
    return nothing
end

function getβ(M::Cox; max_iter = 10000, tol = 1e-6)

    β = zeros(size(M.X, 2))
    βᵢ = similar(β)
    for i in 1:max_iter
        βᵢ .= β
        update!(β, M) 
        gap = norm(βᵢ - β)
        # println(gap)
        if gap < tol
            break
        end
    end
    return β
end

M = Cox(colon...)
getβ(M)
@profview [getβ(M) for i in 1:100]

The profview I have looks like:

I thought about trying to intertwine the two loops, but i was not able to find a better version yet.

Where are you using M._B (after its calculation) ?
If it is used only later (as some dispersion measure), perhaps it can be calculated only in final update!.

It is not used later, only M._A is used. M._B is temporary storage.

1 Like

On my machine the original code produces

julia> @btime rev_cumsum_exp!($A,$B,$source)
  20.600 μs (0 allocations: 0 bytes)

I then used LoopVectorization to speed up the exponential calculation:

using LoopVectorization
function rev_cumsum_exp2!(A,B,source)
    @tturbo for j in eachindex(A,source)
        A[j] = exp(source[j])
    end
    s = zero(eltype(A))
    for j in Iterators.reverse(eachindex(A,B))
        s += A[j]
        B[j] = inv(s)
    end
    return nothing
end

This produces:

julia> A2, B2 = similar(A), similar(B);

julia> @btime rev_cumsum_exp2!($A2, $B2, $source) 
  8.500 μs (0 allocations: 0 bytes)

for a net speedup of 20.6/8.5 or approximately 2.4x. Checking that the results are correct:

julia> A2 ≈ A
true

julia> B2 ≈ B
true

Details of my setup:

julia> versioninfo()
Julia Version 1.11.5
Commit 760b2e5b73 (2025-04-14 06:53 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 28 × Intel(R) Core(TM) i7-14700
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, alderlake)
Threads: 28 default, 0 interactive, 14 GC (on 28 virtual cores)
Environment:
  JULIA_EDITOR = "C:\Program Files\Microsoft VS Code\Code.exe"
1 Like

Whats the purpose of the extra loop? Can’t you just do

function rev_cumsum_exp2!(B, source)
    s = zero(eltype(B))
    @tturbo for j in reverse(eachindex(B, source))
        s += exp(source[j]) 
        B[j] = inv(s)
    end
    return nothing
end

The first loop computes the exponentials as was done in the originally posted code

Perhaps the following is faster:

function most_of_my_runtime!(A,B,η,I,J)
    aₖ = bₖ = cₖ = zero(eltype(A))

    @tturbo for i in eachindex(η)
        bₖ = exp(η[i])
        B[i] = bₖ
        aₖ += bₖ
    end

    local j
    lastj, lasti = 0, 1
    @inbounds for k in eachindex(η)
        for outer j in lastj+1:J[k]
            if I[j] > lasti
                for i in lasti:I[j]-1
                    aₖ -= B[i]
                end
                lasti = I[j]
            end
            cₖ += inv(aₖ)
        end
        lastj = j
        A[k] = B[k]*cₖ
    end
    return
end

This is a strange cumsum! the trick here is to exploit the monotonicity of both I and J to save on some work.

1 Like

So @Dan you exploited the fact that reverse(cumsum(reverse(A))) == sum(A) .- cumsum([0,A[1:end-1]...])… That is smart indeed. I thought about it, but then since the next cumsum has to happen in the reverse order to the first, I do not thing there is the possibility to do it in one pass other the dataset.

I ended up moving the crazy indexing logic to the constructor, to have it out of the hot loop, and realizing that each B[i] inside a k-adding-group are in fact all equal. Take a look at this version :

Complete code
using RDatasets
using LinearAlgebra
using BenchmarkTools
using StatsBase

# The dataset: 
const colon = let
    df = dataset("survival","colon")
    df = dropmissing(df, [:Nodes, :Differ])
    
    T = Float64.(df.Time)
    Δ = Int64.(df.Status)
    X = Float64.(hcat(df.Sex, df.Age, df.Obstruct,
    df.Perfor, df.Adhere, df.Nodes,
    df.Differ, df.Extent, df.Surg,
    df.Node4, df.EType))
    (T,Δ,X)
end

struct Cox
    X::Matrix{Float64}
    T::Vector{Float64}
    Δ::Vector{Bool}
    sX::Vector{Float64}
    G::Vector{Float64}
    η::Vector{Float64}
    B::Vector{Float64}
    K::Vector{Int64}
    _A::Vector{Float64}
    _B::Vector{Float64}
    function Cox(T,Δ,X)

        # Allocate: 
        n,m = size(X)
        G, B  = zeros(m), zeros(m)
        η, _A, _B  = zeros(n), zeros(n), zeros(n)

        # Precompute a few things: 
        # This should also be optimized, its taking a lot of time right now..
        o = sortperm(T)
        To = T[o]
        Δo = Bool.(Δ[o])
        Xo = X[o,:]
        sX = X'Δ
        
        # Indexing hell for I,J and now K
        Ro = StatsBase.competerank(To)
        I = Ro[Δo]
        J = zeros(Int64,n)
        for rⱼ in I
            J[rⱼ] += 1
        end
        for j in 2:n
            J[j] += J[j-1]
        end

        K = Int64[]
        Jₖ₋₁ = 0
        for Jₖ in J
            push!(K,length((Jₖ₋₁+1):Jₖ))
            Jₖ₋₁ = Jₖ
        end
        
        # Compute hessian bounds:
        for l in 1:m # for each dimension. 
            lastj = n+1
            Mₓ = Xo[end,l]
            mₓ = Xo[end,l]
            Bₗ = 0.0
            for i in n:-1:1
                for j in Ro[i]:(lastj-1)
                    Mₓ = max(Xo[j,l], Mₓ)
                    mₓ = min(Xo[j,l], mₓ)
                end
                lastj = Ro[i]
                Bₗ += (1/4) * Δ[i] * (Mₓ-mₓ)^2
            end
            B[l] = Bₗ
        end

        # Instantiate: 
        new(Xo, To, Δo, sX, G, η, B, K, _A, _B)
    end
end

function most_of_my_runtime!(A,B,η,K)
    aₖ = bₖ = cₖ = zero(eltype(A))
    n = length(η)
    
    @inbounds for k in n:-1:1
        aₖ = exp(η[k])
        A[k] = aₖ
        bₖ += aₖ
        B[k] = bₖ
    end
    @inbounds for (k, (nₖ, bₖ)) in enumerate(zip(K,B))
        cₖ += nₖ / bₖ
        A[k] *= cₖ
    end
end

function most_of_my_runtime_V2!(A,B,η,K)
    ######### Equivalent to : 
    ####    A .= exp.(η)
    ####    B .= reverse(cumsum(reverse(A)))
    ####    A .*= cumsum(K ./ B)
    ####
    #### Remark that K is half full of zeros
    ################################

    aₖ = bₖ = cₖ = zero(eltype(A))
    n = length(η)
    
    @inbounds for k in n:-1:1
        aₖ = exp(η[k])
        A[k] = aₖ
        bₖ += aₖ
        B[k] = bₖ
    end
    @inbounds for (k, (nₖ, bₖ)) in enumerate(zip(K,B))
        (nₖ > 0) && (cₖ += nₖ / bₖ)
        A[k] *= cₖ
    end
end

function update!(β, M::Cox)
    mul!(M.η, M.X, β)
    most_of_my_runtime_V2!(M._A, M._B, M.η, M.K)
    mul!(M.G, M.X', M._A)
    M.G .= M.sX .- M.G
    β .+= M.G ./ M.B 
    return nothing
end

function getβ(M::Cox; max_iter = 10000, tol = 1e-6)

    β = zeros(size(M.X, 2))
    βᵢ = similar(β)
    for i in 1:max_iter
        βᵢ .= β
        update!(β, M) 
        gap = norm(βᵢ - β)
        # println(gap)
        if gap < tol
            break
        end
    end
    return β
end

M = Cox(colon...)
getβ(M)
@profview [getβ(M) for i in 1:100]

Maybe some better structure for K could be found since 60% of the n\_k are actually zeros..

1 Like

In what context are you using this function? In general, if you really care about the performance of a little loop like this, it’s often a sign that you should merge it with other calculations.

The “vectorized” style (ala Matlab, Python, …) where you do a bunch of little operations as separate loops, producing an intermediate array for each step, is often suboptimal from a performance perspective.

You can find the last version of the context in the hidden block of this response. I am trying to merge this loop with other loops around it, but its not really successfull right now ^^ Please, be my guest and take a look if you want to