C++ code much faster than Julia how can I optimize it?

pabloferz · October 9, 2022, 5:43pm

Here’s a more faithful translation of the C++ code:

KMeans1D.jl

module Kmeans1D


function smawk(nrows, ncols, lookup, k)
    result = Vector{Int32}(undef, nrows)
    rows = collect(Int32.(1:nrows))
    cols = collect(Int32.(1:ncols))
    smawk!(result, rows, cols, lookup, k)
end

function smawk!(result, rows, cols, lookup, k)
    length(rows) == 0 && return result

    ## REDUCE
    #
    _cols = Int32[]
    for col in cols
        @inbounds while true
            length(_cols) == 0 && break
            row = rows[length(_cols)]
            if lookup(k, row, col) >= lookup(k, row, _cols[end])
                break
            end
            pop!(_cols)
        end
        if length(_cols) < length(rows)
            push!(_cols, col)
        end
    end

    # Call recursively on odd-indexed rows
    odd_rows = Int32[]
    @inbounds for i in 2:2:length(rows)
        push!(odd_rows, rows[i])
    end

    smawk!(result, odd_rows, _cols, lookup, k)

    col_idx_lookup = Dict{Int32, Int32}()
    @inbounds for idx in 1:length(_cols)
        col_idx_lookup[_cols[idx]] = idx
    end

    ## INTERPOLATE

    # Fill-in even-indexed rows
    start = 1
    @inbounds for r in 1:2:length(rows)
        row = rows[r]
        stop = length(_cols) - 1
        if r < (length(rows) - 1)
            stop = col_idx_lookup[result[rows[r + 1]]]
        end
        argmin = _cols[start]
        min = lookup(k, row, argmin)
        for c in start+1:stop+1
            value = lookup(k, row, _cols[c])
            if (c == start) || (value < min)
                argmin = _cols[c]
                min = value
            end
        end
        result[row] = argmin
        start = stop
    end

    return result
end

struct CostCalculator
    cumsum::Vector{Float32}
    cumsum2::Vector{Float32}

    function CostCalculator(array, n::Integer)
        cumsum = Float32[0.0]
        cumsum2 = Float32[0.0]
        @inbounds for i in 1:n
            x = array[i]
            push!(cumsum, x + cumsum[i])
            push!(cumsum2, x * x + cumsum2[i])
        end
        return new(cumsum, cumsum2)
    end
end

function calc(cc::CostCalculator, i, j)
    if j < i
        return zero(eltype(cc.cumsum))
    end

    mu = (cc.cumsum[j + 1] - cc.cumsum[i]) / (j - i + 1)
    result = cc.cumsum2[j + 1] - cc.cumsum2[i]
    result += (j - i + 1) * (mu * mu)
    result -= (2 * mu) * (cc.cumsum[j + 1] - cc.cumsum[i])

    return result
end


struct LookUp
    calculator::CostCalculator
    D::Matrix{Float32}
end

function (lu::LookUp)(k, i, j)
    col = min(i, j - 1)
    if col == 0
        col = size(lu.D, 2) + col
    end
    return lu.D[k - 1, col] + calc(lu.calculator, j, i)
end

function cluster(array, k)
    n = length(array)
    return cluster(array, n, min(k, n))
end

function cluster(array, n, k)
    # Sort input array and save info for de-sorting
    sort_idx = sortperm(array)
    undo_sort_lookup = Vector{Int32}(undef, n)
    sorted_array = Vector{Float32}(undef, n)

    @inbounds for i in 1:n
        sorted_array[i] = array[sort_idx[i]]
        undo_sort_lookup[sort_idx[i]] = i
    end

    #Set D and T using dynamic programming algorithm
    cost_calculator = CostCalculator(sorted_array, n)
    D = Matrix{Float32}(undef, k, n)
    T = Matrix{Int32}(undef, k, n)
    lookup = LookUp(cost_calculator, D)

    @inbounds for i in 1:n
        D[1, i] = calc(cost_calculator, 1, i)
        T[1, i] = 1
    end

    for k_ in 2:k
        row_argmins = smawk(n, n, lookup, k_)
        @inbounds for i in 1:n
            argmin = row_argmins[i]
            min = lookup(k_, i, argmin)
            D[k_, i] = min
            T[k_, i] = argmin
        end
    end

    #Extract cluster assignments by backtracking
    centroids = zeros(k)
    sorted_clusters = Vector{Int32}(undef, n)
    t = n + 1
    k_ = k
    n_ = n

    @inbounds while t > 1
        t_ = t
        t = T[k_, n_]
        centroid = 0.0
        for i in t:t_-1
            sorted_clusters[i] = k_
            centroid += (sorted_array[i] - centroid) / (i - t + 1)
        end
        centroids[k_] = centroid
        k_ -= 1
        n_ = t - 1
    end

    clusters = Vector{Int32}(undef, n)
    @inbounds for i in 1:n
        clusters[i] = sorted_clusters[undo_sort_lookup[i]]
    end

    return centroids, clusters
end


end  # module

As others have mentioned, there are issues regarding the type instability of calc and the use of some globals, but the main issue is that your CostCalculator has abstract fields Array{Float32} as opposed Vector{Float32} (AKA Array{Float32, 1}), so julia cannot really optimize the code paths involving CostCalculator.

With the code above I get around 5 ms.

Topic		Replies	Views
Optimizing Calculation in Julia compared to C (New to Julia) Performance	25	2528	January 1, 2020
Julia not 220 times faster than C++ General Usage question , performance	10	1266	May 5, 2022
Do C++ performance optimisations apply to Julia? Performance	28	2693	June 25, 2022
Making efficient some small subroutines Performance	22	1120	May 8, 2020
Blog post: How to optimise Julia code: A practical guide Community	27	3708	June 27, 2022

C++ code much faster than Julia how can I optimize it?

Related topics