C++ code much faster than Julia how can I optimize it?

Here’s a more faithful translation of the C++ code:

KMeans1D.jl
module Kmeans1D


function smawk(nrows, ncols, lookup, k)
    result = Vector{Int32}(undef, nrows)
    rows = collect(Int32.(1:nrows))
    cols = collect(Int32.(1:ncols))
    smawk!(result, rows, cols, lookup, k)
end

function smawk!(result, rows, cols, lookup, k)
    length(rows) == 0 && return result

    ## REDUCE
    #
    _cols = Int32[]
    for col in cols
        @inbounds while true
            length(_cols) == 0 && break
            row = rows[length(_cols)]
            if lookup(k, row, col) >= lookup(k, row, _cols[end])
                break
            end
            pop!(_cols)
        end
        if length(_cols) < length(rows)
            push!(_cols, col)
        end
    end

    # Call recursively on odd-indexed rows
    odd_rows = Int32[]
    @inbounds for i in 2:2:length(rows)
        push!(odd_rows, rows[i])
    end

    smawk!(result, odd_rows, _cols, lookup, k)

    col_idx_lookup = Dict{Int32, Int32}()
    @inbounds for idx in 1:length(_cols)
        col_idx_lookup[_cols[idx]] = idx
    end

    ## INTERPOLATE

    # Fill-in even-indexed rows
    start = 1
    @inbounds for r in 1:2:length(rows)
        row = rows[r]
        stop = length(_cols) - 1
        if r < (length(rows) - 1)
            stop = col_idx_lookup[result[rows[r + 1]]]
        end
        argmin = _cols[start]
        min = lookup(k, row, argmin)
        for c in start+1:stop+1
            value = lookup(k, row, _cols[c])
            if (c == start) || (value < min)
                argmin = _cols[c]
                min = value
            end
        end
        result[row] = argmin
        start = stop
    end

    return result
end

struct CostCalculator
    cumsum::Vector{Float32}
    cumsum2::Vector{Float32}

    function CostCalculator(array, n::Integer)
        cumsum = Float32[0.0]
        cumsum2 = Float32[0.0]
        @inbounds for i in 1:n
            x = array[i]
            push!(cumsum, x + cumsum[i])
            push!(cumsum2, x * x + cumsum2[i])
        end
        return new(cumsum, cumsum2)
    end
end

function calc(cc::CostCalculator, i, j)
    if j < i
        return zero(eltype(cc.cumsum))
    end

    mu = (cc.cumsum[j + 1] - cc.cumsum[i]) / (j - i + 1)
    result = cc.cumsum2[j + 1] - cc.cumsum2[i]
    result += (j - i + 1) * (mu * mu)
    result -= (2 * mu) * (cc.cumsum[j + 1] - cc.cumsum[i])

    return result
end


struct LookUp
    calculator::CostCalculator
    D::Matrix{Float32}
end

function (lu::LookUp)(k, i, j)
    col = min(i, j - 1)
    if col == 0
        col = size(lu.D, 2) + col
    end
    return lu.D[k - 1, col] + calc(lu.calculator, j, i)
end

function cluster(array, k)
    n = length(array)
    return cluster(array, n, min(k, n))
end

function cluster(array, n, k)
    # Sort input array and save info for de-sorting
    sort_idx = sortperm(array)
    undo_sort_lookup = Vector{Int32}(undef, n)
    sorted_array = Vector{Float32}(undef, n)

    @inbounds for i in 1:n
        sorted_array[i] = array[sort_idx[i]]
        undo_sort_lookup[sort_idx[i]] = i
    end

    #Set D and T using dynamic programming algorithm
    cost_calculator = CostCalculator(sorted_array, n)
    D = Matrix{Float32}(undef, k, n)
    T = Matrix{Int32}(undef, k, n)
    lookup = LookUp(cost_calculator, D)

    @inbounds for i in 1:n
        D[1, i] = calc(cost_calculator, 1, i)
        T[1, i] = 1
    end

    for k_ in 2:k
        row_argmins = smawk(n, n, lookup, k_)
        @inbounds for i in 1:n
            argmin = row_argmins[i]
            min = lookup(k_, i, argmin)
            D[k_, i] = min
            T[k_, i] = argmin
        end
    end

    #Extract cluster assignments by backtracking
    centroids = zeros(k)
    sorted_clusters = Vector{Int32}(undef, n)
    t = n + 1
    k_ = k
    n_ = n

    @inbounds while t > 1
        t_ = t
        t = T[k_, n_]
        centroid = 0.0
        for i in t:t_-1
            sorted_clusters[i] = k_
            centroid += (sorted_array[i] - centroid) / (i - t + 1)
        end
        centroids[k_] = centroid
        k_ -= 1
        n_ = t - 1
    end

    clusters = Vector{Int32}(undef, n)
    @inbounds for i in 1:n
        clusters[i] = sorted_clusters[undo_sort_lookup[i]]
    end

    return centroids, clusters
end


end  # module

As others have mentioned, there are issues regarding the type instability of calc and the use of some globals, but the main issue is that your CostCalculator has abstract fields Array{Float32} as opposed Vector{Float32} (AKA Array{Float32, 1}), so julia cannot really optimize the code paths involving CostCalculator.

With the code above I get around 5 ms.

9 Likes