C++ code much faster than Julia how can I optimize it?

And using a Vector{Int32} instead of a Dict gets it down to 2.1 seconds. So probably >5x faster than the C++?

Kmeans1d no Dict
module Kmeans1D

function smawk!(result, rows, cols, allocs, lookup, k)
    length(rows) == 0 && return result

    (; cols_alloc, odd_rows, col_idx_lookup) = allocs[1]
    ## REDUCE
    ncols = 0
    @inbounds for col in cols
        while true
            ncols == 0 && break
            row = rows[ncols]
            if lookup(k, row, col) >= lookup(k, row, cols_alloc[ncols])
                break
            end
            ncols -= 1
        end
        if ncols < length(rows)
            ncols += 1
            cols_alloc[ncols] = col
        end
    end
    _cols = view(cols_alloc, Base.OneTo(ncols))

    # Call recursively on odd-indexed rows
    @inbounds for i in 2:2:length(rows)
        odd_rows[i >> 1] = rows[i]
    end

    smawk!(result, odd_rows, _cols, view(allocs, 2:lastindex(allocs)), lookup, k)

    @inbounds for idx in 1:ncols
        col_idx_lookup[_cols[idx]] = idx
    end

    ## INTERPOLATE

    # Fill-in even-indexed rows
    start = 1
    @inbounds for r in 1:2:length(rows)
        row = rows[r]
        stop = length(_cols) - 1
        if r < (length(rows) - 1)
            stop = col_idx_lookup[result[rows[r + 1]]]
        end
        argmin = _cols[start]
        min = lookup(k, row, argmin)
        for c in start+1:stop+1
            value = lookup(k, row, _cols[c])
            if (c == start) || (value < min)
                argmin = _cols[c]
                min = value
            end
        end
        result[row] = argmin
        start = stop
    end

    return result
end

struct CostCalculator
    cumsum::Vector{Float32}
    cumsum2::Vector{Float32}

    function CostCalculator(array, n::Integer)
        cumsum = zeros(n+1)
        cumsum2 = zeros(n+1)
        @inbounds for i in 1:n
            x = array[i]
            cumsum[i+1] = x + cumsum[i]
            cumsum2[i+1] = x * x + cumsum2[i]
        end
        @assert length(cumsum) == length(cumsum2) == n + 1
        return new(cumsum, cumsum2)
    end
end

function calc(cc::CostCalculator, i, j)
    if j < i
        return zero(eltype(cc.cumsum))
    end

    @inbounds begin
        mu = (cc.cumsum[j + 1] - cc.cumsum[i]) / (j - i + 1)
        result = cc.cumsum2[j + 1] - cc.cumsum2[i]
        result += (j - i + 1) * (mu * mu)
        result -= (2 * mu) * (cc.cumsum[j + 1] - cc.cumsum[i])
    end

    return result
end


struct LookUp
    calculator::CostCalculator
    D::Matrix{Float32}
end

function (lu::LookUp)(k, i, j)
    col = min(i, j - 1)
    if col == 0
        col = size(lu.D, 2) + col
    end
    return @inbounds lu.D[k - 1, col] + calc(lu.calculator, j, i)
end

function cluster(array, k)
    n = length(array)
    return cluster(array, n, min(k, n))
end

function cluster(array, n, k)
    # Sort input array and save info for de-sorting
    sort_idx = sortperm(array)
    undo_sort_lookup = Vector{Int32}(undef, n)
    sorted_array = Vector{Float32}(undef, n)

    @inbounds for i in 1:n
        sorted_array[i] = array[sort_idx[i]]
        undo_sort_lookup[sort_idx[i]] = i
    end

    #Set D and T using dynamic programming algorithm
    cost_calculator = CostCalculator(sorted_array, n)
    D = Matrix{Float32}(undef, k, n)
    T = Matrix{Int32}(undef, k, n)
    lookup = LookUp(cost_calculator, D)

    @inbounds for i in 1:n
        D[1, i] = calc(cost_calculator, 1, i)
        T[1, i] = 1
    end

    row_argmins = Vector{Int32}(undef, n)
    rows = Int32.(1:n)
    cols = Int32.(1:n)
    allocs = NamedTuple{(:odd_rows,:cols_alloc,:col_idx_lookup),Tuple{Vector{Int32},Vector{Int32},Vector{Int32}}}[]
    l = length(rows)
    while true 
        l == 0 && break
        odd_rows = Vector{Int32}(2:2:l)
        cols_alloc = zeros(Int32, l)
        col_idx_lookup = Vector{Int32}(undef, length(rows))
        push!(allocs, (; odd_rows, cols_alloc, col_idx_lookup))
        l ÷= 2
    end

    for k_ in 2:k
        smawk!(row_argmins, rows, cols, allocs, lookup, k_)
        @inbounds for i in 1:n
            argmin = row_argmins[i]
            min = lookup(k_, i, argmin)
            D[k_, i] = min
            T[k_, i] = argmin
        end
    end

    #Extract cluster assignments by backtracking
    centroids = zeros(k)
    sorted_clusters = Vector{Int32}(undef, n)
    t = n + 1
    k_ = k
    n_ = n

    @inbounds while t > 1
        t_ = t
        t = T[k_, n_]
        centroid = 0.0
        for i in t:t_-1
            sorted_clusters[i] = k_
            centroid += (sorted_array[i] - centroid) / (i - t + 1)
        end
        centroids[k_] = centroid
        k_ -= 1
        n_ = t - 1
    end

    clusters = Vector{Int32}(undef, n)
    @inbounds for i in 1:n
        clusters[i] = sorted_clusters[undo_sort_lookup[i]]
    end

    return centroids, clusters
end

end  # module

2 Likes