Here’s a more faithful translation of the C++ code:
KMeans1D.jl
module Kmeans1D
function smawk(nrows, ncols, lookup, k)
result = Vector{Int32}(undef, nrows)
rows = collect(Int32.(1:nrows))
cols = collect(Int32.(1:ncols))
smawk!(result, rows, cols, lookup, k)
end
function smawk!(result, rows, cols, lookup, k)
length(rows) == 0 && return result
## REDUCE
#
_cols = Int32[]
for col in cols
@inbounds while true
length(_cols) == 0 && break
row = rows[length(_cols)]
if lookup(k, row, col) >= lookup(k, row, _cols[end])
break
end
pop!(_cols)
end
if length(_cols) < length(rows)
push!(_cols, col)
end
end
# Call recursively on odd-indexed rows
odd_rows = Int32[]
@inbounds for i in 2:2:length(rows)
push!(odd_rows, rows[i])
end
smawk!(result, odd_rows, _cols, lookup, k)
col_idx_lookup = Dict{Int32, Int32}()
@inbounds for idx in 1:length(_cols)
col_idx_lookup[_cols[idx]] = idx
end
## INTERPOLATE
# Fill-in even-indexed rows
start = 1
@inbounds for r in 1:2:length(rows)
row = rows[r]
stop = length(_cols) - 1
if r < (length(rows) - 1)
stop = col_idx_lookup[result[rows[r + 1]]]
end
argmin = _cols[start]
min = lookup(k, row, argmin)
for c in start+1:stop+1
value = lookup(k, row, _cols[c])
if (c == start) || (value < min)
argmin = _cols[c]
min = value
end
end
result[row] = argmin
start = stop
end
return result
end
struct CostCalculator
cumsum::Vector{Float32}
cumsum2::Vector{Float32}
function CostCalculator(array, n::Integer)
cumsum = Float32[0.0]
cumsum2 = Float32[0.0]
@inbounds for i in 1:n
x = array[i]
push!(cumsum, x + cumsum[i])
push!(cumsum2, x * x + cumsum2[i])
end
return new(cumsum, cumsum2)
end
end
function calc(cc::CostCalculator, i, j)
if j < i
return zero(eltype(cc.cumsum))
end
mu = (cc.cumsum[j + 1] - cc.cumsum[i]) / (j - i + 1)
result = cc.cumsum2[j + 1] - cc.cumsum2[i]
result += (j - i + 1) * (mu * mu)
result -= (2 * mu) * (cc.cumsum[j + 1] - cc.cumsum[i])
return result
end
struct LookUp
calculator::CostCalculator
D::Matrix{Float32}
end
function (lu::LookUp)(k, i, j)
col = min(i, j - 1)
if col == 0
col = size(lu.D, 2) + col
end
return lu.D[k - 1, col] + calc(lu.calculator, j, i)
end
function cluster(array, k)
n = length(array)
return cluster(array, n, min(k, n))
end
function cluster(array, n, k)
# Sort input array and save info for de-sorting
sort_idx = sortperm(array)
undo_sort_lookup = Vector{Int32}(undef, n)
sorted_array = Vector{Float32}(undef, n)
@inbounds for i in 1:n
sorted_array[i] = array[sort_idx[i]]
undo_sort_lookup[sort_idx[i]] = i
end
#Set D and T using dynamic programming algorithm
cost_calculator = CostCalculator(sorted_array, n)
D = Matrix{Float32}(undef, k, n)
T = Matrix{Int32}(undef, k, n)
lookup = LookUp(cost_calculator, D)
@inbounds for i in 1:n
D[1, i] = calc(cost_calculator, 1, i)
T[1, i] = 1
end
for k_ in 2:k
row_argmins = smawk(n, n, lookup, k_)
@inbounds for i in 1:n
argmin = row_argmins[i]
min = lookup(k_, i, argmin)
D[k_, i] = min
T[k_, i] = argmin
end
end
#Extract cluster assignments by backtracking
centroids = zeros(k)
sorted_clusters = Vector{Int32}(undef, n)
t = n + 1
k_ = k
n_ = n
@inbounds while t > 1
t_ = t
t = T[k_, n_]
centroid = 0.0
for i in t:t_-1
sorted_clusters[i] = k_
centroid += (sorted_array[i] - centroid) / (i - t + 1)
end
centroids[k_] = centroid
k_ -= 1
n_ = t - 1
end
clusters = Vector{Int32}(undef, n)
@inbounds for i in 1:n
clusters[i] = sorted_clusters[undo_sort_lookup[i]]
end
return centroids, clusters
end
end # module
As others have mentioned, there are issues regarding the type instability of calc
and the use of some globals, but the main issue is that your CostCalculator
has abstract fields Array{Float32}
as opposed Vector{Float32}
(AKA Array{Float32, 1}
), so julia cannot really optimize the code paths involving CostCalculator
.
With the code above I get around 5 ms.