Hi everyone,
I have been learning Julia since January by trying to implement classic machine learning algorithms from scratch. My latest project is the K-means clustering algorithm, and the implementation appears to be correct (I checked the results against scikit-learn's implementation in Python).
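For context, the correctness comparison was roughly along these lines (a minimal sketch, assuming the packages loaded in the Julia script below are available and that the `Kmeans` function defined further down is in scope; the variable names `Xc`, `sk_model` etc. are just illustrative and the exact tolerances are omitted):

using PyCall

skcluster = pyimport("sklearn.cluster")
skdata    = pyimport("sklearn.datasets")

# a smaller blob dataset just for the correctness check
Xc, _ = skdata.make_blobs(n_samples=10_000, n_features=3,
                          centers=3, cluster_std=0.9, random_state=80)

# reference result from sklearn
sk_model = skcluster.KMeans(n_clusters=3, init="k-means++", max_iter=300).fit(Xc)

# result from the Julia implementation defined further down
labels, centroids, ss = Kmeans(Xc, 3, verbose=false)

# the total within-cluster sum of squares should roughly agree with sklearn's inertia
println("sklearn inertia:      ", sk_model.inertia_)
println("Julia sum of squares: ", ss)

Since cluster labels are only defined up to permutation, the inertia / total sum of squares is the easiest quantity to compare directly.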
The Python implementation (using Python 3.8.1, sklearn 0.22.1, and IPython 7.12.0) is as follows:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1000000, n_features=3,
                  centers=3, cluster_std=0.9, random_state=80)

def test_speed(x):
    """
    Elbow-method loop for choosing the best k in terms of inertia
    """
    for i in range(2, 11):
        model = KMeans(n_clusters=i, init='k-means++', max_iter=300).fit(x)
        l, c, s = model.labels_, model.cluster_centers_, model.inertia_
%timeit test_speed(X)
which gives this result:
1min 20s ± 5.57 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
My current Julia implementation is as follows:
using Pkg
# Replace python environment to suit your needs
ENV["PYTHON"] = "YOUR PYTHON ENVIRONMENT WITH NEEDED PACKAGES"
Pkg.build("PyCall") # Build PyCall to suit the specified Python env
using PyCall
using Plots
using LinearAlgebra
using Statistics
using StatsBase
using BenchmarkTools
using Distances
# import the same data
data = pyimport("sklearn.datasets")
X, y = data.make_blobs(n_samples=1000000, n_features=3,
                       centers=3, cluster_std=0.9, random_state=80);
"""
smart_init(X, k; init="k-means++")
This function handles the random initialisation of the centroids from the
design matrix (X) and desired groups (k) that a user supplies.
"""
function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
    n_row, n_col = size(X)

    if init == "k-means++"
        # randomly select the first centroid from the data (X)
        centroids = zeros(k, n_col)
        rand_idx = rand(1:n_row)
        centroids[1, :] = X[rand_idx, :]

        # compute distances from the first centroid chosen to all the other data points
        first_centroid_matrix = convert(Matrix, centroids[1, :]')

        # flatten distances
        distances = vec(pairwise(Euclidean(), X, first_centroid_matrix, dims = 1))

        for i = 2:k
            # choose the next centroid: the probability of each data point being chosen
            # is directly proportional to its squared distance from the nearest centroid
            prob = distances .^ 2
            r_idx = sample(1:n_row, ProbabilityWeights(prob))
            centroids[i, :] = X[r_idx, :]
            # no need to update the distances once the last centroid has been chosen
            if i == k
                break
            end
            # compute distances from the new centroid to all data points
            current_centroid_matrix = convert(Matrix, centroids[i, :]')
            new_distances = vec(pairwise(Euclidean(), X, current_centroid_matrix, dims = 1))

            # and update each point's distance to its nearest centroid (element-wise minimum)
            distances = min.(distances, new_distances)
        end

    else
        # randomly select points from the design matrix as the initial centroids
        rand_indices = rand(1:n_row, k)
        centroids = X[rand_indices, :]
    end

    return centroids, n_row, n_col
end
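Just to illustrate what this returns (a hypothetical quick check on the blob data X loaded above, not part of the benchmark):

init_centroids, n_row, n_col = smart_init(X, 3)
size(init_centroids)   # (3, 3): one row per centroid, one column per feature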
"""
sum_of_squares(x, labels, centre, k)
This function computes the total sum of squares
"""
function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, k::Int)
    ss = 0

    for j = 1:k
        group_data = x[findall(labels .== j), :]
        group_centroid_matrix = convert(Matrix, centre[j, :]')
        group_distance = pairwise(Euclidean(), group_data, group_centroid_matrix, dims=1)
        ss += sum(group_distance .^ 2)
    end

    return ss
end
"""
Kmeans(design_matrix, k; k_init="k-means++", max_iters=300, tol=1e-4, verbose=true)
This main function employs the K-means algorithm to cluster all examples
in the training data (design_matrix) into k groups using either the
`k-means++` or random initialisation.
"""
function Kmeans(design_matrix::Array{Float64, 2}, k::Int64; k_init::String="k-means++",
                max_iters::Int64=300, tol=1e-4, verbose::Bool=true)
    centroids, n_row, n_col = smart_init(design_matrix, k, init=k_init)

    labels = rand(1:k, n_row)
    distances = zeros(n_row)

    J_previous = Inf64

    # Update centroids & labels with closest members until convergence
    for iter = 1:max_iters
        nearest_neighbour = pairwise(Euclidean(), design_matrix, centroids, dims=1)

        min_val_idx = findmin.(eachrow(nearest_neighbour))
        distances = [x[1] for x in min_val_idx]
        labels = [x[2] for x in min_val_idx]
        centroids = [mean(design_matrix[findall(labels .== j), :], dims = 1) for j = 1:k]
        centroids = reduce(vcat, centroids)
        # Cost objective
        J = (norm(distances) ^ 2) / n_row

        if verbose
            # Show progress and terminate if J stopped decreasing.
            println("Iteration ", iter, ": Jclust = ", J, ".")
        end

        # Final step: check for convergence
        if iter > 1 && abs(J - J_previous) < (tol * J)
            sum_squares = sum_of_squares(design_matrix, labels, centroids, k)

            # Terminate the algorithm with the assumption that K-means has converged
            if verbose
                println("Successfully terminated with convergence.")
            end

            return labels, centroids, sum_squares

        elseif iter == max_iters && abs(J - J_previous) > (tol * J)
            throw(error("Failed to converge. Check the data and/or implementation, or increase max_iters."))
        end

        J_previous = J
    end
end
# Test the speed of the implementation
function test_speed(x)
    for i = 2:10
        l, c, s = Kmeans(x, i, k_init="k-means++", verbose=false)
    end
end
# Benchmark results
r = @benchmark test_speed(X) samples=7 seconds=300
The benchmarking results are impressive: my not-so-optimized implementation is already almost 5x faster than a heavily optimized Python library like sklearn (roughly 80 s per loop versus a median of about 16.4 s, i.e. about 4.9x):
BenchmarkTools.Trial:
memory estimate: 14.43 GiB
allocs estimate: 76011811
--------------
minimum time: 15.255 s (14.98% GC)
median time: 16.433 s (14.41% GC)
mean time: 16.422 s (14.80% GC)
maximum time: 17.681 s (17.37% GC)
--------------
samples: 7
evals/sample: 1
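One direction I have been toying with (a rough sketch only, not benchmarked; the helper name assign_labels! and the dist_buffer variable are just placeholders I made up) is to pre-allocate the distance matrix once and fill it in place with pairwise! from Distances.jl, to cut down on allocations in the assignment step:

# sketch: reuse a pre-allocated n_row x k distance buffer on every iteration
function assign_labels!(dist_buffer, labels, distances, design_matrix, centroids)
    # fill dist_buffer in place: dist_buffer[i, j] = distance of point i to centroid j
    pairwise!(dist_buffer, Euclidean(), design_matrix, centroids, dims=1)

    # pick the nearest centroid for every point without allocating temporaries
    @inbounds for i in 1:size(dist_buffer, 1)
        best_val, best_idx = findmin(view(dist_buffer, i, :))
        distances[i] = best_val
        labels[i] = best_idx
    end

    return labels, distances
end

The buffer would be allocated once before the loop (e.g. zeros(size(design_matrix, 1), k)) and reused on every iteration, but I have not measured yet whether this actually helps.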
Are there any optimization tips that can squeeze even more performance out of Julia? Thanks in advance for even reading this long post! I love Julia!