I’ve written a Julia script to calculate the silhouette coefficient of the ClusterAnalysis.jl
Kmeans’ result, but it’s highly inefficient and slow. I’ve written the procedure for both common and simplified versions of the metric calculation. I put the code down below and asking for any help to improve the efficiency. Also, I put the result of profiling by VSCode’s Julia extension to provide additional information.
1. The code
## 1. Packages
using ClusterAnalysis, DataFrames, Statistics, Tables, BenchmarkTools
## 2. Data
df = DataFrame(rand(Int64, 100_000, 3), :auto);
model = kmeans(df, 4);
## 3. Types declaration
abstract type Silhouette end
struct Common <: Silhouette end
struct Simplified <: Silhouette end
## 4. Functions
"""
euclidean_dist(a::AbstractVector{T}, b::AbstractVector{S})::Float64 where {T <: Real, S<:Real}
Calculate the euclidean distance between two data points.
# Arguments (positional)
- `a`: First vector.
- `b`: Second vector.
# Returns
A Float64 value as the distance.
# Example
function euclidean_dist(a::AbstractVector{T}, b::AbstractVector{T})::Float64 where T <: Real
@assert size(a) == size(b) "a and b must have the same size"
@assert typeof(a) == typeof(b) "a and b must have the same type"
return sqrt(sum((big.(a) - big.(b)).^2))
end
julia> a = rand(1, 2); b = rand(1, 2);
julia> euclidean_dist(a[1, :], b[1, :])
0.47287559342076224
"""
function euclidean_dist(a::AbstractVector{T}, b::AbstractVector{S})::Float64 where {T <: Real, S<:Real}
@assert size(a) == size(b) "a and b must have the same size"
sum_dist = zero(Float64)
@simd for idx in eachindex(a)
@inbounds sum_dist += (big(a[idx]) - big(b[idx]))^2
end
return sqrt(sum_dist)
end
"""
aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
Calculate the euclidean distance between i'th sample and data points in the same
cluster.
# Arguments (positional)
- `data`: data set.
- `labels`: cluster identity of each sample
- `i`: The index of the sample.
# Returns
A Float64 value that represents the sum distance between sample i and data points
in the same cluster.
"""
function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
labelᵢ = labels[i]
same_cluster_members_idx = findall(isequal(labelᵢ), labels)
n = length(same_cluster_members_idx)
sum_dist = zero(Float64)
@simd for idx in same_cluster_members_idx
@inbounds sum_dist += euclidean_dist(data[i, :], data[idx, :])
end
return sum_dist/(n-1)
end
function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
return euclidean_dist(data[i,:], centers[labelᵢ])
end
"""
bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
Calculate the euclidean distance between i'th sample and data points in other clusters.
# Arguments (positional)
- `data`: data set.
- `labels`: cluster identity of each sample
- `i`: The index of the sample.
- `method`: Common or Simplified version of the silhouette.
- `centers`: Cluster centers.
# Returns
A Float64 value represents the sum distance between sample i and data points in other clusters.
"""
function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
diff_cluster_members_idx = findall(!isequal(labelᵢ), labels)
labels_ = labels[diff_cluster_members_idx]
dissim_labels = unique(labels[diff_cluster_members_idx])
labels_ = replace(labels_, (dissim_labels.=>[(1:length(dissim_labels))...])...)
mean_dist = zeros(size(centers, 1)-1)
for (n_idx,idx) in zip(labels_ , diff_cluster_members_idx)
@inbounds mean_dist[n_idx] += euclidean_dist(data[i, :], data[idx, :])
end
return minimum(mean_dist)
end
function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
unique_labels = unique(labels)
clusters_to_iterate = [idx for idx=eachindex(unique_labels) if idx!=labelᵢ]
mean_dist = zeros(length(clusters_to_iterate))
@simd for clus_idx in eachindex(clusters_to_iterate)
@inbounds mean_dist[clus_idx] = euclidean_dist(
data[i,:],
centers[clusters_to_iterate[clus_idx]]
)
end
return minimum(mean_dist)
end
"""
sᵢ(aᵢ, bᵢ)
Calculate the silhouette of i'th sample.
# Arguments (positional)
- `aᵢ`: The sum distance between i'th sample and data points in the same cluster.
- `bᵢ`: The sum distance between i'th sample and data points in other clusters.
# Returns
A Float64 value that represents the silhouette of the i'th sample.
"""
function sᵢ(aᵢ, bᵢ)
return (bᵢ - aᵢ)/max(aᵢ, bᵢ)
end
"""
Silouhette(input_data, model::KmeansResult)
Calculate the silhouette coefficient of the clustering result.
# Arguments (positional)
- `input_data`: data set.
- `model`: a `KmeansResult` object.
# Returns
A Float64 value represents the silhouette coefficient of the clustering result.
"""
function Silouhette(input_data, model::KmeansResult)
Tables.istable(input_data) ? data=Tables.matrix(input_data) : error()
labels = model.cluster
centers = model.centroids
if big(size(data, 1))^2 > 10^3
method = Simplified()
sᵢs = [
sᵢ(
aᵢ(data, labels, i, method, centers),
bᵢ(data, labels, i, method, centers)
)
for i=1:size(data, 1)
]
else
method = Common()
sᵢs = [
sᵢ(
aᵢ(data, labels, i, method),
bᵢ(data, labels, i, method, centers)
)
for i=1:size(data, 1)
]
end
return mean(sᵢs)
end
## 5. Run
Silouhette(df, model)
2. Profiling
The picture has an acceptable quality, so it’s better to watch it closely. This result was achieved by running the code on the same df
and model
that I defined in the code above.