# Silhouette coefficient calculation

I’ve written a Julia script to calculate the silhouette coefficient of the `ClusterAnalysis.jl` Kmeans’ result, but it’s highly inefficient and slow. I’ve written the procedure for both common and simplified versions of the metric calculation. I put the code down below and asking for any help to improve the efficiency. Also, I put the result of profiling by VSCode’s Julia extension to provide additional information.

# 1. The code

``````## 1. Packages
using ClusterAnalysis, DataFrames, Statistics, Tables, BenchmarkTools

## 2. Data
df = DataFrame(rand(Int64, 100_000, 3), :auto);
model = kmeans(df, 4);

## 3. Types declaration
abstract type Silhouette end
struct Common <: Silhouette end
struct Simplified <: Silhouette end

## 4. Functions
"""

euclidean_dist(a::AbstractVector{T}, b::AbstractVector{S})::Float64 where {T <: Real, S<:Real}

Calculate the euclidean distance between two data points.

# Arguments (positional)
- `a`: First vector.
- `b`: Second vector.

# Returns
A Float64 value as the distance.

# Example

function euclidean_dist(a::AbstractVector{T}, b::AbstractVector{T})::Float64 where T <: Real
@assert size(a) == size(b) "a and b must have the same size"
@assert typeof(a) == typeof(b) "a and b must have the same type"
return sqrt(sum((big.(a) - big.(b)).^2))
end

julia> a = rand(1, 2); b = rand(1, 2);
julia> euclidean_dist(a[1, :], b[1, :])
0.47287559342076224

"""
function euclidean_dist(a::AbstractVector{T}, b::AbstractVector{S})::Float64 where {T <: Real, S<:Real}
@assert size(a) == size(b) "a and b must have the same size"
sum_dist = zero(Float64)

@simd for idx in eachindex(a)
@inbounds sum_dist += (big(a[idx]) - big(b[idx]))^2
end

return sqrt(sum_dist)
end

"""

aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}

Calculate the euclidean distance between i'th sample and data points in the same
cluster.

# Arguments (positional)
- `data`: data set.
- `labels`: cluster identity of each sample
- `i`: The index of the sample.

# Returns
A Float64 value that represents the sum distance between sample i and data points
in the same cluster.
"""

function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
labelᵢ = labels[i]
same_cluster_members_idx = findall(isequal(labelᵢ), labels)
n = length(same_cluster_members_idx)

sum_dist = zero(Float64)
@simd for idx in same_cluster_members_idx
@inbounds sum_dist += euclidean_dist(data[i, :], data[idx, :])
end

return sum_dist/(n-1)
end

function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
return euclidean_dist(data[i,:], centers[labelᵢ])
end

"""
bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}

Calculate the euclidean distance between i'th sample and data points in other clusters.

# Arguments (positional)
- `data`: data set.
- `labels`: cluster identity of each sample
- `i`: The index of the sample.
- `method`: Common or Simplified version of the silhouette.
- `centers`: Cluster centers.

# Returns
A Float64 value represents the sum distance between sample i and data points in other clusters.
"""

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
diff_cluster_members_idx = findall(!isequal(labelᵢ), labels)
labels_ = labels[diff_cluster_members_idx]
dissim_labels = unique(labels[diff_cluster_members_idx])
labels_ = replace(labels_, (dissim_labels.=>[(1:length(dissim_labels))...])...)
mean_dist = zeros(size(centers, 1)-1)
for (n_idx,idx) in zip(labels_ , diff_cluster_members_idx)
@inbounds mean_dist[n_idx] += euclidean_dist(data[i, :], data[idx, :])
end

return minimum(mean_dist)
end

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
unique_labels = unique(labels)
clusters_to_iterate = [idx for idx=eachindex(unique_labels) if idx!=labelᵢ]
mean_dist = zeros(length(clusters_to_iterate))
@simd for clus_idx in eachindex(clusters_to_iterate)
@inbounds mean_dist[clus_idx] = euclidean_dist(
data[i,:],
centers[clusters_to_iterate[clus_idx]]
)
end

return minimum(mean_dist)
end

"""
sᵢ(aᵢ, bᵢ)

Calculate the silhouette of i'th sample.

# Arguments (positional)
- `aᵢ`: The sum distance between i'th sample and data points in the same cluster.
- `bᵢ`: The sum distance between i'th sample and data points in other clusters.

# Returns
A Float64 value that represents the silhouette of the i'th sample.

"""
function sᵢ(aᵢ, bᵢ)
return (bᵢ - aᵢ)/max(aᵢ, bᵢ)
end

"""
Silouhette(input_data, model::KmeansResult)

Calculate the silhouette coefficient of the clustering result.

# Arguments (positional)
- `input_data`: data set.
- `model`: a `KmeansResult` object.

# Returns
A Float64 value represents the silhouette coefficient of the clustering result.
"""
function Silouhette(input_data, model::KmeansResult)
Tables.istable(input_data) ? data=Tables.matrix(input_data) : error()
labels = model.cluster
centers = model.centroids
if big(size(data, 1))^2 > 10^3
method = Simplified()
sᵢs = [
sᵢ(
aᵢ(data, labels, i, method, centers),
bᵢ(data, labels, i, method, centers)
)
for i=1:size(data, 1)
]
else
method = Common()
sᵢs = [
sᵢ(
aᵢ(data, labels, i, method),
bᵢ(data, labels, i, method, centers)
)
for i=1:size(data, 1)
]
end

return mean(sᵢs)
end

## 5. Run
Silouhette(df, model)
``````

# 2. Profiling

The picture has an acceptable quality, so it’s better to watch it closely. This result was achieved by running the code on the same `df` and `model` that I defined in the code above.

1 Like

Any reason not to use GitHub - JuliaStats/Distances.jl: A Julia package for evaluating distances (metrics) between vectors. for the actual distance computations ?

Since I want to contribute to the development of `ClusterAnalysis.jl`, I decided not to add any additional dependencies to their package. That’s all reason behind it. They defined their own distance function here.

Just FYI, there is an implementation of silhouettes available here
https://juliastats.org/Clustering.jl/stable/validate.html#Silhouettes-1

1 Like

Thanks, I knew that. It seems I should take a look to find my answer.

Functions that changed:
1. The euclidean distance function got updated:

``````function euclidean(a::AbstractVector, b::AbstractArray)
√(sum((a' .- b).^2))
end
``````
2. All the `aᵢ` and the `bᵢ` implementations got updated:

``````function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
labelᵢ = labels[i]
same_cluster_members_idx = findall(isequal(labelᵢ), labels)
n = length(same_cluster_members_idx)

sum_dist = euclidean(data[i, :], data[same_cluster_members_idx, :])
return sum_dist/(n-1)
end

function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
return euclidean(data[i, :], centers[labelᵢ]')
end

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
dissim_labels = [idx for idx=1:length(centers) if idx!=labelᵢ]
mean_dist = similar(dissim_labels, Float64)
idx = 0
for (idx,j) in enumerate(dissim_labels)
related_idx = findall(isequal(j), labels)
@inbounds mean_dist[idx] = euclidean(data[i, :], data[related_idx, :])
end

return minimum(mean_dist)
end

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
labelᵢ = labels[i]
clusters_to_iterate = [idx for idx=1:length(centers) if idx!=labelᵢ]
center = vcat(transpose.(centers)...)
mean_dist = [euclidean(data[i, :], center[clus_idx, :]) for clus_idx in clusters_to_iterate]

return minimum(mean_dist)
end
``````
Finally, benchmarking and the results (1000 data points and 4 clusters):
``````using ClusterAnalysis, DataFrames, Statistics, Tables, BenchmarkTools

df = DataFrame(rand(Int64, 1000, 2), :auto);
model = kmeans(df, 4);

@benchmark Silouhette(\$df, \$model)
BenchmarkTools.Trial: 1741 samples with 1 evaluation.
Range (min … max):  2.165 ms … 13.458 ms  ┊ GC (min … max): 0.00% … 63.10%
Time  (median):     2.393 ms              ┊ GC (median):    0.00%
Time  (mean ± σ):   2.851 ms ±  1.478 ms  ┊ GC (mean ± σ):  9.71% ± 13.98%

█▆▅▄▃▄▄▂▂▁▁
████████████▅▆▅▄▄▄▁▁▁▁▁▁▄▄▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▇▆▄▄▅▇▇▅▅ █
2.16 ms      Histogram: log(frequency) by time     10.1 ms <

Memory estimate: 1.76 MiB, allocs estimate: 33516.
``````

And plotting the results:

``````plot(
scatter(
Matrix(df)[:, 1],
Matrix(df)[:, 2],
)
)

plot(
kind=:scatter,
Matrix(df)[:, 1],
Matrix(df)[:, 2],
group=model.cluster,
legend=false,
title="K-means clustering",
xlabel="X",
ylabel="Y",
markersize=4,
markerstrokewidth=0,
markeralpha=0.5,
markershape=:circle,
color=[:red :blue :green :orange],
size=(600, 400)

)
``````

benchmarking (100,000 data points and 4 clusters):
``````df = DataFrame(rand(Int64, 100_000, 2), :auto);
model = kmeans(df, 4);

@benchmark Silouhette(\$df, \$model)
BenchmarkTools.Trial: 17 samples with 1 evaluation.
Range (min … max):  271.559 ms … 353.798 ms  ┊ GC (min … max): 11.00% … 9.50%
Time  (median):     300.393 ms               ┊ GC (median):    10.40%
Time  (mean ± σ):   305.673 ms ±  22.780 ms  ┊ GC (mean ± σ):  10.45% ± 1.08%

▁       █    █▁▁  ▁  █   ▁         ▁ ▁ ▁       ▁▁           ▁
█▁▁▁▁▁▁▁█▁▁▁▁███▁▁█▁▁█▁▁▁█▁▁▁▁▁▁▁▁▁█▁█▁█▁▁▁▁▁▁▁██▁▁▁▁▁▁▁▁▁▁▁█ ▁
272 ms           Histogram: frequency by time          354 ms <

Memory estimate: 177.74 MiB, allocs estimate: 3498518.
``````