Silhouette coefficient calculation

Shayan · November 7, 2022, 8:24am

I’ve written a Julia script to calculate the silhouette coefficient of the ClusterAnalysis.jl Kmeans’ result, but it’s highly inefficient and slow. I’ve written the procedure for both common and simplified versions of the metric calculation. I put the code down below and asking for any help to improve the efficiency. Also, I put the result of profiling by VSCode’s Julia extension to provide additional information.

1. The code

## 1. Packages
using ClusterAnalysis, DataFrames, Statistics, Tables, BenchmarkTools

## 2. Data
df = DataFrame(rand(Int64, 100_000, 3), :auto);
model = kmeans(df, 4);

## 3. Types declaration
abstract type Silhouette end
struct Common <: Silhouette end
struct Simplified <: Silhouette end

## 4. Functions
"""

    euclidean_dist(a::AbstractVector{T}, b::AbstractVector{S})::Float64 where {T <: Real, S<:Real}

Calculate the euclidean distance between two data points.

# Arguments (positional)
- `a`: First vector.
- `b`: Second vector.

# Returns
A Float64 value as the distance.

# Example

    function euclidean_dist(a::AbstractVector{T}, b::AbstractVector{T})::Float64 where T <: Real
        @assert size(a) == size(b) "a and b must have the same size"
        @assert typeof(a) == typeof(b) "a and b must have the same type"
        return sqrt(sum((big.(a) - big.(b)).^2))
    end

julia> a = rand(1, 2); b = rand(1, 2);
julia> euclidean_dist(a[1, :], b[1, :])
0.47287559342076224


"""
function euclidean_dist(a::AbstractVector{T}, b::AbstractVector{S})::Float64 where {T <: Real, S<:Real}
    @assert size(a) == size(b) "a and b must have the same size"
    sum_dist = zero(Float64)

    @simd for idx in eachindex(a)
        @inbounds sum_dist += (big(a[idx]) - big(b[idx]))^2
    end

    return sqrt(sum_dist)
end

"""

    aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
    aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}

Calculate the euclidean distance between i'th sample and data points in the same
cluster.

# Arguments (positional)
- `data`: data set.
- `labels`: cluster identity of each sample
- `i`: The index of the sample.

# Returns
A Float64 value that represents the sum distance between sample i and data points
in the same cluster.
"""

function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    same_cluster_members_idx = findall(isequal(labelᵢ), labels)
    n = length(same_cluster_members_idx)

    sum_dist = zero(Float64)
    @simd for idx in same_cluster_members_idx
        @inbounds sum_dist += euclidean_dist(data[i, :], data[idx, :])
    end

    return sum_dist/(n-1)
end

function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    return euclidean_dist(data[i,:], centers[labelᵢ])
end

"""
    bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
    bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}

Calculate the euclidean distance between i'th sample and data points in other clusters.

# Arguments (positional)
- `data`: data set.
- `labels`: cluster identity of each sample
- `i`: The index of the sample.
- `method`: Common or Simplified version of the silhouette.
- `centers`: Cluster centers.

# Returns
A Float64 value represents the sum distance between sample i and data points in other clusters.
"""

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    diff_cluster_members_idx = findall(!isequal(labelᵢ), labels)
    labels_ = labels[diff_cluster_members_idx]
    dissim_labels = unique(labels[diff_cluster_members_idx])
    labels_ = replace(labels_, (dissim_labels.=>[(1:length(dissim_labels))...])...)
    mean_dist = zeros(size(centers, 1)-1)
    for (n_idx,idx) in zip(labels_ , diff_cluster_members_idx)
        @inbounds mean_dist[n_idx] += euclidean_dist(data[i, :], data[idx, :])
    end

    return minimum(mean_dist)
end

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    unique_labels = unique(labels)
    clusters_to_iterate = [idx for idx=eachindex(unique_labels) if idx!=labelᵢ]
    mean_dist = zeros(length(clusters_to_iterate))
    @simd for clus_idx in eachindex(clusters_to_iterate)
        @inbounds mean_dist[clus_idx] = euclidean_dist(
            data[i,:],
            centers[clusters_to_iterate[clus_idx]]
        )
    end

    return minimum(mean_dist)
end

"""
    sᵢ(aᵢ, bᵢ)

Calculate the silhouette of i'th sample.

# Arguments (positional)
- `aᵢ`: The sum distance between i'th sample and data points in the same cluster.
- `bᵢ`: The sum distance between i'th sample and data points in other clusters.

# Returns
A Float64 value that represents the silhouette of the i'th sample.

"""
function sᵢ(aᵢ, bᵢ)
    return (bᵢ - aᵢ)/max(aᵢ, bᵢ)
end

"""
    Silouhette(input_data, model::KmeansResult)

Calculate the silhouette coefficient of the clustering result.

# Arguments (positional)
- `input_data`: data set.
- `model`: a `KmeansResult` object.

# Returns
A Float64 value represents the silhouette coefficient of the clustering result.
"""
function Silouhette(input_data, model::KmeansResult)
    Tables.istable(input_data) ? data=Tables.matrix(input_data) : error()
    labels = model.cluster
    centers = model.centroids
    if big(size(data, 1))^2 > 10^3
        method = Simplified()
        sᵢs = [
            sᵢ(
                aᵢ(data, labels, i, method, centers),
                bᵢ(data, labels, i, method, centers)
            )
            for i=1:size(data, 1)
        ]
    else
        method = Common()
        sᵢs = [
            sᵢ(
                aᵢ(data, labels, i, method),
                bᵢ(data, labels, i, method, centers)
            )
            for i=1:size(data, 1)
        ]
    end

    return mean(sᵢs)
end

## 5. Run
Silouhette(df, model)

2. Profiling

The picture has an acceptable quality, so it’s better to watch it closely. This result was achieved by running the code on the same df and model that I defined in the code above.

BambOoxX · November 7, 2022, 8:42am

Any reason not to use GitHub - JuliaStats/Distances.jl: A Julia package for evaluating distances (metrics) between vectors. for the actual distance computations ?

Shayan · November 7, 2022, 8:44am

Since I want to contribute to the development of ClusterAnalysis.jl, I decided not to add any additional dependencies to their package. That’s all reason behind it. They defined their own distance function here.

baggepinnen · November 7, 2022, 10:13am

Just FYI, there is an implementation of silhouettes available here
https://juliastats.org/Clustering.jl/stable/validate.html#Silhouettes-1

Shayan · November 7, 2022, 10:24am

Thanks, I knew that. It seems I should take a look to find my answer.

Shayan · November 7, 2022, 1:49pm

I made it.

Functions that changed:

The euclidean distance function got updated:

function euclidean(a::AbstractVector, b::AbstractArray)
    √(sum((a' .- b).^2))
end

All the aᵢ and the bᵢ implementations got updated:

function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    same_cluster_members_idx = findall(isequal(labelᵢ), labels)
    n = length(same_cluster_members_idx)

    sum_dist = euclidean(data[i, :], data[same_cluster_members_idx, :])
    return sum_dist/(n-1)
end

function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    return euclidean(data[i, :], centers[labelᵢ]')
end

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    dissim_labels = [idx for idx=1:length(centers) if idx!=labelᵢ]
    mean_dist = similar(dissim_labels, Float64)
    idx = 0
    for (idx,j) in enumerate(dissim_labels)
        related_idx = findall(isequal(j), labels)
        @inbounds mean_dist[idx] = euclidean(data[i, :], data[related_idx, :])
    end

    return minimum(mean_dist)
end

function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    clusters_to_iterate = [idx for idx=1:length(centers) if idx!=labelᵢ]
    center = vcat(transpose.(centers)...)
    mean_dist = [euclidean(data[i, :], center[clus_idx, :]) for clus_idx in clusters_to_iterate]

    return minimum(mean_dist)
end

Finally, benchmarking and the results (1000 data points and 4 clusters):

using ClusterAnalysis, DataFrames, Statistics, Tables, BenchmarkTools

df = DataFrame(rand(Int64, 1000, 2), :auto);
model = kmeans(df, 4);

@benchmark Silouhette($df, $model)
BenchmarkTools.Trial: 1741 samples with 1 evaluation.
 Range (min … max):  2.165 ms … 13.458 ms  ┊ GC (min … max): 0.00% … 63.10%
 Time  (median):     2.393 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.851 ms ±  1.478 ms  ┊ GC (mean ± σ):  9.71% ± 13.98%

  █▆▅▄▃▄▄▂▂▁▁
  ████████████▅▆▅▄▄▄▁▁▁▁▁▁▄▄▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▇▆▄▄▅▇▇▅▅ █
  2.16 ms      Histogram: log(frequency) by time     10.1 ms <

 Memory estimate: 1.76 MiB, allocs estimate: 33516.

And plotting the results:

plot(
    scatter(
        Matrix(df)[:, 1],
        Matrix(df)[:, 2],
   )
)

plot(
        kind=:scatter,
        Matrix(df)[:, 1],
        Matrix(df)[:, 2],
        group=model.cluster,
        legend=false,
        title="K-means clustering",
        xlabel="X",
        ylabel="Y",
        markersize=4,
        markerstrokewidth=0,
        markeralpha=0.5,
        markershape=:circle,
        color=[:red :blue :green :orange],
        size=(600, 400)

)

benchmarking (100,000 data points and 4 clusters):

df = DataFrame(rand(Int64, 100_000, 2), :auto);
model = kmeans(df, 4);

@benchmark Silouhette($df, $model)
BenchmarkTools.Trial: 17 samples with 1 evaluation.
 Range (min … max):  271.559 ms … 353.798 ms  ┊ GC (min … max): 11.00% … 9.50%
 Time  (median):     300.393 ms               ┊ GC (median):    10.40%
 Time  (mean ± σ):   305.673 ms ±  22.780 ms  ┊ GC (mean ± σ):  10.45% ± 1.08%

  ▁       █    █▁▁  ▁  █   ▁         ▁ ▁ ▁       ▁▁           ▁  
  █▁▁▁▁▁▁▁█▁▁▁▁███▁▁█▁▁█▁▁▁█▁▁▁▁▁▁▁▁▁█▁█▁█▁▁▁▁▁▁▁██▁▁▁▁▁▁▁▁▁▁▁█ ▁
  272 ms           Histogram: frequency by time          354 ms <

 Memory estimate: 177.74 MiB, allocs estimate: 3498518.

Topic		Replies	Views
Clustering.jl Silhouettes Distances General Usage k-means	8	2151	July 18, 2022
How to make suitable data for silhouettes() function of Clustering package? General Usage package	7	978	October 3, 2018
Optimization tips for my julia code. Can I make it even faster and/or memory efficient? Performance question , python	24	4212	February 15, 2020
Kmeans problem General Usage	1	702	February 5, 2020
DimensionMismatch: The size of a distance matrix ((1, 440)) doesn't match the length of assignment vector (440) Machine Learning clustering	3	253	November 25, 2022

Silhouette coefficient calculation

1. The code

2. Profiling

Related topics