Silhouette coefficient calculation

Shayan · November 7, 2022, 1:49pm

I got it working.

Functions that changed:

The euclidean distance function got updated:

"""
    euclidean(a::AbstractVector, b::AbstractArray)

Return the Euclidean (ℓ²) norm of the difference between the point `a` and `b`.

- If `b` is a vector it is treated as a single point and the result is the ordinary
  Euclidean distance between `a` and `b`.
- Otherwise `a` is broadcast as a 1×d row against the rows of `b` (so `size(b, 2)` must
  equal `length(a)`), and the result is the square root of the sum of *all* squared
  differences. For a single-row `b` this is the distance to that row; for several rows
  it is **not** the sum of the per-row distances.
"""
euclidean(a::AbstractVector, b::AbstractVector) = √(sum(abs2, a .- b))

function euclidean(a::AbstractVector, b::AbstractArray)
    # `a'` turns the point into a 1×d row so that it lines up with every row of `b`.
    # (Against a plain vector this would broadcast to a d×d table of differences,
    # which is why vectors are handled by the dedicated method above.)
    return √(sum(abs2, a' .- b))
end

All the aᵢ and the bᵢ implementations got updated:

"""
    aᵢ(data, labels, i, method::Common)

Return the mean Euclidean distance from observation `i` (row `i` of `data`) to the other
observations that carry the same label, i.e. the `a(i)` term of the silhouette
coefficient.

# Arguments
- `data`: matrix with one observation per row.
- `labels`: cluster label of every row of `data`.
- `i`: row index of the observation of interest.
- `method`: dispatch tag selecting the exact (pairwise) variant.

# Returns
The mean distance as a floating-point number. When observation `i` is the only member of
its cluster the result is `NaN` (`0 / 0`); the caller has to handle that case.
"""
function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    same_cluster_members_idx = findall(isequal(labelᵢ), labels)
    n = length(same_cluster_members_idx)
    point = data[i, :]

    # Sum the individual point-to-point distances. Each member is handed over as a 1×d
    # row (`data[j:j, :]`) so that `euclidean` measures the distance to exactly that row;
    # passing the whole member matrix at once would yield the root of the total squared
    # difference instead of the sum of distances.
    # The point's distance to itself is 0, so summing over all `n` members and dividing
    # by `n - 1` gives the mean over the *other* members.
    sum_dist = sum(j -> euclidean(point, data[j:j, :]), same_cluster_members_idx)
    return sum_dist / (n - 1)
end

"""
    aᵢ(data, labels, i, method::Simplified, centers)

Return the Euclidean distance between observation `i` (row `i` of `data`) and the center
stored in `centers` under that observation's own label.
"""
function aᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
    point = data[i, :]
    own_center = centers[labels[i]]
    # The center is passed as a 1×d row so it lines up with the row form of `point`
    # that `euclidean` builds internally.
    return euclidean(point, own_center')
end

"""
    bᵢ(data, labels, i, method::Common, centers)

Return the smallest mean Euclidean distance from observation `i` (row `i` of `data`) to
the members of any cluster other than its own, i.e. the `b(i)` term of the silhouette
coefficient.

# Arguments
- `data`: matrix with one observation per row.
- `labels`: cluster label of every row of `data`; labels are compared against the
  indices of `centers`.
- `i`: row index of the observation of interest.
- `method`: dispatch tag selecting the exact (pairwise) variant.
- `centers`: one entry per cluster; only its indices are used here.

# Throws
- `ArgumentError` if no cluster other than the observation's own has any member.
"""
function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Common, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    point = data[i, :]
    nearest = Inf
    found = false
    for j in eachindex(centers)
        j == labelᵢ && continue
        members = findall(isequal(j), labels)
        # A cluster without members has no mean distance and must not win the minimum.
        isempty(members) && continue
        # Mean of the individual distances; every member is passed as a 1×d row so that
        # `euclidean` measures the distance to exactly that row.
        mean_dist = sum(m -> euclidean(point, data[m:m, :]), members) / length(members)
        nearest = min(nearest, mean_dist)
        found = true
    end

    found || throw(ArgumentError("bᵢ needs at least one other non-empty cluster"))
    return nearest
end

"""
    bᵢ(data, labels, i, method::Simplified, centers)

Return the Euclidean distance from observation `i` (row `i` of `data`) to the closest
center among the clusters other than its own.

# Arguments
- `data`: matrix with one observation per row.
- `labels`: cluster label of every row of `data`; labels are compared against the
  indices of `centers`.
- `i`: row index of the observation of interest.
- `method`: dispatch tag selecting the center-based variant.
- `centers`: collection holding one center vector per cluster.

# Throws
An error from `minimum` if `centers` holds no cluster other than the observation's own.
"""
function bᵢ(data::AbstractArray{T}, labels::AbstractVector{S}, i::Int64, method::Simplified, centers) where {T<:Real, S<:Real}
    labelᵢ = labels[i]
    point = data[i, :]
    other_clusters = (c for c in eachindex(centers) if c != labelᵢ)

    # Each center is passed as a 1×d row, the same way the `Simplified` `aᵢ` does.
    # Handing `euclidean` a plain center vector would broadcast the row form of `point`
    # against a column and produce a d×d table of differences instead of a distance.
    return minimum(c -> euclidean(point, centers[c]'), other_clusters)
end

Finally, benchmarking and the results (1000 data points and 4 clusters):

# Packages used by the benchmark. `kmeans` and `Silouhette` are not defined in this
# post — presumably they come from ClusterAnalysis; confirm.
using ClusterAnalysis, DataFrames, Statistics, Tables, BenchmarkTools

# 1000 rows × 2 columns of random `Int64` values with auto-generated column names.
# NOTE(review): `rand(Int64, ...)` draws from the full Int64 range, so integer
# differences/squares can overflow unless the data is converted to floating point
# before distances are computed — confirm what `kmeans`/`Silouhette` do with it.
df = DataFrame(rand(Int64, 1000, 2), :auto);
# Cluster the data into 4 groups.
model = kmeans(df, 4);

# `$` interpolates the global variables into the benchmarked expression.
@benchmark Silouhette($df, $model)
BenchmarkTools.Trial: 1741 samples with 1 evaluation.
 Range (min … max):  2.165 ms … 13.458 ms  ┊ GC (min … max): 0.00% … 63.10%
 Time  (median):     2.393 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.851 ms ±  1.478 ms  ┊ GC (mean ± σ):  9.71% ± 13.98%

  █▆▅▄▃▄▄▂▂▁▁
  ████████████▅▆▅▄▄▄▁▁▁▁▁▁▄▄▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▇▆▄▄▅▇▇▅▅ █
  2.16 ms      Histogram: log(frequency) by time     10.1 ms <

 Memory estimate: 1.76 MiB, allocs estimate: 33516.

And plotting the results:

# Scatter of the first column of `df` against the second, without any grouping.
# NOTE(review): no plotting package appears in the `using` line shown in this post —
# confirm which package provides `plot` and `scatter`.
plot(
    scatter(
        Matrix(df)[:, 1],
        Matrix(df)[:, 2],
   )
)

# Scatter of the same two columns, grouped by `model.cluster` (presumably the cluster
# assignment of every row — confirm) and drawn with one colour per group.
# NOTE(review): the plotting package is not shown in this post; verify that it accepts
# both `kind=:scatter` and the `group`/`marker*` keywords used here.
plot(
        kind=:scatter,
        Matrix(df)[:, 1],
        Matrix(df)[:, 2],
        group=model.cluster,
        legend=false,
        title="K-means clustering",
        xlabel="X",
        ylabel="Y",
        markersize=4,
        markerstrokewidth=0,
        markeralpha=0.5,
        markershape=:circle,
        color=[:red :blue :green :orange],
        size=(600, 400)

)

Benchmarking with a larger data set (100,000 data points and 4 clusters):

# Same benchmark on 100_000 rows × 2 columns of random `Int64` values.
# NOTE(review): values span the full Int64 range, so integer differences/squares can
# overflow unless the data is converted to floating point first — confirm.
df = DataFrame(rand(Int64, 100_000, 2), :auto);
# Cluster the data into 4 groups.
model = kmeans(df, 4);

# `$` interpolates the global variables into the benchmarked expression.
@benchmark Silouhette($df, $model)
BenchmarkTools.Trial: 17 samples with 1 evaluation.
 Range (min … max):  271.559 ms … 353.798 ms  ┊ GC (min … max): 11.00% … 9.50%
 Time  (median):     300.393 ms               ┊ GC (median):    10.40%
 Time  (mean ± σ):   305.673 ms ±  22.780 ms  ┊ GC (mean ± σ):  10.45% ± 1.08%

  ▁       █    █▁▁  ▁  █   ▁         ▁ ▁ ▁       ▁▁           ▁  
  █▁▁▁▁▁▁▁█▁▁▁▁███▁▁█▁▁█▁▁▁█▁▁▁▁▁▁▁▁▁█▁█▁█▁▁▁▁▁▁▁██▁▁▁▁▁▁▁▁▁▁▁█ ▁
  272 ms           Histogram: frequency by time          354 ms <

 Memory estimate: 177.74 MiB, allocs estimate: 3498518.

Topic		Replies	Views
Clustering.jl Silhouettes Distances General Usage k-means	8	2123	July 18, 2022
How to make suitable data for silhouettes() function of Clustering package? General Usage package	7	974	October 3, 2018
Optimization tips for my julia code. Can I make it even faster and/or memory efficient? Performance question , python	24	4204	February 15, 2020
DimensionMismatch: The size of a distance matrix ((1, 440)) doesn't match the length of assignment vector (440) Machine Learning clustering	3	249	November 25, 2022
Faster squared euclidean distance calculation Performance	11	1759	October 2, 2021

Silhouette coefficient calculation

Related topics