# cas-pml/ML/aufgaben/kmeans/kmeans_digits.py

"""
Use k-means to try to match handwritten digits and see if changing the parameters
results in better recognition.

- This is an example of an unsupervised ML algorithm
    - it has no labels on the training data
    - it discovers the structure on its own
    - thus the cluster numbers are arbitrary and do not correspond to the class labels
"""

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA

# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# The printed shape is (n_samples, n_features); every sample is one digit
# image flattened into 64 pixel values.
# NOTE(review): the original note claimed 100 samples per digit; the
# scikit-learn docs list roughly 180 per class (1797 in total) - confirm
# against the shape printed here.
print(digits.data.shape)

# Alternative parameter sets to experiment with:
# kmeans = KMeans(n_clusters=10, init="random", n_init=1)
# kmeans = KMeans(n_clusters=10)
kmeans = KMeans(n_clusters=10, init="k-means++", n_init=10).fit(digits.data)

# Pair each true digit label with the cluster id assigned to that sample.
label_pairs = list(zip(digits.target, kmeans.labels_))
print(label_pairs)

# Scores that compare the cluster assignment against the true digit labels,
# printed in the order: homogeneity, completeness, adjusted Rand index.
for label_based_score in (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.adjusted_rand_score,
):
    print(label_based_score(digits.target, kmeans.labels_))

# The silhouette score is computed from the samples and their cluster
# assignment only; the true labels are not passed in.
print(metrics.silhouette_score(digits.data, kmeans.labels_))

# Project the samples onto their first two principal components so the
# clustering can be drawn on a plane; the cluster centres are mapped with the
# same fitted PCA so they land in the same coordinate system.
pca = PCA(n_components=2)
X2d = pca.fit_transform(digits.data)
centroids2d = pca.transform(kmeans.cluster_centers_)
pc1_share, pc2_share = pca.explained_variance_ratio_

pca_fig, pca_ax = plt.subplots(figsize=(10, 8))

# Samples, coloured by the cluster id k-means assigned to them.
sample_x, sample_y = X2d[:, 0], X2d[:, 1]
scatter = pca_ax.scatter(
    sample_x, sample_y, c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6
)

# Cluster centres, drawn on top as large red crosses.
centre_x, centre_y = centroids2d[:, 0], centroids2d[:, 1]
pca_ax.scatter(
    centre_x, centre_y, c='red', marker='X', s=200, edgecolors='black'
)

# Axis labels report how much of the total variance each component explains.
pca_ax.set_xlabel(f'PC1 ({pc1_share:.1%} var)')
pca_ax.set_ylabel(f'PC2 ({pc2_share:.1%} var)')
pca_ax.set_title('K-Means on Digits (PCA projection)')
pca_fig.colorbar(scatter, label='Cluster')
pca_fig.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight')

# Draw every cluster centre as an 8x8 image so the "prototype" each cluster
# has converged to can be inspected visually.
#
# The grid is sized from the fitted model instead of being hard-coded to 2x5:
# when n_clusters is changed while experimenting, fewer than 10 centres no
# longer raise IndexError and more than 10 centres are no longer silently
# dropped. For 10 clusters the layout (2 rows x 5 columns, 10x4 inches) is
# the same as before.
n_centroids = len(kmeans.cluster_centers_)
n_cols = 5
n_rows = -(-n_centroids // n_cols)  # ceiling division without importing math
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(2 * n_cols, 2 * n_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
for i, ax in enumerate(axes.flat):
    if i < n_centroids:
        ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r')
        ax.set_title(f'Cluster {i}')
    # Hide ticks and frame on every cell, including unused trailing cells.
    ax.axis('off')
fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight')

"""
Takeaway:
- K-means is not the right algorithm here, because the data is not distributed in nicely spherical groups and therefore does not cluster well.
"""