# cas-pml/ML/aufgaben/kmeans/kmeans_digits.py

"""
Use k-means to try to match handwritten digits and see if changing the parameters
results in better recognition.

- This is an example of an unsupervised ML algorithm
    - it has no labels in the training data
    - it discovers the structure on its own
    - thus the cluster numbers are arbitrary and do not correspond to the class labels

Takeaway:
- k-means is not the best algorithm here, because the data is not arranged in nicely
  spherical clusters and k-means struggles to determine the centroids cleanly.
"""

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the scikit-learn handwritten-digits dataset.
digits = datasets.load_digits()

# Shape is (n_samples, n_features): 1797 samples in total (roughly 180 per digit),
# each an 8x8 image flattened into 64 pixel values.
print(digits.data.shape)

# Parameter variants to try out. The seed is fixed so that a change in the scores
# below comes from the parameters and not from a different random initialisation;
# change or remove random_state to observe the run-to-run variance instead.
# kmeans = KMeans(n_clusters=10, init="random", n_init=1, random_state=0)
# kmeans = KMeans(n_clusters=10, random_state=0)
kmeans = KMeans(n_clusters=10, init="k-means++", n_init=10, random_state=0)
kmeans.fit(digits.data)

# (true digit, assigned cluster) for every sample. Cluster ids are arbitrary, so
# a pair such as (3, 7) is not an error by itself.
print(list(zip(digits.target, kmeans.labels_)))
# Homogeneity: does each cluster contain samples of only one digit?
print(metrics.homogeneity_score(digits.target, kmeans.labels_))
# Completeness: do all samples of one digit end up in the same cluster?
print(metrics.completeness_score(digits.target, kmeans.labels_))
# Adjusted Rand index: agreement between the two partitions, corrected for chance.
print(metrics.adjusted_rand_score(digits.target, kmeans.labels_))
# Silhouette: computed from the data and the cluster labels only (no ground truth).
print(metrics.silhouette_score(digits.data, kmeans.labels_))

# From Wikipedia:
# PCA (Principal Component Analysis) finds the directions in the data with the
# most variance and projects everything onto those axes.
#
# Something to do with eigenvectors and the covariance matrix, TODO: look into it.
# Here it maps each 64-dimensional vector onto the best possible 2D projection.
pca = PCA(n_components=2)
X2d = pca.fit_transform(digits.data)
# The centroids live in the original 64-dimensional space, so they are projected
# with the already-fitted PCA to land in the same 2D coordinates as the samples.
centroids2d = pca.transform(kmeans.cluster_centers_)

# Scatter plot of all samples coloured by cluster, with the centroids drawn on top;
# the tab10 colormap provides 10 distinct colours for the colour bar.
# The clusters overlap heavily here -> a hint that k-means is not optimal?
pca_fig, pca_ax = plt.subplots(figsize=(10, 8))
scatter = pca_ax.scatter(
    X2d[:, 0],
    X2d[:, 1],
    c=kmeans.labels_,
    cmap='tab10',
    s=10,
    alpha=0.6,
)
pca_ax.scatter(
    centroids2d[:, 0],
    centroids2d[:, 1],
    c='red',
    marker='X',
    s=200,
    edgecolors='black',
)
pca_ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
pca_ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
pca_ax.set_title('K-Means on Digits (PCA projection)')
pca_fig.colorbar(scatter, label='Cluster')
pca_fig.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight')

# Render every centroid as an 8x8 image, i.e. the "average character" of its cluster.
# This plot shows what k-means has "learned"; the mapping from cluster id to digit
# is visible at a glance.
#
# The grid is derived from the number of centroids instead of being fixed at 2x5,
# so changing n_clusters neither raises an IndexError nor silently drops centroids.
# With 10 clusters this is still a 2x5 grid on a 10x4 inch figure.
centers = kmeans.cluster_centers_
n_centers = len(centers)
n_cols = min(n_centers, 5)
n_rows = -(-n_centers // n_cols)  # ceiling division
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(2 * n_cols, 2 * n_rows),
    squeeze=False,  # always a 2D array of axes, even for a single row or column
)
for i, ax in enumerate(axes.flat):
    if i < n_centers:
        ax.imshow(centers[i].reshape(8, 8), cmap='gray_r')
        ax.set_title(f'Cluster {i}')
    # Hidden for every panel, including spare ones in a partially filled last row.
    ax.axis('off')
fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight')