refactor: update kmeans examples with better plots

This commit is contained in:
2026-05-01 11:04:31 +02:00
parent 959e53b7b3
commit d5258a6edf
5 changed files with 20 additions and 10 deletions
Binary file not shown.

Before

Width:  |  Height:  |  Size: 402 KiB

After

Width:  |  Height:  |  Size: 403 KiB

+16 -7
View File
@@ -3,16 +3,20 @@ Use k-means to try to match handwritten digits and see if changing the parameter
results in better recognition. results in better recognition.
- This is an example of an unsupervised ML algorithm - This is an example of an unsupervised ML algorithm
- it has no labels on the training data - it has no labels in the training data
- it discovers the structure on its own - it discovers the structure on its own
- thus the cluster numbers are arbitrary and do not correspond to the class labels - thus the cluster numbers are arbitrary and do not correspond to the class labels
Takeaway:
- Hier ist k-means nicht der beste Algorithmus, weil die Daten nicht in schön kugelförmig
verteilten Clustern angeordnet sind und k-means Mühe hat, die Centroiden sauber zu bestimmen.
""" """
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn import datasets from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
# get the digits dataset # get the digits dataset
@@ -34,10 +38,18 @@ print(metrics.completeness_score(digits.target, kmeans.labels_))
print(metrics.adjusted_rand_score(digits.target, kmeans.labels_)) print(metrics.adjusted_rand_score(digits.target, kmeans.labels_))
print(metrics.silhouette_score(digits.data, kmeans.labels_)) print(metrics.silhouette_score(digits.data, kmeans.labels_))
# Wikipedia:
# PCA (Principal Component Analysis) finds the directions in your data with
# the most variance and projects everything onto those axes.
#
# Basiert auf den Eigenvektoren der Kovarianzmatrix, TODO: genauer anschauen
# Transformiert einen 64-dimensionalen Vektor möglichst gut in eine 2D-Projektion
pca = PCA(n_components=2) pca = PCA(n_components=2)
X2d = pca.fit_transform(digits.data) X2d = pca.fit_transform(digits.data)
centroids2d = pca.transform(kmeans.cluster_centers_) centroids2d = pca.transform(kmeans.cluster_centers_)
# Punktewolke plotten und Centroiden einzeichnen, tab10 gibt 10 verschiedene Farben für die Legende
# Hier sieht man grosse Überlappung zwischen den Clustern -> ein Hinweis, dass K-Means nicht optimal ist?
plt.figure(figsize=(10, 8)) plt.figure(figsize=(10, 8))
scatter = plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6) scatter = plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6)
plt.scatter(centroids2d[:, 0], centroids2d[:, 1], c='red', marker='X', s=200, edgecolors='black') plt.scatter(centroids2d[:, 0], centroids2d[:, 1], c='red', marker='X', s=200, edgecolors='black')
@@ -47,14 +59,11 @@ plt.title('K-Means on Digits (PCA projection)')
plt.colorbar(scatter, label='Cluster') plt.colorbar(scatter, label='Cluster')
plt.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight') plt.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight')
# Centroiden als 8x8-Bild darstellen, indem man das "durchschnittliche Zeichen" um das Zentrum plottet
# Dieser Plot zeigt, was K-Means "gelernt" hat; man sieht die Zuweisung von Cluster zu Zahl sofort
fig, axes = plt.subplots(2, 5, figsize=(10, 4)) fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for i, ax in enumerate(axes.flat): for i, ax in enumerate(axes.flat):
ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r') ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r')
ax.set_title(f'Cluster {i}') ax.set_title(f'Cluster {i}')
ax.axis('off') ax.axis('off')
fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight') fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight')
"""
Takaway:
- Hier ist k-means nicht der richtige algorithmus, weil die Daten nicht schön kugelförmig verteilt sind und sich nicht gut clustern lassen.
"""
Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 94 KiB

After

Width:  |  Height:  |  Size: 96 KiB

+4 -3
View File
@@ -30,7 +30,7 @@ kmeans = KMeans(n_clusters=3)
# fit auf daten # fit auf daten
kmeans.fit(iris.data) kmeans.fit(iris.data)
# gegenüberstellung gold standard vs prediction # Gegenüberstellung gold standard vs prediction
print("gold standard vs. prediction") print("gold standard vs. prediction")
for target_label, predicted_label in zip(iris.target, kmeans.labels_): for target_label, predicted_label in zip(iris.target, kmeans.labels_):
print(f"{target_label} -> {predicted_label}") print(f"{target_label} -> {predicted_label}")
@@ -41,12 +41,13 @@ print(metrics.completeness_score(iris.target, kmeans.labels_))
print(metrics.adjusted_rand_score(iris.target, kmeans.labels_)) print(metrics.adjusted_rand_score(iris.target, kmeans.labels_))
print(metrics.silhouette_score(iris.data, kmeans.labels_)) print(metrics.silhouette_score(iris.data, kmeans.labels_))
# plot vorbereiten # plot vorbereiten (Idee von kmeans digits)
# Transformation 4D nach 2D via Projektionsfit
pca = PCA(n_components=2) pca = PCA(n_components=2)
X2d = pca.fit_transform(iris.data) X2d = pca.fit_transform(iris.data)
centroids2d = pca.transform(kmeans.cluster_centers_) centroids2d = pca.transform(kmeans.cluster_centers_)
# plot # plotten der Punktewolke und einzeichnen der Centroiden
plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap="viridis", s=30, alpha=0.7) plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap="viridis", s=30, alpha=0.7)
plt.scatter( plt.scatter(
centroids2d[:, 0], centroids2d[:, 1], c="red", marker="X", s=200, edgecolors="black" centroids2d[:, 0], centroids2d[:, 1], c="red", marker="X", s=200, edgecolors="black"