refactor: update kmeans examples with better plots
This commit is contained in:
Binary file not shown.
|
Before Width: | Height: | Size: 402 KiB After Width: | Height: | Size: 403 KiB |
@@ -3,16 +3,20 @@ Use k-means to try to match handwritten digits and see if changing the parameter
|
|||||||
results in better recognition.
|
results in better recognition.
|
||||||
|
|
||||||
- This is an example of an unsupervised ML algorithm
|
- This is an example of an unsupervised ML algorithm
|
||||||
- it has no labels on the training data
|
- it has no labels in the training data
|
||||||
- it discovers the structure on its own
|
- it discovers the structure on its own
|
||||||
- thus the cluster numbers are arbitrary and do not correspond to the class labels
|
- thus the cluster numbers are arbitrary and do not correspond to the class labels
|
||||||
|
|
||||||
|
Takaway:
|
||||||
|
- Hier ist k-means nicht der beste algorithmus, weil die Daten nicht in schön kugelförmig
|
||||||
|
verteilten Clustern angeordnet sind und k-means Mühe hat die Centroiden sauber zu bestimmen.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
from sklearn import datasets
|
from sklearn import datasets
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
from sklearn import metrics
|
from sklearn import metrics
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
from sklearn.decomposition import PCA
|
from sklearn.decomposition import PCA
|
||||||
|
|
||||||
# get the digits dataset
|
# get the digits dataset
|
||||||
@@ -34,10 +38,18 @@ print(metrics.completeness_score(digits.target, kmeans.labels_))
|
|||||||
print(metrics.adjusted_rand_score(digits.target, kmeans.labels_))
|
print(metrics.adjusted_rand_score(digits.target, kmeans.labels_))
|
||||||
print(metrics.silhouette_score(digits.data, kmeans.labels_))
|
print(metrics.silhouette_score(digits.data, kmeans.labels_))
|
||||||
|
|
||||||
|
# Wikipedia:
|
||||||
|
# PCA (Principal Component Analysis) finds the directions in your data with
|
||||||
|
# the most variance and projects everything onto those axes.
|
||||||
|
#
|
||||||
|
# Irgendwas mit Eigenvektoren und Kovarianz Matrix, TODO: Anschauen
|
||||||
|
# Transformiert 64 dimensionalen Vektor möglichst gut in eine 2D Projektion
|
||||||
pca = PCA(n_components=2)
|
pca = PCA(n_components=2)
|
||||||
X2d = pca.fit_transform(digits.data)
|
X2d = pca.fit_transform(digits.data)
|
||||||
centroids2d = pca.transform(kmeans.cluster_centers_)
|
centroids2d = pca.transform(kmeans.cluster_centers_)
|
||||||
|
|
||||||
|
# Punktewolke plotten und Centroiden einzeichnen, tab10 gibt 10 versch. Farben für die Legende
|
||||||
|
# Hier sieht man grosse überlappung zwischen den Clustern -> ein Hinweis, das K-Means nicht optimal ist?
|
||||||
plt.figure(figsize=(10, 8))
|
plt.figure(figsize=(10, 8))
|
||||||
scatter = plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6)
|
scatter = plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6)
|
||||||
plt.scatter(centroids2d[:, 0], centroids2d[:, 1], c='red', marker='X', s=200, edgecolors='black')
|
plt.scatter(centroids2d[:, 0], centroids2d[:, 1], c='red', marker='X', s=200, edgecolors='black')
|
||||||
@@ -47,14 +59,11 @@ plt.title('K-Means on Digits (PCA projection)')
|
|||||||
plt.colorbar(scatter, label='Cluster')
|
plt.colorbar(scatter, label='Cluster')
|
||||||
plt.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight')
|
plt.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight')
|
||||||
|
|
||||||
|
# Centroiden als 8x8 Bild darstellen, indem man das "durchschnittliche zeichen" um das Zentrum plottet
|
||||||
|
# Dieser plot zeigt was die K-Means "gelernt" hat, man sieht die Zuweisung von Cluster zu Zahl sofort
|
||||||
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
|
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
|
||||||
for i, ax in enumerate(axes.flat):
|
for i, ax in enumerate(axes.flat):
|
||||||
ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r')
|
ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r')
|
||||||
ax.set_title(f'Cluster {i}')
|
ax.set_title(f'Cluster {i}')
|
||||||
ax.axis('off')
|
ax.axis('off')
|
||||||
fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight')
|
fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight')
|
||||||
|
|
||||||
"""
|
|
||||||
Takaway:
|
|
||||||
- Hier ist k-means nicht der richtige algorithmus, weil die Daten nicht schön kugelförmig verteilt sind und sich nicht gut clustern lassen.
|
|
||||||
"""
|
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 94 KiB After Width: | Height: | Size: 96 KiB |
@@ -30,7 +30,7 @@ kmeans = KMeans(n_clusters=3)
|
|||||||
# fit auf daten
|
# fit auf daten
|
||||||
kmeans.fit(iris.data)
|
kmeans.fit(iris.data)
|
||||||
|
|
||||||
# gegenüberstellung gold standard vs prediction
|
# Gegenüberstellung gold standard vs prediction
|
||||||
print("gold standard vs. prediction")
|
print("gold standard vs. prediction")
|
||||||
for target_label, predicted_label in zip(iris.target, kmeans.labels_):
|
for target_label, predicted_label in zip(iris.target, kmeans.labels_):
|
||||||
print(f"{target_label} -> {predicted_label}")
|
print(f"{target_label} -> {predicted_label}")
|
||||||
@@ -41,12 +41,13 @@ print(metrics.completeness_score(iris.target, kmeans.labels_))
|
|||||||
print(metrics.adjusted_rand_score(iris.target, kmeans.labels_))
|
print(metrics.adjusted_rand_score(iris.target, kmeans.labels_))
|
||||||
print(metrics.silhouette_score(iris.data, kmeans.labels_))
|
print(metrics.silhouette_score(iris.data, kmeans.labels_))
|
||||||
|
|
||||||
# plot vorbereiten
|
# plot vorbereiten (Idee von kmeans digits)
|
||||||
|
# Transformation 4D nach 2D via Projektionsfit
|
||||||
pca = PCA(n_components=2)
|
pca = PCA(n_components=2)
|
||||||
X2d = pca.fit_transform(iris.data)
|
X2d = pca.fit_transform(iris.data)
|
||||||
centroids2d = pca.transform(kmeans.cluster_centers_)
|
centroids2d = pca.transform(kmeans.cluster_centers_)
|
||||||
|
|
||||||
# plot
|
# plotten der Punktewolke und einzeichnen der Centroiden
|
||||||
plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap="viridis", s=30, alpha=0.7)
|
plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap="viridis", s=30, alpha=0.7)
|
||||||
plt.scatter(
|
plt.scatter(
|
||||||
centroids2d[:, 0], centroids2d[:, 1], c="red", marker="X", s=200, edgecolors="black"
|
centroids2d[:, 0], centroids2d[:, 1], c="red", marker="X", s=200, edgecolors="black"
|
||||||
|
|||||||
Reference in New Issue
Block a user