refactor: rename folder structure to sort for algorithms

2026-04-30 18:59:56 +02:00
parent 9a8d59290b
commit dbc2b765a7
13 changed files with 124 additions and 54 deletions
@@ -0,0 +1,60 @@
+"""
+Use k-means to try to match handwritten digits and see if changing the parameters
+results in better recognition.
+
+- This is an example of an unsupervised ML algorithm
+    - it has no labels on the training data
+    - it discovers the structure on its own
+    - thus the cluster numbers are arbitrary and do not correspond to the class labels
+"""
+
+import matplotlib.pyplot as plt
+
+from sklearn import datasets
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from sklearn.decomposition import PCA
+
+# get the digits dataset
+digits = datasets.load_digits()
+
+# roughly 180 samples per digit (1797 samples in total)
+# 64 pixels (one flattened 8x8 image) per sample
+print(digits.data.shape)
+
+# try out different parameter settings
+# kmeans = KMeans(n_clusters=10, init="random", n_init=1)
+# kmeans = KMeans(n_clusters=10)
+kmeans = KMeans(n_clusters=10, init="k-means++", n_init=10)
+kmeans.fit(digits.data)
+
+print(list(zip(digits.target, kmeans.labels_)))
+print(metrics.homogeneity_score(digits.target, kmeans.labels_))
+print(metrics.completeness_score(digits.target, kmeans.labels_))
+print(metrics.adjusted_rand_score(digits.target, kmeans.labels_))
+print(metrics.silhouette_score(digits.data, kmeans.labels_))
+
+pca = PCA(n_components=2)
+X2d = pca.fit_transform(digits.data)
+centroids2d = pca.transform(kmeans.cluster_centers_)
+
+plt.figure(figsize=(10, 8))
+scatter = plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6)
+plt.scatter(centroids2d[:, 0], centroids2d[:, 1], c='red', marker='X', s=200, edgecolors='black')
+plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
+plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
+plt.title('K-Means on Digits (PCA projection)')
+plt.colorbar(scatter, label='Cluster')
+plt.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight')
+
+fig, axes = plt.subplots(2, 5, figsize=(10, 4))
+for i, ax in enumerate(axes.flat):
+    ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r')
+    ax.set_title(f'Cluster {i}')
+    ax.axis('off')
+fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight')
+
+"""
+Takeaway:
+- k-means is not the right algorithm here, because the data is not distributed in nicely spherical clusters and therefore does not cluster well.
+"""