refactor: rename folder structure to sort for algorithms

This commit is contained in:
2026-04-30 18:59:56 +02:00
parent 9a8d59290b
commit dbc2b765a7
13 changed files with 124 additions and 54 deletions
Binary file not shown.

Before

Width:  |  Height:  |  Size: 280 KiB

-24
View File
@@ -1,24 +0,0 @@
#import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics
# Load the handwritten digits dataset that ships with scikit-learn.
digits = datasets.load_digits()
features, true_labels = digits.data, digits.target

# Show the dimensions of the feature matrix (samples x pixel values per digit).
print(features.shape)

# 10 clusters, randomly chosen start centroids, a single initialisation run.
kmeans = KMeans(n_clusters=10, init='random', n_init=1)
kmeans.fit(features)
cluster_labels = kmeans.labels_

# Pair every true digit with the cluster it was assigned to.
print(list(zip(true_labels, cluster_labels)))

# The label-based scores compare the clustering with the true digits ...
for score in (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.adjusted_rand_score,
):
    print(score(true_labels, cluster_labels))
# ... whereas the silhouette score only uses the data and the assigned clusters.
print(metrics.silhouette_score(features, cluster_labels))

# Here, too, k-means is not the right algorithm: the data is not distributed
# in nicely spherical groups and therefore does not cluster well.
-28
View File
@@ -1,28 +0,0 @@
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
features, true_labels = iris.data, iris.target

# Print the gold-standard class label of every sample.
print(true_labels)

# 3 clusters, randomly chosen start centroids, 50 initialisation runs.
kmeans = KMeans(n_clusters=3, init='random', n_init=50)

# Fit the model on the data.
kmeans.fit(features)
cluster_labels = kmeans.labels_

# Side-by-side comparison, one line per sample.
print("gold standard vs. prediction")
for expected, assigned in zip(true_labels, cluster_labels):
    print(f'{expected} vs. {assigned}')

# The label-based scores compare the clustering with the true classes ...
for score in (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.adjusted_rand_score,
):
    print(score(true_labels, cluster_labels))
# ... whereas the silhouette score only uses the data and the assigned clusters.
print(metrics.silhouette_score(features, cluster_labels))

# Finding: the algorithm is not perfect for this kind of data!
Binary file not shown.

After

Width:  |  Height:  |  Size: 281 KiB

@@ -1,5 +1,9 @@
"""
Use a decision tree classifier to predict flowers based on sepal and petal length/width
Use a decision tree classifier to predict flowers based on sepal and petal features
- This is an example of a supervised ML algorithm
- it has labels on the training data
- you tell the model: this is class X during training
"""
import matplotlib.pyplot as plt
@@ -35,4 +39,4 @@ tree.plot_tree(
rounded=True,
ax=ax,
)
fig.savefig("tree.png", dpi=150, bbox_inches="tight")
fig.savefig("decisiontree_iris.png", dpi=150, bbox_inches="tight")
Binary file not shown.

After

Width:  |  Height:  |  Size: 402 KiB

+60
View File
@@ -0,0 +1,60 @@
"""
Use k-means to try to match handwritten digits and see if changing the parameters
results in better recognition.
- This is an example of an unsupervised ML algorithm
- it has no labels on the training data
- it discovers the structure on its own
- thus the cluster numbers are arbitrary and do not correspond to the class labels
"""
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
# Fixed seed: without it every run starts from different centroids, so the
# scores of different parameter sets could not be compared fairly.
RANDOM_STATE = 42

# Get the digits dataset.
digits = datasets.load_digits()

# Shape of the feature matrix: one row per sample, one column per pixel
# (each sample is reshaped to an 8x8 image for the centroid plot below).
print(digits.data.shape)

# Parameter sets tried so far:
#   KMeans(n_clusters=10, init="random", n_init=1)
#   KMeans(n_clusters=10)
kmeans = KMeans(
    n_clusters=10, init="k-means++", n_init=10, random_state=RANDOM_STATE
)
kmeans.fit(digits.data)

# Pair every true digit with the cluster it was assigned to. Cluster ids are
# arbitrary, so equal numbers do not mean a correct match.
print(list(zip(digits.target, kmeans.labels_)))

# Scores comparing the clustering with the true digits.
print(metrics.homogeneity_score(digits.target, kmeans.labels_))
print(metrics.completeness_score(digits.target, kmeans.labels_))
print(metrics.adjusted_rand_score(digits.target, kmeans.labels_))
# Score based only on the data and the assigned clusters.
print(metrics.silhouette_score(digits.data, kmeans.labels_))

# Project samples and centroids onto the first two principal components so
# the clustering can be drawn in 2D. The seed only matters if PCA selects a
# randomized solver.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X2d = pca.fit_transform(digits.data)
centroids2d = pca.transform(kmeans.cluster_centers_)

# Scatter plot of all samples, coloured by cluster, with centroids on top.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6)
plt.scatter(centroids2d[:, 0], centroids2d[:, 1], c='red', marker='X', s=200, edgecolors='black')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
plt.title('K-Means on Digits (PCA projection)')
# Pass the sample scatter explicitly so the colorbar describes the clusters,
# not the centroid markers drawn last.
plt.colorbar(scatter, label='Cluster')
plt.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight')

# Draw every cluster centre as an 8x8 image to see which "digit" it stands for.
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for i, ax in enumerate(axes.flat):
    ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r')
    ax.set_title(f'Cluster {i}')
    ax.axis('off')
fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight')

# Takeaway:
# - k-means is not the right algorithm here, because the data is not
#   distributed in nicely spherical groups and does not cluster well.
Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

+58
View File
@@ -0,0 +1,58 @@
"""
Task: use k-means clustering to find clusters in the iris dataset and compare the cluster assigned to each sample with its known class
Findings from this exercise
- the k-means algorithm is not a perfect fit for this kind of data
- probably because the clusters are geometrically not spherical enough
"""
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics
# Fixed seed: without it every run starts from different centroids, so the
# scores of different parameter sets could not be compared fairly.
RANDOM_STATE = 42

# Load the iris dataset.
iris = datasets.load_iris()

# Print the gold-standard class label of every sample.
print(iris.target)

# Observation: no matter how the parameters are chosen, the metrics are not
# very good. Parameter sets tried so far:
#   KMeans(n_clusters=3, init="random", n_init=1)
#   KMeans(n_clusters=3, init='random', n_init=50)
#   KMeans(n_clusters=3, init='k-means++', n_init=10)
kmeans = KMeans(n_clusters=3, random_state=RANDOM_STATE)

# Fit the model on the data.
kmeans.fit(iris.data)

# Side-by-side comparison of gold standard and assigned cluster. Cluster ids
# are arbitrary, so equal numbers do not mean a correct match.
print("gold standard vs. prediction")
for target_label, predicted_label in zip(iris.target, kmeans.labels_):
    print(f"{target_label} -> {predicted_label}")

# Print all relevant metrics: three scores comparing the clustering with the
# true classes, then the silhouette score based on the data alone.
print(metrics.homogeneity_score(iris.target, kmeans.labels_))
print(metrics.completeness_score(iris.target, kmeans.labels_))
print(metrics.adjusted_rand_score(iris.target, kmeans.labels_))
print(metrics.silhouette_score(iris.data, kmeans.labels_))

# Prepare the plot: project samples and centroids onto the first two
# principal components so the clustering can be drawn in 2D.
pca = PCA(n_components=2)
X2d = pca.fit_transform(iris.data)
centroids2d = pca.transform(kmeans.cluster_centers_)

# Plot the samples coloured by cluster, with the centroids on top.
scatter = plt.scatter(
    X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap="viridis", s=30, alpha=0.7
)
plt.scatter(
    centroids2d[:, 0], centroids2d[:, 1], c="red", marker="X", s=200, edgecolors="black"
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} var)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} var)")
plt.title("K-Means on Iris (PCA projection)")
# Pass the sample scatter explicitly: without a mappable, pyplot falls back to
# the most recent one, which is the centroid scatter and carries no cluster
# data, so the colorbar would not show the cluster ids.
plt.colorbar(scatter, label="Cluster")
plt.savefig("kmeans_iris.png", dpi=150)