diff --git a/ML/aufgaben/a1/tree.png b/ML/aufgaben/a1/tree.png deleted file mode 100644 index 98bda30..0000000 Binary files a/ML/aufgaben/a1/tree.png and /dev/null differ diff --git a/ML/aufgaben/a2/kmeans_digits.py b/ML/aufgaben/a2/kmeans_digits.py deleted file mode 100644 index ad6119a..0000000 --- a/ML/aufgaben/a2/kmeans_digits.py +++ /dev/null @@ -1,24 +0,0 @@ -#import numpy as np - -from sklearn import datasets -from sklearn.cluster import KMeans -from sklearn import metrics - -digits = datasets.load_digits() - -# 100 samples pro ziffer -# 64 pixel pro zahl -print(digits.data.shape) -#print(len(np.unique(digits.target))) - -# 10 cluster, random, n_init=1 -kmeans = KMeans(n_clusters=10, init='random', n_init=1) -kmeans.fit(digits.data) - -print(list(zip(digits.target, kmeans.labels_))) -print(metrics.homogeneity_score(digits.target, kmeans.labels_)) -print(metrics.completeness_score(digits.target, kmeans.labels_)) -print(metrics.adjusted_rand_score(digits.target, kmeans.labels_)) -print(metrics.silhouette_score(digits.data, kmeans.labels_)) - -# auch hier ist kmeans nicht der richtige algorithmus, weil die Daten nicht schön kugelförmig verteilt sind und sich nicht gut clustern lassen diff --git a/ML/aufgaben/a2/kmeans_iris.py b/ML/aufgaben/a2/kmeans_iris.py deleted file mode 100644 index 9b95a70..0000000 --- a/ML/aufgaben/a2/kmeans_iris.py +++ /dev/null @@ -1,28 +0,0 @@ -from sklearn import datasets -from sklearn.cluster import KMeans -from sklearn import metrics - -iris = datasets.load_iris() - -# print 150 samples -print(iris.target) - -# clusters=3, centroiden zufällig wählen, n_init=50 -kmeans = KMeans(n_clusters=3, init='random', n_init=50) -# fit auf daten -kmeans.fit(iris.data) - -# print alle daten -#print(list(zip(iris.target, kmeans.labels_))) - -# gegenüberstellung -print("gold standard vs. prediction") -for target_label, predicted_label in zip(iris.target, kmeans.labels_): - print(f'{target_label} vs. 
{predicted_label}') - -print(metrics.homogeneity_score(iris.target, kmeans.labels_)) -print(metrics.completeness_score(iris.target, kmeans.labels_)) -print(metrics.adjusted_rand_score(iris.target, kmeans.labels_)) -print(metrics.silhouette_score(iris.data, kmeans.labels_)) - -# erkenntnis, der Algo ist nicht perfekt für diese Art von Daten!! diff --git a/ML/aufgaben/a1/decisiontree_iris.ipynb b/ML/aufgaben/decisiontree/decisiontree_iris.ipynb similarity index 100% rename from ML/aufgaben/a1/decisiontree_iris.ipynb rename to ML/aufgaben/decisiontree/decisiontree_iris.ipynb diff --git a/ML/aufgaben/decisiontree/decisiontree_iris.png b/ML/aufgaben/decisiontree/decisiontree_iris.png new file mode 100644 index 0000000..2172774 Binary files /dev/null and b/ML/aufgaben/decisiontree/decisiontree_iris.png differ diff --git a/ML/aufgaben/a1/decisiontree_iris.py b/ML/aufgaben/decisiontree/decisiontree_iris.py similarity index 74% rename from ML/aufgaben/a1/decisiontree_iris.py rename to ML/aufgaben/decisiontree/decisiontree_iris.py index cb25aa9..d361715 100644 --- a/ML/aufgaben/a1/decisiontree_iris.py +++ b/ML/aufgaben/decisiontree/decisiontree_iris.py @@ -1,5 +1,9 @@ """ -Use a decision tree classifier to predict flowers based on sepal and petal length/width +Use a decisiontree classifier to predict flowers based on sepal and petal features + +- This is an example of a supervised ML algorithm + - it has labels on the training data + - you tell the model: this is class X during training """ import matplotlib.pyplot as plt @@ -35,4 +39,4 @@ tree.plot_tree( rounded=True, ax=ax, ) -fig.savefig("tree.png", dpi=150, bbox_inches="tight") +fig.savefig("decisiontree_iris.png", dpi=150, bbox_inches="tight") diff --git a/ML/aufgaben/a2/kmeans_digits.ipynb b/ML/aufgaben/kmeans/kmeans_digits.ipynb similarity index 100% rename from ML/aufgaben/a2/kmeans_digits.ipynb rename to ML/aufgaben/kmeans/kmeans_digits.ipynb diff --git a/ML/aufgaben/kmeans/kmeans_digits.png 
b/ML/aufgaben/kmeans/kmeans_digits.png new file mode 100644 index 0000000..5150ef4 Binary files /dev/null and b/ML/aufgaben/kmeans/kmeans_digits.png differ diff --git a/ML/aufgaben/kmeans/kmeans_digits.py b/ML/aufgaben/kmeans/kmeans_digits.py new file mode 100644 index 0000000..ed7a449 --- /dev/null +++ b/ML/aufgaben/kmeans/kmeans_digits.py @@ -0,0 +1,60 @@ +""" +Use k-means to try to match handwritten digits and see if changing the parameters +results in better recognition. + +- This is an example of an unsupervised ML algorithm + - it has no labels on the training data + - it discovers the structure on its own + - thus the cluster numbers are arbitrary and do not correspond to the class labels +""" + +import matplotlib.pyplot as plt + +from sklearn import datasets +from sklearn.cluster import KMeans +from sklearn import metrics +from sklearn.decomposition import PCA + +# get the digits dataset +digits = datasets.load_digits() + +# ca. 180 samples pro ziffer (1797 insgesamt) +# 64 pixel pro zahl +print(digits.data.shape) + +# ausprobieren verschiedener parameter +# kmeans = KMeans(n_clusters=10, init="random", n_init=1) +# kmeans = KMeans(n_clusters=10) +kmeans = KMeans(n_clusters=10, init="k-means++", n_init=10) +kmeans.fit(digits.data) + +print(list(zip(digits.target, kmeans.labels_))) +print(metrics.homogeneity_score(digits.target, kmeans.labels_)) +print(metrics.completeness_score(digits.target, kmeans.labels_)) +print(metrics.adjusted_rand_score(digits.target, kmeans.labels_)) +print(metrics.silhouette_score(digits.data, kmeans.labels_)) + +pca = PCA(n_components=2) +X2d = pca.fit_transform(digits.data) +centroids2d = pca.transform(kmeans.cluster_centers_) + +plt.figure(figsize=(10, 8)) +scatter = plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap='tab10', s=10, alpha=0.6) +plt.scatter(centroids2d[:, 0], centroids2d[:, 1], c='red', marker='X', s=200, edgecolors='black') +plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)') +plt.ylabel(f'PC2 
({pca.explained_variance_ratio_[1]:.1%} var)') +plt.title('K-Means on Digits (PCA projection)') +plt.colorbar(scatter, label='Cluster') +plt.savefig('kmeans_digits.png', dpi=150, bbox_inches='tight') + +fig, axes = plt.subplots(2, 5, figsize=(10, 4)) +for i, ax in enumerate(axes.flat): + ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='gray_r') + ax.set_title(f'Cluster {i}') + ax.axis('off') +fig.savefig('kmeans_digits_centroids.png', dpi=150, bbox_inches='tight') + +""" +Takeaway: +- Hier ist k-means nicht der richtige algorithmus, weil die Daten nicht schön kugelförmig verteilt sind und sich nicht gut clustern lassen. +""" diff --git a/ML/aufgaben/kmeans/kmeans_digits_centroids.png b/ML/aufgaben/kmeans/kmeans_digits_centroids.png new file mode 100644 index 0000000..3e6cf27 Binary files /dev/null and b/ML/aufgaben/kmeans/kmeans_digits_centroids.png differ diff --git a/ML/aufgaben/a2/kmeans_iris.ipynb b/ML/aufgaben/kmeans/kmeans_iris.ipynb similarity index 100% rename from ML/aufgaben/a2/kmeans_iris.ipynb rename to ML/aufgaben/kmeans/kmeans_iris.ipynb diff --git a/ML/aufgaben/kmeans/kmeans_iris.png b/ML/aufgaben/kmeans/kmeans_iris.png new file mode 100644 index 0000000..7c952f8 Binary files /dev/null and b/ML/aufgaben/kmeans/kmeans_iris.png differ diff --git a/ML/aufgaben/kmeans/kmeans_iris.py b/ML/aufgaben/kmeans/kmeans_iris.py new file mode 100644 index 0000000..bc7bade --- /dev/null +++ b/ML/aufgaben/kmeans/kmeans_iris.py @@ -0,0 +1,58 @@ +""" +Aufgabe: k-means classifier verwenden um cluster im iris datenset zu finden und aufgrund von features Klassen von Samples predicten + +Erkenntnis aus dieser Aufgabe + - der k-means Algorithmus ist nicht perfekt für diese Art von Daten geeignet + - wahrscheinlich weil die Cluster geometrisch zu wenig kugelförmig sind +""" + +import matplotlib.pyplot as plt + +from sklearn.decomposition import PCA +from sklearn import datasets +from sklearn.cluster import KMeans +from sklearn import metrics + +# iris datenset laden 
+iris = datasets.load_iris() + +# print 150 samples +print(iris.target) + +""" +Egal wie die parameter gewählt werden, die metriken sind nicht sehr gut. +""" +# kmeans = KMeans(n_clusters=3, init="random", n_init=1) +# kmeans = KMeans(n_clusters=3, init='random', n_init=50) +# kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10) +kmeans = KMeans(n_clusters=3) + +# fit auf daten +kmeans.fit(iris.data) + +# gegenüberstellung gold standard vs prediction +print("gold standard vs. prediction") +for target_label, predicted_label in zip(iris.target, kmeans.labels_): + print(f"{target_label} -> {predicted_label}") + +# ausgeben aller relevanten metriken +print(metrics.homogeneity_score(iris.target, kmeans.labels_)) +print(metrics.completeness_score(iris.target, kmeans.labels_)) +print(metrics.adjusted_rand_score(iris.target, kmeans.labels_)) +print(metrics.silhouette_score(iris.data, kmeans.labels_)) + +# plot vorbereiten +pca = PCA(n_components=2) +X2d = pca.fit_transform(iris.data) +centroids2d = pca.transform(kmeans.cluster_centers_) + +# plot +plt.scatter(X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap="viridis", s=30, alpha=0.7) +plt.scatter( + centroids2d[:, 0], centroids2d[:, 1], c="red", marker="X", s=200, edgecolors="black" +) +plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} var)") +plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} var)") +plt.title("K-Means on Iris (PCA projection)") +plt.colorbar(label="Cluster") +plt.savefig("kmeans_iris.png", dpi=150)