refactor: rename folder structure to sort for algorithms
This commit is contained in:
Binary file not shown.
|
Before Width: | Height: | Size: 280 KiB |
@@ -1,24 +0,0 @@
|
|||||||
#import numpy as np
|
|
||||||
|
|
||||||
from sklearn import datasets
|
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
from sklearn import metrics
|
|
||||||
|
|
||||||
digits = datasets.load_digits()
|
|
||||||
|
|
||||||
# 100 samples pro ziffer
|
|
||||||
# 64 pixel pro zahl
|
|
||||||
print(digits.data.shape)
|
|
||||||
#print(len(np.unique(digits.target)))
|
|
||||||
|
|
||||||
# 10 cluster, random, n_init=1
|
|
||||||
kmeans = KMeans(n_clusters=10, init='random', n_init=1)
|
|
||||||
kmeans.fit(digits.data)
|
|
||||||
|
|
||||||
print(list(zip(digits.target, kmeans.labels_)))
|
|
||||||
print(metrics.homogeneity_score(digits.target, kmeans.labels_))
|
|
||||||
print(metrics.completeness_score(digits.target, kmeans.labels_))
|
|
||||||
print(metrics.adjusted_rand_score(digits.target, kmeans.labels_))
|
|
||||||
print(metrics.silhouette_score(digits.data, kmeans.labels_))
|
|
||||||
|
|
||||||
# auch hier ist kmeans nicht der richtige algorithmus, weil die Daten nicht schön kugelförmig verteilt sind und sich nicht gut clustern lassen
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
from sklearn import datasets
|
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
from sklearn import metrics
|
|
||||||
|
|
||||||
iris = datasets.load_iris()
|
|
||||||
|
|
||||||
# print 150 samples
|
|
||||||
print(iris.target)
|
|
||||||
|
|
||||||
# clusters=3, centroiden zufällig wählen, n_init=50
|
|
||||||
kmeans = KMeans(n_clusters=3, init='random', n_init=50)
|
|
||||||
# fit auf daten
|
|
||||||
kmeans.fit(iris.data)
|
|
||||||
|
|
||||||
# print alle daten
|
|
||||||
#print(list(zip(iris.target, kmeans.labels_)))
|
|
||||||
|
|
||||||
# gegenüberstellung
|
|
||||||
print("gold standard vs. prediction")
|
|
||||||
for target_label, predicted_label in zip(iris.target, kmeans.labels_):
|
|
||||||
print(f'{target_label} vs. {predicted_label}')
|
|
||||||
|
|
||||||
print(metrics.homogeneity_score(iris.target, kmeans.labels_))
|
|
||||||
print(metrics.completeness_score(iris.target, kmeans.labels_))
|
|
||||||
print(metrics.adjusted_rand_score(iris.target, kmeans.labels_))
|
|
||||||
print(metrics.silhouette_score(iris.data, kmeans.labels_))
|
|
||||||
|
|
||||||
# erkenntnis, der Algo ist nicht perfekt für diese Art von Daten!!
|
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 281 KiB |
@@ -1,5 +1,9 @@
|
|||||||
"""
|
"""
|
||||||
Use a decision tree classifier to predict flowers based on sepal and petal length/width
|
Use a decision tree classifier to predict flowers based on sepal and petal features
|
||||||
|
|
||||||
|
- This is an example of a supervised ML algorithm
|
||||||
|
- it has labels on the training data
|
||||||
|
- you tell the model: this is class X during training
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
@@ -35,4 +39,4 @@ tree.plot_tree(
|
|||||||
rounded=True,
|
rounded=True,
|
||||||
ax=ax,
|
ax=ax,
|
||||||
)
|
)
|
||||||
fig.savefig("tree.png", dpi=150, bbox_inches="tight")
|
fig.savefig("decisiontree_iris.png", dpi=150, bbox_inches="tight")
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 402 KiB |
@@ -0,0 +1,60 @@
|
|||||||
|
"""
|
||||||
|
Use k-means to try to match handwritten digits and see if changing the parameters
|
||||||
|
results in better recognition.
|
||||||
|
|
||||||
|
- This is an example of an unsupervised ML algorithm
|
||||||
|
- it has no labels on the training data
|
||||||
|
- it discovers the structure on its own
|
||||||
|
- thus the cluster numbers are arbitrary and do not correspond to the class labels
|
||||||
|
"""
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
from sklearn import datasets
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
from sklearn import metrics
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
|
||||||
|
# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# The printed shape is (number of samples, number of pixel values per image).
# NOTE(review): the earlier comment claimed 100 samples per digit; the
# scikit-learn docs describe roughly 180 per class (1797 in total) - confirm.
print(digits.data.shape)

# Parameter variants that were tried before settling on the one below:
#   KMeans(n_clusters=10, init="random", n_init=1)
#   KMeans(n_clusters=10)
kmeans = KMeans(n_clusters=10, init="k-means++", n_init=10).fit(digits.data)

true_labels = digits.target
cluster_ids = kmeans.labels_

# Side-by-side dump of (true digit, assigned cluster) for every sample.
print(list(zip(true_labels, cluster_ids)))

# Scores that compare the clustering against the known digit labels.
for supervised_score in (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.adjusted_rand_score,
):
    print(supervised_score(true_labels, cluster_ids))

# The silhouette score is computed from the feature vectors, not the labels.
print(metrics.silhouette_score(digits.data, cluster_ids))
|
||||||
|
|
||||||
|
# Reduce samples and cluster centres to two principal components for plotting.
pca = PCA(n_components=2)
X2d = pca.fit_transform(digits.data)
centroids2d = pca.transform(kmeans.cluster_centers_)

sample_x, sample_y = X2d[:, 0], X2d[:, 1]
centre_x, centre_y = centroids2d[:, 0], centroids2d[:, 1]

# Figure 1: every sample coloured by its cluster, centroids marked with a red X.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    sample_x,
    sample_y,
    c=kmeans.labels_,
    cmap="tab10",
    s=10,
    alpha=0.6,
)
plt.scatter(
    centre_x,
    centre_y,
    c="red",
    marker="X",
    s=200,
    edgecolors="black",
)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
plt.title("K-Means on Digits (PCA projection)")
# The colorbar is tied explicitly to the sample scatter, not the centroid layer.
plt.colorbar(scatter, label="Cluster")
plt.savefig("kmeans_digits.png", dpi=150, bbox_inches="tight")

# Figure 2: each cluster centre rendered as an 8x8 image on a 2x5 grid.
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for cluster_index, panel in enumerate(axes.flat):
    centre_image = kmeans.cluster_centers_[cluster_index].reshape(8, 8)
    panel.imshow(centre_image, cmap="gray_r")
    panel.set_title(f"Cluster {cluster_index}")
    panel.axis("off")
fig.savefig("kmeans_digits_centroids.png", dpi=150, bbox_inches="tight")
|
||||||
|
|
||||||
|
"""
|
||||||
|
Takeaway:
|
||||||
|
- Hier ist k-means nicht der richtige algorithmus, weil die Daten nicht schön kugelförmig verteilt sind und sich nicht gut clustern lassen.
|
||||||
|
"""
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 16 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 94 KiB |
@@ -0,0 +1,58 @@
|
|||||||
|
"""
|
||||||
|
Aufgabe: k-means (ein Clustering-Algorithmus, kein Classifier) verwenden, um Cluster im Iris-Datenset zu finden und die Cluster-Zuordnung der Samples mit ihren tatsächlichen Klassen zu vergleichen
|
||||||
|
|
||||||
|
Erkenntnis aus dieser Aufgabe
|
||||||
|
- der k-means Algorithmus ist nicht perfekt für diese Art von Daten geeignet
|
||||||
|
- wahrscheinlich weil die Cluster geometrisch zu wenig kugelförmig sind
|
||||||
|
"""
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
from sklearn import datasets
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
from sklearn import metrics
|
||||||
|
|
||||||
|
# Load the iris dataset.
iris = datasets.load_iris()

# Show the gold-standard class label of every sample.
print(iris.target)

# Observation: whichever of these parameter sets is used, the metrics
# printed further down are not very good.
#   KMeans(n_clusters=3, init="random", n_init=1)
#   KMeans(n_clusters=3, init='random', n_init=50)
#   KMeans(n_clusters=3, init='k-means++', n_init=10)
kmeans = KMeans(n_clusters=3)

# Fit the model on the feature matrix.
kmeans.fit(iris.data)

gold_labels = iris.target
cluster_ids = kmeans.labels_

# Line-by-line comparison of gold standard and assigned cluster.
print("gold standard vs. prediction")
for gold, assigned in zip(gold_labels, cluster_ids):
    print(f"{gold} -> {assigned}")

# Scores that compare the clustering against the gold-standard labels.
for supervised_score in (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.adjusted_rand_score,
):
    print(supervised_score(gold_labels, cluster_ids))

# The silhouette score is computed from the feature vectors, not the labels.
print(metrics.silhouette_score(iris.data, cluster_ids))
|
||||||
|
|
||||||
|
# Prepare the plot: project samples and cluster centres onto the first two
# principal components so they can be drawn in 2D.
pca = PCA(n_components=2)
X2d = pca.fit_transform(iris.data)
centroids2d = pca.transform(kmeans.cluster_centers_)

# Plot the samples coloured by assigned cluster. The handle is kept so the
# colorbar below can be bound to this layer explicitly.
scatter = plt.scatter(
    X2d[:, 0], X2d[:, 1], c=kmeans.labels_, cmap="viridis", s=30, alpha=0.7
)
# Overlay the cluster centres as red X markers.
plt.scatter(
    centroids2d[:, 0], centroids2d[:, 1], c="red", marker="X", s=200, edgecolors="black"
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} var)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} var)")
plt.title("K-Means on Iris (PCA projection)")
# Without an explicit mappable, pyplot.colorbar() falls back to the most
# recently created one - here the uniformly red centroid markers - so the bar
# would not reflect the cluster colouring. Pass the sample scatter instead.
plt.colorbar(scatter, label="Cluster")
plt.savefig("kmeans_iris.png", dpi=150)
|
||||||
Reference in New Issue
Block a user