Files
cas-pml/ML/aufgaben/comparison/compare_ml_algorihms.py
T

97 lines
3.6 KiB
Python

"""
Compare different ML algorithms against iris & digits dataset.
Supervised:
- Decision Tree
- Naive Bayes (GaussianNB)
- Random Forest
Unsupervised:
- K-Means
Use metrics (classification_report) to try to evaluate the algorithms.
"""
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score
import numpy as np
def kmeans_true_labels(X, y, n_classes):
    """
    Cluster X with K-Means and relabel each cluster with its majority true label.

    K-Means is unsupervised, so the cluster ids it produces are arbitrary and do
    not line up with the classes of the gold standard. To make metrics such as
    accuracy computable, every cluster is assigned the true label that occurs
    most often among its members.

    Parameters:
        X: feature matrix passed to ``KMeans.fit``.
        y: true labels, one per row of X. Any array-like of non-negative
            integers (a requirement of ``np.bincount``).
        n_classes: number of clusters to fit.

    Returns:
        A tuple ``(labels, kmeans)``: the per-sample majority labels (same
        shape and dtype as ``kmeans.labels_``) and the fitted KMeans estimator.
    """
    # boolean-mask indexing below only works on arrays, so accept plain
    # lists/tuples as well; this is a no-op when y is already an ndarray
    y = np.asarray(y)

    # fit with a fixed rng seed so repeated runs produce the same clustering
    kmeans = KMeans(
        n_clusters=n_classes, init="k-means++", n_init=10, random_state=42
    )
    kmeans.fit(X)

    # output array shaped like the cluster assignments, filled in per cluster
    labels = np.zeros_like(kmeans.labels_)
    for i in range(n_classes):
        # True for every sample that K-Means put into cluster i
        mask = kmeans.labels_ == i
        # an empty cluster has no samples to relabel
        if not mask.any():
            continue
        # most frequent true label in this cluster,
        # e.g. a cluster holding digits 7, 7, 7, 3, 7 is mapped to 7
        labels[mask] = np.bincount(y[mask]).argmax()
    return labels, kmeans
def evaluate(name, dataset, target_names):
    """
    Print a metrics comparison of several ML algorithms on one dataset.

    The supervised classifiers (Decision Tree, Naive Bayes, Random Forest) are
    trained on 70% of the data and scored on the remaining 30%. K-Means is
    fitted and scored on the full dataset via kmeans_true_labels. For every
    algorithm the accuracy, the adjusted Rand index and the
    classification_report are printed.

    Parameters:
        name: title printed in the banner above the results.
        dataset: object exposing ``data`` and ``target`` attributes.
        target_names: class names shown in the classification report; its
            length is also used as the number of K-Means clusters.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {name}")
    print(f"{rule}")

    # 70/30 split with a fixed rng seed so the results are reproducible
    train_X, test_X, train_y, test_y = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # supervised learning algorithms under test
    supervised_models = (
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Naive Bayes", GaussianNB()),
        ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    )

    # train each supervised model and score it on the held-out split
    for label, model in supervised_models:
        predictions = model.fit(train_X, train_y).predict(test_X)
        print(f"\n--- {label} ---")
        accuracy = accuracy_score(test_y, predictions)
        print(f"Accuracy: {accuracy:.3f}")
        rand_index = adjusted_rand_score(test_y, predictions)
        print(f"Adj. Rand: {rand_index:.3f}")
        report = classification_report(
            test_y, predictions, target_names=target_names
        )
        print(report)

    # unsupervised part: K-Means is clustered and scored on the full dataset
    cluster_count = len(target_names)
    majority_labels, fitted_kmeans = kmeans_true_labels(
        dataset.data, dataset.target, cluster_count
    )
    print("\n--- K-Means (mapped) ---")
    accuracy = accuracy_score(dataset.target, majority_labels)
    print(f"Accuracy: {accuracy:.3f}")
    # the adjusted Rand index is computed on the raw cluster ids,
    # not on the majority-mapped labels
    rand_index = adjusted_rand_score(dataset.target, fitted_kmeans.labels_)
    print(f"Adj. Rand: {rand_index:.3f}")
    report = classification_report(
        dataset.target, majority_labels, target_names=target_names
    )
    print(report)
# Iris: run every algorithm, passing the class names on as a plain list.
iris = datasets.load_iris()
evaluate("IRIS", iris, [*iris.target_names])

# Digits: run every algorithm; the class names are converted to strings
# before being handed to the report.
digits = datasets.load_digits()
evaluate("DIGITS", digits, list(map(str, digits.target_names)))