# cas-pml/ML/aufgaben/comparison/compare_ml_algorihms.py

"""
Compare different ML algorithms against iris & digits dataset.

Supervised:
    - Decision Tree
    - Naive Bayes (GaussianNB)
Unsuprvised:
    - K-Means

Use metrics (classification_report) to try to evaluate the algorithms.
"""

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score

import numpy as np


def kmeans_true_labels(X, y, n_classes):
    """
    Cluster X with K-Means and translate the cluster ids into true class labels.

    K-Means is unsupervised, so the cluster ids it produces are arbitrary and
    do not line up with the gold-standard classes in y. To make an accuracy
    computation possible, every cluster is relabelled with the true label
    that occurs most often among its members.

    Parameters:
        X: feature matrix handed to KMeans.fit.
        y: true labels, indexed with a boolean mask and counted with
           np.bincount (so they must be non-negative integers in an array).
        n_classes: number of clusters to fit.

    Returns:
        (mapped, model): the per-sample majority labels and the fitted
        KMeans estimator.
    """
    # fixed seed so that repeated runs produce the same clustering
    model = KMeans(
        n_clusters=n_classes, init="k-means++", n_init=10, random_state=42
    ).fit(X)
    cluster_ids = model.labels_

    # zero-filled result with the same shape/dtype as the cluster assignments
    mapped = np.zeros_like(cluster_ids)
    for cluster_id in range(n_classes):
        # True for every sample K-Means assigned to this cluster
        members = cluster_ids == cluster_id
        if members.any():
            # majority vote, e.g. true labels 7, 7, 7, 3, 7 -> 7
            mapped[members] = np.bincount(y[members]).argmax()
        # clusters without members are left untouched
    return mapped, model


def _print_metrics(title, y_true, y_pred, target_names, rand_labels=None):
    """
    Print accuracy, adjusted Rand index and classification report for one model.

    Parameters:
        title: name shown in the "--- title ---" header line.
        y_true: gold-standard labels.
        y_pred: predicted labels used for accuracy and the report.
        target_names: display names of the classes, one per class.
        rand_labels: labels used for the adjusted Rand index; defaults to
            y_pred when omitted.
    """
    if rand_labels is None:
        rand_labels = y_pred

    print(f"\n--- {title} ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")
    print(f"Adj. Rand: {adjusted_rand_score(y_true, rand_labels):.3f}")
    # Pass the label set explicitly: without it classification_report raises a
    # ValueError when a class is absent from both y_true and y_pred, because
    # the number of observed classes no longer matches len(target_names).
    # Assumes the classes are encoded as 0 .. len(target_names) - 1.
    print(
        classification_report(
            y_true,
            y_pred,
            labels=list(range(len(target_names))),
            target_names=target_names,
        )
    )


def evaluate(name, dataset, target_names):
    """
    Evaluate supervised and unsupervised ML algorithms on the same dataset.

    The supervised classifiers are trained on 70% of the data and scored on
    the remaining 30% (train_test_split with a fixed seed). K-Means is fitted
    and scored on the full dataset. For every algorithm the accuracy, the
    adjusted Rand index and the classification_report are printed.

    Parameters:
        name: headline printed above the results.
        dataset: object exposing ``data`` (features) and ``target`` (labels).
        target_names: list of class display names; its length is also used
            as the number of K-Means clusters.
    """
    print(f"\n{'=' * 60}")
    print(f" {name}")
    print(f"{'=' * 60}")

    # split the dataset into train and test data and use a fixed rng seed
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # all the supervised learning algorithms to test
    algorithms = [
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Naive Bayes", GaussianNB()),
        ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]

    # do the test on the supervised learning algorithms
    for classifier_name, classifier in algorithms:
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        _print_metrics(classifier_name, y_test, y_pred, target_names)

    # do the test on the unsupervised learning algorithm (evaluated on full dataset)
    n_classes = len(target_names)
    mapped_labels, kmeans = kmeans_true_labels(dataset.data, dataset.target, n_classes)
    # accuracy/report use the majority-mapped labels; the adjusted Rand index
    # is computed on the raw cluster ids
    _print_metrics(
        "K-Means (mapped)",
        dataset.target,
        mapped_labels,
        target_names,
        rand_labels=kmeans.labels_,
    )


# run the full comparison on the iris dataset
iris = datasets.load_iris()
evaluate(name="IRIS", dataset=iris, target_names=[*iris.target_names])

# run the full comparison on the digits dataset (class names are converted to strings)
digits = datasets.load_digits()
evaluate(name="DIGITS", dataset=digits, target_names=list(map(str, digits.target_names)))