refactor: add more documentation strings

2026-05-01 10:05:40 +02:00
parent 611da9538f
commit 959e53b7b3
1 changed files with 36 additions and 17 deletions
@@ -1,6 +1,13 @@
 """
-Compare Decision Tree, Naive Bayes (supervised) and K-Means (unsupervised)
+Compare different ML algorithms against iris & digits dataset.
-on the Iris and Digits datasets using the same metrics.
+
 Supervised:
    - Decision Tree
    - Naive Bayes (GaussianNB)
 Unsuprvised:
    - K-Means
 Use metrics (classification_report) to try to evaluate the algorithms.
 """
 from sklearn import datasets
@@ -9,23 +16,33 @@ from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.cluster import KMeans
 from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score
 import numpy as np
-
+def kmeans_true_labels(X, y, n_classes):
 def kmeans_accuracy(X, y, n_classes):
    """
-    Map each cluster to its majority true label, then compute accuracy.
+    Since k-means is unsupervised it comes up with its own classes, that
-    This function handles the cluster→label mapping via majority vote.
+    do not reflect the classes from the gold standard.
    This function maps each cluster to its majority true label, to help compute accuracy.
    Each cluster gets assigned the most common true label in it.
    """
    # train classifier and do a fit, set the rng seed to a fixed value
    kmeans = KMeans(n_clusters=n_classes, init="k-means++", n_init=10, random_state=42)
    kmeans.fit(X)
    # creates an empty array the same shape as the cluster assignments
    labels = np.zeros_like(kmeans.labels_)
    # for each cluster i ...
    for i in range(n_classes):
        # boolean array, true for every sample that K-Means put in cluster i
        mask = kmeans.labels_ == i
-        if mask.sum() > 0:
+        # just skip empty clusters
-            labels[mask] = np.bincount(y[mask]).argmax()
+        if mask.sum() == 0:
            continue
        # set true label as the label most prominent in this cluster
        # e.g: if cluster 2 contains a mix of digits 7, 7, 7, 3, 7 this gives you 7
        labels[mask] = np.bincount(y[mask]).argmax()
    return labels, kmeans
@@ -38,25 +55,26 @@ def evaluate(name, dataset, target_names):
    print(f" {name}")
    print(f"{'=' * 60}")
    # split the dataset into train and test data and use a fixed rng seed
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )
-    # supervised
+    # test the supervised leanring algorithms
-    for clf_name, clf in [
+    for classifier_name, classifier in [
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Naive Bayes", GaussianNB()),
    ]:
-        clf.fit(X_train, y_train)
+        classifier.fit(X_train, y_train)
-        y_pred = clf.predict(X_test)
+        y_pred = classifier.predict(X_test)
-        print(f"\n--- {clf_name} ---")
+        print(f"\n--- {classifier_name} ---")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
        print(f"Adj. Rand: {adjusted_rand_score(y_test, y_pred):.3f}")
        print(classification_report(y_test, y_pred, target_names=target_names))
-    # unsupervised (evaluated on full dataset)
+    # test the unsupervised learning algorithm (evaluated on full dataset)
    n_classes = len(target_names)
-    mapped_labels, kmeans = kmeans_accuracy(dataset.data, dataset.target, n_classes)
+    mapped_labels, kmeans = kmeans_true_labels(dataset.data, dataset.target, n_classes)
    print(f"\n--- K-Means (mapped) ---")
    print(f"Accuracy: {accuracy_score(dataset.target, mapped_labels):.3f}")
    print(f"Adj. Rand: {adjusted_rand_score(dataset.target, kmeans.labels_):.3f}")
@@ -64,9 +82,10 @@ def evaluate(name, dataset, target_names):
        classification_report(dataset.target, mapped_labels, target_names=target_names)
    )
-
+# evaluate all ML algorithms on iris
 iris = datasets.load_iris()
 evaluate("IRIS", iris, list(iris.target_names))
 # evaluate all ML algorithms on digits
 digits = datasets.load_digits()
 evaluate("DIGITS", digits, [str(n) for n in digits.target_names])