feature: add a comparison between all algorithms for each dataset to see which performs best

2026-04-30 20:05:55 +02:00
parent a375173ec1
commit 28bb039e58
2 changed files with 172 additions and 0 deletions
@@ -0,0 +1,113 @@
 ============================================================
 IRIS
 ============================================================
 --- Decision Tree ---
 Accuracy: 1.000
 Adj. Rand: 1.000
              precision    recall  f1-score   support
      setosa       1.00      1.00      1.00        19
  versicolor       1.00      1.00      1.00        13
   virginica       1.00      1.00      1.00        13
    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
 weighted avg       1.00      1.00      1.00        45
 --- Naive Bayes ---
 Accuracy: 0.978
 Adj. Rand: 0.943
              precision    recall  f1-score   support
      setosa       1.00      1.00      1.00        19
  versicolor       1.00      0.92      0.96        13
   virginica       0.93      1.00      0.96        13
    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
 weighted avg       0.98      0.98      0.98        45
 --- K-Means (mapped) ---
 Accuracy: 0.893
 Adj. Rand: 0.730
              precision    recall  f1-score   support
      setosa       1.00      1.00      1.00        50
  versicolor       0.77      0.96      0.86        50
   virginica       0.95      0.72      0.82        50
    accuracy                           0.89       150
   macro avg       0.91      0.89      0.89       150
 weighted avg       0.91      0.89      0.89       150
 ============================================================
 DIGITS
 ============================================================
 --- Decision Tree ---
 Accuracy: 0.843
 Adj. Rand: 0.685
              precision    recall  f1-score   support
           0       0.92      0.91      0.91        53
           1       0.74      0.78      0.76        50
           2       0.83      0.74      0.79        47
           3       0.78      0.85      0.81        54
           4       0.81      0.85      0.83        60
           5       0.92      0.86      0.89        66
           6       0.93      0.94      0.93        53
           7       0.85      0.84      0.84        55
           8       0.89      0.77      0.82        43
           9       0.78      0.85      0.81        59
    accuracy                           0.84       540
   macro avg       0.85      0.84      0.84       540
 weighted avg       0.85      0.84      0.84       540
 --- Naive Bayes ---
 Accuracy: 0.852
 Adj. Rand: 0.710
              precision    recall  f1-score   support
           0       1.00      0.98      0.99        53
           1       0.86      0.74      0.80        50
           2       0.86      0.66      0.75        47
           3       0.95      0.76      0.85        54
           4       0.98      0.85      0.91        60
           5       0.94      0.94      0.94        66
           6       0.89      0.96      0.93        53
           7       0.72      0.98      0.83        55
           8       0.57      0.91      0.70        43
           9       0.89      0.71      0.79        59
    accuracy                           0.85       540
   macro avg       0.87      0.85      0.85       540
 weighted avg       0.88      0.85      0.85       540
 --- K-Means (mapped) ---
 Accuracy: 0.794
 Adj. Rand: 0.667
              precision    recall  f1-score   support
           0       0.99      0.99      0.99       178
           1       0.62      0.30      0.41       182
           2       0.84      0.84      0.84       177
           3       0.86      0.85      0.85       183
           4       0.99      0.92      0.95       181
           5       0.87      0.75      0.81       182
           6       0.97      0.98      0.98       181
           7       0.86      0.95      0.90       179
           8       0.45      0.59      0.51       174
           9       0.58      0.77      0.66       180
    accuracy                           0.79      1797
   macro avg       0.80      0.79      0.79      1797
 weighted avg       0.80      0.79      0.79      1797
@@ -0,0 +1,59 @@
 """
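 Compare Decision Tree, Naive Bayes (supervised) and K-Means (unsupervised)
 on the Iris and Digits datasets using the same metrics. Note that the
 classifiers are scored on a held-out 30% test split, whereas K-Means is
 fitted and scored on the full dataset, so the sample counts differ.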
 """
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.cluster import KMeans
 from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score
 import numpy as np
 def kmeans_accuracy(X, y, n_classes):
    """Cluster X with K-Means and relabel every cluster by majority vote.

    Despite the name, no accuracy is computed here: the function returns the
    relabelled predictions so the caller can score them with any metric.

    Args:
        X: Feature matrix passed to ``KMeans.fit``.
        y: True labels, indexed with a boolean mask and counted with
            ``np.bincount`` (so they must be non-negative integers).
        n_classes: Number of clusters to fit.

    Returns:
        A ``(mapped, model)`` tuple: ``mapped`` has the shape and dtype of
        ``model.labels_`` and holds, for each sample, the most frequent true
        label of the cluster it was assigned to; ``model`` is the fitted
        ``KMeans`` estimator.
    """
    model = KMeans(
        n_clusters=n_classes, init="k-means++", n_init=10, random_state=42
    ).fit(X)
    cluster_ids = model.labels_
    # Start from zeros so that an (unexpected) empty cluster leaves label 0.
    mapped = np.zeros_like(cluster_ids)
    for cluster in range(n_classes):
        members = cluster_ids == cluster
        if not members.any():
            continue
        # bincount().argmax() picks the most frequent true label in the cluster
        # (the smallest label wins a tie).
        mapped[members] = np.bincount(y[members]).argmax()
    return mapped, model
 def evaluate(name, dataset, target_names):
    """Print accuracy, adjusted Rand index and a per-class report per model.

    Two classifiers (Decision Tree, Gaussian Naive Bayes) are trained on 70%
    of the data and scored on the remaining 30% (``random_state=42``).
    K-Means is fitted and scored on the full dataset, with its clusters
    mapped to majority labels by ``kmeans_accuracy``.

    Args:
        name: Title printed in the banner above the results.
        dataset: Object exposing ``.data`` (features) and ``.target`` (labels).
        target_names: Sequence of display names, one per class, used as the
            row labels of the classification report.

    Returns:
        None. All results are written to stdout.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f" {name}")
    print(banner)

    # Pin the report rows to every class present in the dataset. Without an
    # explicit ``labels`` argument, classification_report infers the classes
    # from y_true/y_pred and raises ValueError when a class is absent from the
    # split, because the inferred count no longer matches ``target_names``.
    # If the counts disagree anyway, keep the library's own inference.
    class_labels = np.unique(dataset.target)
    if len(class_labels) != len(target_names):
        class_labels = None

    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # supervised
    classifiers = [
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Naive Bayes", GaussianNB()),
    ]
    for clf_name, clf in classifiers:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f"\n--- {clf_name} ---")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
        print(f"Adj. Rand: {adjusted_rand_score(y_test, y_pred):.3f}")
        print(classification_report(
            y_test, y_pred, labels=class_labels, target_names=target_names
        ))

    # unsupervised (evaluated on full dataset)
    n_classes = len(target_names)
    mapped_labels, kmeans = kmeans_accuracy(dataset.data, dataset.target, n_classes)
    print("\n--- K-Means (mapped) ---")
    print(f"Accuracy: {accuracy_score(dataset.target, mapped_labels):.3f}")
    # The Rand index is computed on the raw cluster assignments: it does not
    # depend on how clusters are named, so no label mapping is needed.
    print(f"Adj. Rand: {adjusted_rand_score(dataset.target, kmeans.labels_):.3f}")
    print(classification_report(
        dataset.target, mapped_labels, labels=class_labels, target_names=target_names
    ))
 # Script driver: these statements run at import time (there is no
 # ``__main__`` guard) and print the comparison for both datasets in order.
 # Iris: target_names is wrapped in list() before being used as report rows.
 iris = datasets.load_iris()
 evaluate("IRIS", iris, list(iris.target_names))
 # Digits: each target name is passed through str() so the report rows are
 # labelled with strings.
 digits = datasets.load_digits()
 evaluate("DIGITS", digits, [str(n) for n in digits.target_names])