feature: add a comparison between all algorithms for each dataset to see which performs best
This commit is contained in:
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Compare Decision Tree, Naive Bayes (supervised) and K-Means (unsupervised)
|
||||
on the Iris and Digits datasets using the same metrics.
|
||||
"""
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score
|
||||
import numpy as np
|
||||
|
||||
|
||||
def kmeans_accuracy(X, y, n_classes):
    """Cluster ``X`` with K-Means and relabel each cluster by majority vote.

    K-Means cluster ids are arbitrary, so they cannot be compared with the
    true labels directly. Each cluster is therefore renamed to the most
    frequent true label among its members, which makes the result usable
    with label-based metrics. Despite its name, this function does not
    compute the accuracy itself; the caller does.

    Args:
        X: Feature matrix passed to ``KMeans.fit``.
        y: True class labels, one per row of ``X``. Any array-like of
            non-negative integers is accepted (``np.bincount`` requires
            non-negative integers).
        n_classes: Number of clusters to fit.

    Returns:
        A ``(labels, kmeans)`` tuple. ``labels`` holds the majority-mapped
        label of every sample; ``kmeans`` is the fitted estimator, whose
        ``labels_`` attribute holds the raw cluster ids.
    """
    # Convert up front so the boolean-mask indexing below also works when
    # the caller passes a plain list instead of an ndarray.
    y = np.asarray(y)

    kmeans = KMeans(n_clusters=n_classes, init="k-means++", n_init=10, random_state=42)
    kmeans.fit(X)

    labels = np.zeros_like(kmeans.labels_)
    for cluster_id in range(n_classes):
        mask = kmeans.labels_ == cluster_id
        # An empty cluster has no members to relabel, and argmax over an
        # empty bincount would raise, so skip it.
        if mask.any():
            labels[mask] = np.bincount(y[mask]).argmax()
    return labels, kmeans
|
||||
|
||||
|
||||
def _print_metrics(title, y_true, y_pred, ari_pred, class_ids, target_names):
    """Print accuracy, adjusted Rand index and the per-class report for one model."""
    print(f"\n--- {title} ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")
    print(f"Adj. Rand: {adjusted_rand_score(y_true, ari_pred):.3f}")
    print(
        classification_report(
            y_true,
            y_pred,
            # Pin the label set: without it, a split that happens to miss a
            # class makes the inferred label count disagree with
            # target_names and classification_report raises ValueError.
            labels=class_ids,
            target_names=target_names,
            # A class that is never predicted has undefined precision;
            # report it as 0 without emitting a warning.
            zero_division=0,
        )
    )


def evaluate(name, dataset, target_names):
    """Print the same metrics for every algorithm on one dataset.

    Decision Tree and Naive Bayes are trained on 70% of the data and scored
    on the remaining 30%. K-Means is fitted and scored on the full dataset,
    with each cluster mapped to its majority true label, so its numbers are
    not a held-out estimate like the supervised ones.

    Args:
        name: Title printed in the banner above the results.
        dataset: Object exposing ``data`` (features) and ``target`` (labels).
        target_names: Class names used as row labels in the reports. Its
            length is also used as the number of K-Means clusters.

    Returns:
        None. All results are written to stdout.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f" {name}")
    print(banner)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # Sorted class ids of the whole dataset, shared by every report below.
    class_ids = np.unique(dataset.target)

    # supervised
    classifiers = [
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Naive Bayes", GaussianNB()),
    ]
    for clf_name, clf in classifiers:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        _print_metrics(clf_name, y_test, y_pred, y_pred, class_ids, target_names)

    # unsupervised (evaluated on full dataset)
    n_classes = len(target_names)
    mapped_labels, kmeans = kmeans_accuracy(dataset.data, dataset.target, n_classes)
    # The adjusted Rand index is computed on the raw cluster ids rather than
    # the mapped labels: the mapping can merge clusters and change the score.
    _print_metrics(
        "K-Means (mapped)",
        dataset.target,
        mapped_labels,
        kmeans.labels_,
        class_ids,
        target_names,
    )
|
||||
|
||||
|
||||
# Run the comparison on the Iris dataset, passing its class names as a list.
iris = datasets.load_iris()
iris_class_names = list(iris.target_names)
evaluate("IRIS", iris, iris_class_names)

# Run the comparison on the Digits dataset; each target name is converted
# to str before being handed to the report.
digits = datasets.load_digits()
digit_class_names = list(map(str, digits.target_names))
evaluate("DIGITS", digits, digit_class_names)
|
||||
Reference in New Issue
Block a user