""" Compare Decision Tree, Naive Bayes (supervised) and K-Means (unsupervised) on the Iris and Digits datasets using the same metrics. """ from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB from sklearn.cluster import KMeans from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score import numpy as np def kmeans_accuracy(X, y, n_classes): """ Map each cluster to its majority true label, then compute accuracy. This function handles the cluster→label mapping via majority vote. Each cluster gets assigned the most common true label in it. """ kmeans = KMeans(n_clusters=n_classes, init="k-means++", n_init=10, random_state=42) kmeans.fit(X) labels = np.zeros_like(kmeans.labels_) for i in range(n_classes): mask = kmeans.labels_ == i if mask.sum() > 0: labels[mask] = np.bincount(y[mask]).argmax() return labels, kmeans def evaluate(name, dataset, target_names): """ Evaluate unsupervised and supervised ML algorithms on the same dataset, split with the train_test_split function. Use the classification_report to evaluate the function. """ print(f"\n{'=' * 60}") print(f" {name}") print(f"{'=' * 60}") X_train, X_test, y_train, y_test = train_test_split( dataset.data, dataset.target, test_size=0.3, random_state=42 ) # supervised for clf_name, clf in [ ("Decision Tree", DecisionTreeClassifier(random_state=42)), ("Naive Bayes", GaussianNB()), ]: clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print(f"\n--- {clf_name} ---") print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}") print(f"Adj. Rand: {adjusted_rand_score(y_test, y_pred):.3f}") print(classification_report(y_test, y_pred, target_names=target_names)) # unsupervised (evaluated on full dataset) n_classes = len(target_names) mapped_labels, kmeans = kmeans_accuracy(dataset.data, dataset.target, n_classes) print(f"\n--- K-Means (mapped) ---") print(f"Accuracy: {accuracy_score(dataset.target, mapped_labels):.3f}") print(f"Adj. Rand: {adjusted_rand_score(dataset.target, kmeans.labels_):.3f}") print( classification_report(dataset.target, mapped_labels, target_names=target_names) ) iris = datasets.load_iris() evaluate("IRIS", iris, list(iris.target_names)) digits = datasets.load_digits() evaluate("DIGITS", digits, [str(n) for n in digits.target_names])