"""Workshop 4: kNN hyperparameter search on bank_data_prep.csv.

Standardizes the features, searches for the best values of ``n_neighbors``
and ``p`` (Minkowski distance), and compares the accuracy with vs. without
standardization.
"""
from itertools import product
from typing import Iterable

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

RAW = "data/bank_data_prep.csv"
SEED = 1234

# Default hyperparameter grid, shared by the manual search and GridSearchCV so
# that both variants always explore the same candidates.
K_VALUES = range(1, 11)
P_VALUES = (1, 2, 3)


def load_split(path: str = RAW):
    """Load the CSV and split it into train and test sets.

    Corresponds to steps 1-3 of the slides: load, X/y split, train/test split.

    Args:
        path: Path of the CSV file to read. It must contain a column ``"y"``,
            which is used as the target; all other columns become features.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``. Two thirds of the
        rows go to the training set; the split is seeded with ``SEED``.
    """
    df = pd.read_csv(path)
    X = df.drop("y", axis=1)
    y = df["y"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=2 / 3, random_state=SEED
    )
    return X_train, X_test, y_train, y_test


def scale(X_train, X_test):
    """Standardize the features.

    The scaler is fitted on the training data only and then applied to both
    sets, so no statistics of the test set enter the transformation.

    Args:
        X_train: Training features; used to fit the scaler.
        X_test: Test features; only transformed.

    Returns:
        The tuple ``(X_train_scaled, X_test_scaled)``. The scaler is
        configured with ``set_output(transform="pandas")``, so both are
        DataFrames.
    """
    scaler = StandardScaler().set_output(transform="pandas")
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)


def search_manual(
    X_train,
    X_test,
    y_train,
    y_test,
    *,
    k_values: Iterable[int] = K_VALUES,
    p_values: Iterable[int] = P_VALUES,
):
    """Search the grid ``n_neighbors`` x ``p`` and score each model on test.

    This is the method from the slides. Because the winner is picked by its
    test accuracy, the reported best accuracy is an optimistic estimate; use
    ``search_grid`` for a selection that does not touch the test set.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        k_values: Candidate values for ``n_neighbors``.
        p_values: Candidate values for the Minkowski exponent ``p``.

    Returns:
        ``(results, best)`` where ``results`` is a list of ``(k, p, acc)``
        tuples in grid order (``k`` outer, ``p`` inner) and ``best`` is the
        tuple with the highest accuracy (the first one on ties).

    Raises:
        ValueError: If ``k_values`` or ``p_values`` is empty.
    """
    # product() materializes both iterables and yields k-major order, which
    # matches a nested ``for k ... for p ...`` loop.
    combos = list(product(k_values, p_values))
    if not combos:
        raise ValueError(
            "k_values and p_values must each contain at least one value"
        )

    results = []
    for k, p in combos:
        model = KNeighborsClassifier(n_neighbors=k, p=p)
        model.fit(X_train, y_train)
        results.append((k, p, model.score(X_test, y_test)))

    best = max(results, key=lambda r: r[2])
    print(f"Bestes Ergebnis: k={best[0]}, p={best[1]}, acc={best[2]:.4f}")
    return results, best


def search_grid(
    X_train,
    y_train,
    *,
    k_values: Iterable[int] = K_VALUES,
    p_values: Iterable[int] = P_VALUES,
) -> GridSearchCV:
    """Search hyperparameters with GridSearchCV.

    Model selection uses 5-fold cross-validation on the training data, so the
    test set is never consulted (no test leakage).

    Args:
        X_train: Training features.
        y_train: Training labels.
        k_values: Candidate values for ``n_neighbors``.
        p_values: Candidate values for the Minkowski exponent ``p``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    param_grid = {
        "n_neighbors": list(k_values),
        "p": list(p_values),
    }
    grid = GridSearchCV(
        KNeighborsClassifier(),
        param_grid,
        cv=5,  # 5-fold cross-validation
        scoring="accuracy",
    )
    grid.fit(X_train, y_train)  # train ONLY - the test set is not touched
    print(f"Beste Params: {grid.best_params_}, CV-Score: {grid.best_score_:.4f}")
    return grid


def main() -> None:
    """Run the three variants and print a comparison."""
    X_train, X_test, y_train, y_test = load_split()

    # --- Variant A: WITHOUT standardization ---
    print("=== Ohne Standardisieren ===")
    results_raw, best_raw = search_manual(X_train, X_test, y_train, y_test)

    # --- Variant B: WITH standardization ---
    print("=== Mit Standardisieren ===")
    X_train_sc, X_test_sc = scale(X_train, X_test)  # scale the features
    results_sc, best_sc = search_manual(X_train_sc, X_test_sc, y_train, y_test)

    # --- Variant C: GridSearch ---
    # NOTE(review): the scaler was fitted on the full training set before the
    # cross-validation, so the CV folds share scaling statistics. Putting the
    # scaler into a Pipeline inside GridSearchCV would avoid that.
    grid = search_grid(X_train_sc, y_train)  # scaled training data in
    # score()/predict() use the best estimator, which GridSearchCV refits on
    # the whole training set by default (refit=True).
    final_acc = grid.score(X_test_sc, y_test)  # measure on scaled test data
    k_grid = grid.best_params_['n_neighbors']
    p_grid = grid.best_params_['p']
    cv_grid = grid.best_score_

    # --- Comparison ---
    print(f"\nOhne Skalierung: k={best_raw[0]}, p={best_raw[1]}, acc={best_raw[2]:.4f}")
    print(f"Mit Skalierung: k={best_sc[0]}, p={best_sc[1]}, acc={best_sc[2]:.4f}")
    print(f"Mit GridSearch: k={k_grid}, p={p_grid}, CV-acc={cv_grid:.4f}, test-acc={final_acc:.4f}")

    y_pred = grid.predict(X_test_sc)  # still scaled
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


if __name__ == "__main__":
    main()