feature: add workshop 4 solution
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
"""Workshop 4: kNN Hyperparametersuche auf bank_data_prep.csv.
|
||||
|
||||
Standardisiert die Features, sucht die besten Werte für n_neighbors und p
|
||||
(Minkowski-Distanz), und vergleicht die Accuracy mit vs. ohne Standardisieren.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.metrics import confusion_matrix, classification_report
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# Default CSV path used by load_split() when no explicit path is passed.
RAW = "data/bank_data_prep.csv"
|
||||
# Passed as random_state to train_test_split so the split is reproducible.
SEED = 1234
|
||||
|
||||
|
||||
def load_split(path: str = RAW):
    """Load the prepared CSV and split it into train and test partitions.

    The file at ``path`` is read with pandas. Column ``"y"`` becomes the
    target and every other column is kept as a feature. The rows are then
    split with ``train_size=2 / 3`` and ``random_state=SEED``.

    Args:
        path: Location of the CSV file; defaults to ``RAW``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    frame = pd.read_csv(path)
    target = frame["y"]
    features = frame.drop(columns="y")

    # train_test_split yields [X_train, X_test, y_train, y_test] for two
    # inputs; hand it back as a tuple, matching a plain multi-value return.
    parts = train_test_split(
        features,
        target,
        train_size=2 / 3,
        random_state=SEED,
    )
    return tuple(parts)
|
||||
|
||||
|
||||
def scale(X_train, X_test):
    """Standardise both partitions with statistics taken from the training set.

    A ``StandardScaler`` configured for pandas output is fitted on
    ``X_train`` only and then applied to ``X_train`` and ``X_test``.

    Returns:
        The tuple ``(scaled_train, scaled_test)``.
    """
    standardizer = StandardScaler()
    standardizer.set_output(transform="pandas")
    # Fit on the training partition only; the test partition is transformed
    # with the training statistics.
    standardizer.fit(X_train)

    scaled_train = standardizer.transform(X_train)
    scaled_test = standardizer.transform(X_test)
    return scaled_train, scaled_test
|
||||
|
||||
|
||||
def search_manual(X_train, X_test, y_train, y_test):
    """Try every (n_neighbors, p) combination and score each on the test set.

    ``n_neighbors`` runs from 1 to 10 and ``p`` over 1, 2, 3. Each model is
    fitted on the training data and scored with ``model.score`` on the test
    data. The winner is the first combination with the highest test score,
    so the reported accuracy is measured on the same data that selected it.

    Returns:
        ``(results, best)`` where ``results`` is a list of
        ``(k, p, accuracy)`` tuples in search order and ``best`` is the
        tuple with the highest accuracy. The best entry is also printed.
    """

    def evaluate(n_neighbors, power):
        # Fit one classifier for this combination and return its test score.
        clf = KNeighborsClassifier(n_neighbors=n_neighbors, p=power)
        clf.fit(X_train, y_train)
        return clf.score(X_test, y_test)

    # Outer loop over k, inner loop over p: same order as a nested for-loop.
    scores = [
        (n_neighbors, power, evaluate(n_neighbors, power))
        for n_neighbors in range(1, 11)
        for power in (1, 2, 3)
    ]

    winner = max(scores, key=lambda entry: entry[2])
    best_k, best_p, best_acc = winner
    print(f"Bestes Ergebnis: k={best_k}, p={best_p}, acc={best_acc:.4f}")
    return scores, winner
|
||||
|
||||
|
||||
def search_grid(X_train, y_train):
    """Run a cross-validated grid search for kNN on the training data.

    Searches ``n_neighbors`` in 1..10 and ``p`` in (1, 2, 3) with 5-fold
    cross-validation and accuracy scoring. Only the data passed in is used
    for fitting; no test data is involved. The best parameters and the best
    cross-validation score are printed.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # NOTE(review): if X_train was standardised before being passed in, the
    # scaler statistics already include every CV validation fold. Confirm
    # whether that is acceptable or whether a Pipeline should be used.
    candidates = dict(n_neighbors=range(1, 11), p=(1, 2, 3))
    searcher = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=candidates,
        scoring="accuracy",
        cv=5,
    )
    searcher.fit(X_train, y_train)

    best_params = searcher.best_params_
    best_cv_score = searcher.best_score_
    print(f"Beste Params: {best_params}, CV-Score: {best_cv_score:.4f}")
    return searcher
|
||||
|
||||
|
||||
if __name__ == "__main__":
    X_train, X_test, y_train, y_test = load_split()

    # Variant A: manual search on the unscaled features.
    print("=== Ohne Standardisieren ===")
    results_raw, best_raw = search_manual(X_train, X_test, y_train, y_test)
    k_raw, p_raw, acc_raw = best_raw

    # Variant B: the same manual search on standardised features.
    print("=== Mit Standardisieren ===")
    X_train_sc, X_test_sc = scale(X_train, X_test)
    results_sc, best_sc = search_manual(X_train_sc, X_test_sc, y_train, y_test)
    k_sc, p_sc, acc_sc = best_sc

    # Variant C: cross-validated grid search on the scaled training data;
    # the scaled test data is only used to score the selected model.
    grid = search_grid(X_train_sc, y_train)
    final_acc = grid.score(X_test_sc, y_test)
    chosen = grid.best_params_
    k_grid, p_grid = chosen['n_neighbors'], chosen['p']
    cv_grid = grid.best_score_

    # Side-by-side comparison of the three variants.
    summary = (
        f"\nOhne Skalierung: k={k_raw}, p={p_raw}, acc={acc_raw:.4f}",
        f"Mit Skalierung: k={k_sc}, p={p_sc}, acc={acc_sc:.4f}",
        f"Mit GridSearch: k={k_grid}, p={p_grid}, CV-acc={cv_grid:.4f}, test-acc={final_acc:.4f}",
    )
    for line in summary:
        print(line)

    # Detailed evaluation of the grid-search model on the scaled test data.
    y_pred = grid.predict(X_test_sc)
    for report in (confusion_matrix, classification_report):
        print(report(y_test, y_pred))
|
||||
Reference in New Issue
Block a user