refactor: move things around

2026-05-21 14:16:30 +02:00
parent 2fce3281a3
commit 41e15ed275
124 changed files with 404226 additions and 0 deletions
@@ -0,0 +1,193 @@
+"""
+    Useful functions for example notebooks and workshop solutions
+    of course Practical Machine Learning - Supervised Learning
+    Bern University of Applied Sciences (BFH)
+"""
+
+
+# ========== Packages ==========
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+# ========== Functions ==========
+
+def prep_data(dataset, target, train_ratio = 2 / 3, seed = None, sep = ','):
+    """ read and prepare real data from the current directory
+    performs 
+        read data
+        features - target - split
+        train - test - split
+
+    Parameters
+    ----------
+    dataset: name of dataset in csv format
+    target: name of target column
+    train_ratio (2 / 3): (optional)
+    seed (None): random seet for split (optional)
+    sep (,): separator of csv file (optional)
+
+    Returns
+    -------
+    X_train: feature matrix of train set
+    X_test: target vector of train set
+    y_train: feature matrix of test set
+    y_test: target vector of train set
+    """
+
+    ## load data
+    data = pd.read_csv(dataset, sep = sep)
+
+    ## features - target - split
+    X = data.drop(target, axis=1)
+    y = data[target]
+
+    ## train - test - split
+    from sklearn.model_selection import train_test_split
+    return train_test_split(
+        X,
+        y,
+        train_size=train_ratio,
+        random_state=seed)
+    
+
+
+def prep_demo_data(dataset, target):
+    """ read demo data from the current directory
+    performs 
+        read data
+        features - target - split
+
+    Parameters
+    ----------
+    dataset: name of dataset in csv format, ',' separated
+    target: name of target column
+
+    Returns
+    -------
+    X: feature matrix
+    y: target vector
+    """
+
+    ## load data
+    data = pd.read_csv(dataset)
+    
+    ## features - target - split
+    X = data.drop(target, axis=1)
+    y = data[target]
+    
+    return X, y   
+
+
+
+def inspect_decision_tree_model(model_def, features, target, figsize=(6, 6)):
+    """ train a DecisionTreeClassifier and visualize the tree
+    
+    prints some motel attributes from within the function
+    
+    Parameters
+    ----------
+    model_def: DecisionTreeClassifier object with set parameters
+    features: feature matrix
+    target: target vector
+    figsize: size of image, optional, default = (6, 6)
+    
+    Returns
+    -------
+    visualization of the trained tree
+    prints model attributes
+    """
+    
+    from sklearn.tree import plot_tree
+    
+    model = model_def
+    model.fit(features, target)
+
+    print('TREE DIAGNOSTICS:')
+    print('depth  :', model.get_depth())
+    print('leaves :', model.get_n_leaves())
+    print('score  :', model.score(features, target))
+
+    plt.figure(figsize=figsize)
+    plot_tree(model,
+              feature_names=features.columns,
+              class_names=model.classes_,
+              filled=True);
+
+
+
+def test_regression_model(model, X_train, y_train, X_test, y_test, show_plot=True):
+    
+    """ shows behavoiur of univariate ML regression on synthetic dataset
+    
+    performs
+    -   training on train data
+    -   prediction on test data
+    -   calculate performance measures
+ 
+    Parameters
+    ----------
+    model: a parametrized regression model
+    X_train, y_train: train data
+    X_test, y_test: test data
+    show_plot: show scatterplot ov pred vs true, optional, default=True
+    
+
+    Returns
+    -------
+    shows a scatterplot von X_test vs X_pred with a diagonal line, indicating identity
+    prints r2_score and mean_squared_error
+    
+    """
+
+    from sklearn.metrics import r2_score
+    from sklearn.metrics import mean_squared_error
+
+    model = model
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+    print('R2 = %0.4f' %(r2_score(y_test, y_pred)))
+    
+    if show_plot == True:
+        plt.figure(figsize=(6,6))
+        ax = sns.scatterplot(x=y_test, y=y_pred)
+        ax.set(xlabel='y_test', ylabel='y_pred')
+        ls = np.linspace(min(y_test), max(y_test), 100)
+        plt.plot(ls, ls, color='black', linestyle='dashed')
+        ax.set_title(model.__class__.__name__)
+        plt.show()
+    
+    return (model)
+    
+
+
+def show_pred_on_synth(model, X, y, X_synth, param_str):
+    """ shows behavoiur of univariate ML regression on synthetic dataset
+ 
+    Parameters
+    ----------
+    model: a parametrized regression model
+    X, y: data for univariate regression
+    X_synth: synthetic Feature
+    param_str: parameter description for title
+    seed (None): random seet for split
+
+    Returns
+    -------
+    a scatterplot von X, y, with the prediction values for X_synth
+    
+    """
+    
+    model.fit(X.to_numpy(), y)
+    y_pred = model.predict(X_synth)
+
+    ax = sns.scatterplot(x=X['X'], y=y)
+    ax = sns.lineplot(x=X_synth[:,0], y=y_pred, color='orange')
+    ax.set_title(model.__class__.__name__ + ' : ' + param_str)
+    ax.set(xlabel='X', ylabel='y')
+    plt.show()
+
+