cas-pml/SL/aufgaben/WS1/notebooks/bfh_cas_pml.py

"""
    Useful functions for example notebooks and workshop solutions
    of course Practical Machine Learning - Supervised Learning
    Bern University of Applied Sciences (BFH)
"""


# ========== Packages ==========

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# ========== Functions ==========

def prep_data(dataset, target, train_ratio = 2 / 3, seed = None, sep = ','):
    """ read and prepare real data from the current directory
    performs
        read data
        features - target - split
        train - test - split

    Parameters
    ----------
    dataset: name of dataset in csv format
    target: name of target column
    train_ratio (2 / 3): (optional)
    seed (None): random seet for split (optional)
    sep (,): separator of csv file (optional)

    Returns
    -------
    X_train: feature matrix of train set
    X_test: target vector of train set
    y_train: feature matrix of test set
    y_test: target vector of train set
    """

    ## load data
    data = pd.read_csv(dataset, sep = sep)

    ## features - target - split
    X = data.drop(target, axis=1)
    y = data[target]

    ## train - test - split
    from sklearn.model_selection import train_test_split
    return train_test_split(
        X,
        y,
        train_size=train_ratio,
        random_state=seed)


def prep_demo_data(dataset, target):
    """ read demo data from the current directory
    performs
        read data
        features - target - split

    Parameters
    ----------
    dataset: name of dataset in csv format, ',' separated
    target: name of target column

    Returns
    -------
    X: feature matrix
    y: target vector
    """

    ## load data
    data = pd.read_csv(dataset)

    ## features - target - split
    X = data.drop(target, axis=1)
    y = data[target]

    return X, y


def inspect_decision_tree_model(model_def, features, target, figsize=(6, 6)):
    """ train a DecisionTreeClassifier and visualize the tree

    prints some motel attributes from within the function

    Parameters
    ----------
    model_def: DecisionTreeClassifier object with set parameters
    features: feature matrix
    target: target vector
    figsize: size of image, optional, default = (6, 6)

    Returns
    -------
    visualization of the trained tree
    prints model attributes
    """

    from sklearn.tree import plot_tree

    model = model_def
    model.fit(features, target)

    print('TREE DIAGNOSTICS:')
    print('depth  :', model.get_depth())
    print('leaves :', model.get_n_leaves())
    print('score  :', model.score(features, target))

    plt.figure(figsize=figsize)
    plot_tree(model,
              feature_names=features.columns,
              class_names=model.classes_,
              filled=True);


def test_regression_model(model, X_train, y_train, X_test, y_test, show_plot=True):

    """ shows behavoiur of univariate ML regression on synthetic dataset

    performs
    -   training on train data
    -   prediction on test data
    -   calculate performance measures

    Parameters
    ----------
    model: a parametrized regression model
    X_train, y_train: train data
    X_test, y_test: test data
    show_plot: show scatterplot ov pred vs true, optional, default=True


    Returns
    -------
    shows a scatterplot von X_test vs X_pred with a diagonal line, indicating identity
    prints r2_score and mean_squared_error

    """

    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_squared_error

    model = model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('R2 = %0.4f' %(r2_score(y_test, y_pred)))

    if show_plot == True:
        plt.figure(figsize=(6,6))
        ax = sns.scatterplot(x=y_test, y=y_pred)
        ax.set(xlabel='y_test', ylabel='y_pred')
        ls = np.linspace(min(y_test), max(y_test), 100)
        plt.plot(ls, ls, color='black', linestyle='dashed')
        ax.set_title(model.__class__.__name__)
        plt.show()

    return (model)


def show_pred_on_synth(model, X, y, X_synth, param_str):
    """ shows behavoiur of univariate ML regression on synthetic dataset

    Parameters
    ----------
    model: a parametrized regression model
    X, y: data for univariate regression
    X_synth: synthetic Feature
    param_str: parameter description for title
    seed (None): random seet for split

    Returns
    -------
    a scatterplot von X, y, with the prediction values for X_synth

    """

    model.fit(X.to_numpy(), y)
    y_pred = model.predict(X_synth)

    ax = sns.scatterplot(x=X['X'], y=y)
    ax = sns.lineplot(x=X_synth[:,0], y=y_pred, color='orange')
    ax.set_title(model.__class__.__name__ + ' : ' + param_str)
    ax.set(xlabel='X', ylabel='y')
    plt.show()