{ "cells": [ { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "# Feature Engineering\n", "# Klassifikation\n", "# Regression\n", "# Validierung und mehr\n", "## Sampling und Resampling\n", "## Validierungstechniken\n", "## Grid Search und Random Search\n", "## Performancemetriken\n", "## Unbalancierte Daten\n", "### Motivation und Vorbereitung " ] }, { "cell_type": "raw", "metadata": {}, "source": [ "## for scikit-learn 1.4.2, to silence warnings regarding physical cores\n", "import os\n", "os.environ['LOKY_MAX_CPU_COUNT'] = '4' ## depending on the hardware used" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-04-15T20:53:02.218161Z", "start_time": "2020-04-15T20:53:02.079407Z" } }, "outputs": [], "source": [ "## import libraries\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns; sns.set()\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:30:48.667936Z", "start_time": "2020-04-14T21:30:48.420905Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dim = (41188, 21)\n", "y\n", "no 0.887346\n", "yes 0.112654\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "## read and prepare data\n", "datapath = '../3_data'\n", "from os import chdir; chdir(datapath)\n", "data = pd.read_csv('bank-additional-full.csv', sep=';')\n", "print('dim =', data.shape)\n", "print(data.y.value_counts(normalize=True)) ## proportion\n", "\n", "X_full = data.drop('y', axis=1)\n", "y_full = data['y'] " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:30:48.714876Z", "start_time": "2020-04-14T21:30:48.673886Z" } }, "outputs": [], "source": [ "## minimal feature engineering: one hot encoding for not numerical features\n", "X_full = pd.get_dummies(X_full, drop_first=True)" ] }, { "cell_type": 
"code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:30:48.714876Z", "start_time": "2020-04-14T21:30:48.673886Z" } }, "outputs": [], "source": [ "## test - train - split\n", "from sklearn.model_selection import train_test_split\n", "X_full_train, X_full_test, y_full_train, y_full_test, = train_test_split(\n", " X_full,\n", " y_full,\n", " train_size=2/3,\n", " random_state=1234)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:30:53.858945Z", "start_time": "2020-04-14T21:30:48.719365Z" } }, "outputs": [], "source": [ "## function for evaluate different sampling methods\n", "## train a RandomForestClassifier model with train data\n", "## return\n", "## internal scorer (accuracy) for test data\n", "## proportion of classes after resampling\n", "\n", "from sklearn.ensemble import RandomForestClassifier\n", "def getResampledRfScore(X_train, y_train, X_test, y_test):\n", " model = RandomForestClassifier(random_state=1234)\n", " model.fit(X_train, y_train)\n", " print('score ', model.score(X_test, y_test))\n", " print(y_train.value_counts(normalize=True)) " ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:30:53.858945Z", "start_time": "2020-04-14T21:30:48.719365Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "score 0.912163146394756\n", "y\n", "no 0.886773\n", "yes 0.113227\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "## test call (without resampling)\n", "getResampledRfScore(X_full_train, y_full_train, X_full_test, y_full_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Random under-sampling" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: imblearn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.0)\n", 
"Requirement already satisfied: imbalanced-learn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imblearn) (0.13.0)\n", "Requirement already satisfied: numpy<3,>=1.24.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (2.2.6)\n", "Requirement already satisfied: scipy<2,>=1.10.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.15.3)\n", "Requirement already satisfied: scikit-learn<2,>=1.3.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.6.1)\n", "Requirement already satisfied: sklearn-compat<1,>=0.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (0.1.3)\n", "Requirement already satisfied: joblib<2,>=1.1.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.5.1)\n", "Requirement already satisfied: threadpoolctl<4,>=2.0.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (3.6.0)\n" ] } ], "source": [ "!pip install imblearn" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:30:55.982616Z", "start_time": "2020-04-14T21:30:53.863545Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "score 0.847632920611799\n", "y\n", "no 0.5\n", "yes 0.5\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "from imblearn.under_sampling import RandomUnderSampler\n", "rus = RandomUnderSampler(random_state=1234)\n", "X_resampled_train, y_resampled_train =\\\n", " rus.fit_resample(X_full_train, y_full_train)\n", "getResampledRfScore(\n", " X_resampled_train, y_resampled_train, X_full_test, y_full_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Random over-sampling" ] }, { "cell_type": "code", "execution_count": 9, "metadata": 
{ "ExecuteTime": { "end_time": "2020-04-14T21:31:04.199265Z", "start_time": "2020-04-14T21:30:55.985909Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "score 0.9041514930808449\n", "y\n", "no 0.5\n", "yes 0.5\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "from imblearn.over_sampling import\\\n", " RandomOverSampler\n", "ros = RandomOverSampler(random_state=1234)\n", "X_resampled_train, y_resampled_train =\\\n", " ros.fit_resample(X_full_train, y_full_train)\n", "getResampledRfScore(\n", " X_resampled_train, y_resampled_train, X_full_test, y_full_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Undersampling mit Tomek Links" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:31:11.461134Z", "start_time": "2020-04-14T21:31:04.202872Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "score 0.9115076474872542\n", "y\n", "no 0.883063\n", "yes 0.116937\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "from imblearn.under_sampling import TomekLinks\n", "tl = TomekLinks()\n", "X_resampled_train, y_resampled_train = tl.fit_resample(\n", " X_full_train, y_full_train)\n", "getResampledRfScore(\n", " X_resampled_train, y_resampled_train, X_full_test, y_full_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Oversampling mit SMOTE" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:31:22.211925Z", "start_time": "2020-04-14T21:31:11.466648Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "score 0.9038601602330663\n", "y\n", "no 0.5\n", "yes 0.5\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "from imblearn.over_sampling import SMOTE\n", "sm = SMOTE()\n", "X_resampled_train, y_resampled_train = sm.fit_resample(\n", " X_full_train, y_full_train)\n", "getResampledRfScore(\n", " X_resampled_train, 
y_resampled_train, X_full_test, y_full_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Weights beim Trainieren" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n", "\n", "* the formula used for `class_weight='balanced'`:\n", "\n", "  `n_samples / (n_classes * np.bincount(y))`\n", "\n", "* the class weights are calculated from y, inversely proportional to the frequencies of the present classes" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.9104151493080845\n" ] } ], "source": [ "## with weights: balanced\n", "model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=1234)\n", "model.fit(X_full_train, y_full_train)\n", "print(model.score(X_full_test, y_full_test))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5638424575957945 4.415889353489868\n", "0.9104151493080845\n" ] } ], "source": [ "## with weights: balanced: manually set\n", "## weight per class = n_samples / (n_classes * class count);\n", "## the classes are counted once and n_classes is taken from that count\n", "class_counts = y_full_train.value_counts()\n", "n_classes = len(class_counts)\n", "n_no = class_counts['no']\n", "n_yes = class_counts['yes']\n", "weight_no = len(y_full_train) / (n_classes * n_no)\n", "weight_yes = len(y_full_train) / (n_classes * n_yes)\n", "print(weight_no, weight_yes)\n", "\n", "model = RandomForestClassifier(\n", "    n_estimators=100,\n", "    class_weight={'no': weight_no,\n", "                  'yes': weight_yes},\n", "    random_state=1234)\n", "\n", "## the printed score should equal the one of class_weight='balanced'\n", "model.fit(X_full_train, y_full_train)\n", "print(model.score(X_full_test, y_full_test))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 
"3.13.0" }, "toc": { "base_numbering": "4.5", "nav_menu": { "height": "189px", "width": "303.333px" }, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "4.5 Validierung und mehr - Unbalancierte Daten", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "291px" }, "toc_section_display": true, "toc_window_display": true }, "toc-autonumbering": true, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "position": { "height": "298.85px", "left": "782px", "right": "20px", "top": "120px", "width": "350px" }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }