Files
cas-pml/SL/aufgaben/template/2_Code/4.5 Validierung und mehr - Unbalancierte Daten.ipynb
T
2026-05-21 14:16:30 +02:00

502 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"# Klassifikation\n",
"# Regression\n",
"# Validierung und mehr\n",
"## Sampling und Resampling\n",
"## Validierungstechniken\n",
"## Grid Search und Random Search\n",
"## Performancemetriken\n",
"## Unbalancierte Daten\n",
"### Motivation und Vorbereitung "
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## for scikit-learn 1.4.2, to silence warnings regarding physical cores\n",
"import os\n",
"os.environ['LOKY_MAX_CPU_COUNT'] = '4' ## depending on the hardware used"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-15T20:53:02.218161Z",
"start_time": "2020-04-15T20:53:02.079407Z"
}
},
"outputs": [],
"source": [
"## import libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns; sns.set()\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:48.667936Z",
"start_time": "2020-04-14T21:30:48.420905Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dim = (41188, 21)\n",
"y\n",
"no 0.887346\n",
"yes 0.112654\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## read and prepare data\n",
"## the file path is built from datapath instead of calling chdir(),\n",
"## so the working directory stays untouched and the cell can be re-run\n",
"datapath = '../3_data'\n",
"data = pd.read_csv(f'{datapath}/bank-additional-full.csv', sep=';')\n",
"print('dim =', data.shape)\n",
"print(data.y.value_counts(normalize=True)) ## proportion\n",
"\n",
"## split into features (all columns except 'y') and target ('y')\n",
"X_full = data.drop('y', axis=1)\n",
"y_full = data['y']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:48.714876Z",
"start_time": "2020-04-14T21:30:48.673886Z"
}
},
"outputs": [],
"source": [
"## minimal feature engineering:\n",
"## one-hot encode the non-numerical features (first level of each is dropped)\n",
"X_full = pd.get_dummies(data=X_full, drop_first=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:48.714876Z",
"start_time": "2020-04-14T21:30:48.673886Z"
}
},
"outputs": [],
"source": [
"## train / test split: 2/3 of the rows for training, the rest for testing\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(\n",
"    X_full, y_full, train_size=2/3, random_state=1234)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:53.858945Z",
"start_time": "2020-04-14T21:30:48.719365Z"
}
},
"outputs": [],
"source": [
"## helper to evaluate the different sampling methods\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"def getResampledRfScore(X_train, y_train, X_test, y_test):\n",
"    \"\"\"Train a random forest and report how it does on the test data.\n",
"\n",
"    Fits a RandomForestClassifier (random_state=1234) on X_train / y_train\n",
"    and prints\n",
"      * the model's internal score (accuracy) on X_test / y_test\n",
"      * the proportion of classes in y_train, i.e. after resampling\n",
"\n",
"    Nothing is returned; y_train has to provide value_counts().\n",
"    \"\"\"\n",
"    forest = RandomForestClassifier(random_state=1234).fit(X_train, y_train)\n",
"    accuracy = forest.score(X_test, y_test)\n",
"    print('score ', accuracy)\n",
"    print(y_train.value_counts(normalize=True))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:53.858945Z",
"start_time": "2020-04-14T21:30:48.719365Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.912163146394756\n",
"y\n",
"no 0.886773\n",
"yes 0.113227\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## baseline: evaluate with the original training data (without resampling)\n",
"getResampledRfScore(\n",
"    X_train=X_full_train, y_train=y_full_train,\n",
"    X_test=X_full_test, y_test=y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Random under-sampling"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: imblearn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.0)\n",
"Requirement already satisfied: imbalanced-learn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imblearn) (0.13.0)\n",
"Requirement already satisfied: numpy<3,>=1.24.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (2.2.6)\n",
"Requirement already satisfied: scipy<2,>=1.10.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.15.3)\n",
"Requirement already satisfied: scikit-learn<2,>=1.3.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.6.1)\n",
"Requirement already satisfied: sklearn-compat<1,>=0.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (0.1.3)\n",
"Requirement already satisfied: joblib<2,>=1.1.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.5.1)\n",
"Requirement already satisfied: threadpoolctl<4,>=2.0.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (3.6.0)\n"
]
}
],
"source": [
"## install imbalanced-learn (imported as `imblearn`);\n",
"## %pip installs into the environment of the running kernel\n",
"%pip install imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:55.982616Z",
"start_time": "2020-04-14T21:30:53.863545Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.847632920611799\n",
"y\n",
"no 0.5\n",
"yes 0.5\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## random under-sampling: only the training data is resampled,\n",
"## the test data is passed on unchanged\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"rus = RandomUnderSampler(random_state=1234)\n",
"X_resampled_train, y_resampled_train = rus.fit_resample(\n",
"    X_full_train, y_full_train)\n",
"getResampledRfScore(\n",
"    X_train=X_resampled_train, y_train=y_resampled_train,\n",
"    X_test=X_full_test, y_test=y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Random over-sampling"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:31:04.199265Z",
"start_time": "2020-04-14T21:30:55.985909Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.9041514930808449\n",
"y\n",
"no 0.5\n",
"yes 0.5\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## random over-sampling: only the training data is resampled,\n",
"## the test data is passed on unchanged\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"ros = RandomOverSampler(random_state=1234)\n",
"X_resampled_train, y_resampled_train = ros.fit_resample(\n",
"    X_full_train, y_full_train)\n",
"getResampledRfScore(\n",
"    X_train=X_resampled_train, y_train=y_resampled_train,\n",
"    X_test=X_full_test, y_test=y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Undersampling mit Tomek Links"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:31:11.461134Z",
"start_time": "2020-04-14T21:31:04.202872Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.9115076474872542\n",
"y\n",
"no 0.883063\n",
"yes 0.116937\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## under-sampling with Tomek links: only the training data is resampled,\n",
"## the test data is passed on unchanged\n",
"from imblearn.under_sampling import TomekLinks\n",
"\n",
"tl = TomekLinks()\n",
"X_resampled_train, y_resampled_train = tl.fit_resample(\n",
"    X=X_full_train, y=y_full_train)\n",
"getResampledRfScore(\n",
"    X_train=X_resampled_train, y_train=y_resampled_train,\n",
"    X_test=X_full_test, y_test=y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Oversampling mit SMOTE"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:31:22.211925Z",
"start_time": "2020-04-14T21:31:11.466648Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.9038601602330663\n",
"y\n",
"no 0.5\n",
"yes 0.5\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## over-sampling with SMOTE: only the training data is resampled,\n",
"## the test data is passed on unchanged\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"## fixed random_state (same seed as the other samplers and the model),\n",
"## so the resampled data and the score are reproducible on re-run\n",
"sm = SMOTE(random_state=1234)\n",
"X_resampled_train, y_resampled_train = sm.fit_resample(\n",
"    X_full_train, y_full_train)\n",
"getResampledRfScore(\n",
"    X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Weights beim Trainieren"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n",
"\n",
"* the formula used for `class_weight='balanced'`:\n",
"\n",
" n_samples / (n_classes * np.bincount(y))\n",
"\n",
"* with `class_weight='balanced'`, the values of y are used to set the class weights inversely proportional to the class frequencies"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9104151493080845\n"
]
}
],
"source": [
"## with weights: balanced (training data is not resampled)\n",
"model = RandomForestClassifier(\n",
"    n_estimators=100,\n",
"    class_weight='balanced',\n",
"    random_state=1234)\n",
"model.fit(X=X_full_train, y=y_full_train)\n",
"print(model.score(X=X_full_test, y=y_full_test))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5638424575957945 4.415889353489868\n",
"0.9104151493080845\n"
]
}
],
"source": [
"## with weights: balanced, set manually\n",
"## weight per class = n_samples / (n_classes * class count), with n_classes = 2\n",
"n_no, n_yes = y_full_train.value_counts()[['no', 'yes']]\n",
"weight_no, weight_yes = (\n",
"    len(y_full_train) / (2 * n_class) for n_class in (n_no, n_yes))\n",
"print(weight_no, weight_yes)\n",
"\n",
"model = RandomForestClassifier(\n",
"    n_estimators=100,\n",
"    class_weight=dict(no=weight_no, yes=weight_yes),\n",
"    random_state=1234)\n",
"model.fit(X=X_full_train, y=y_full_train)\n",
"print(model.score(X=X_full_test, y=y_full_test))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "4.5",
"nav_menu": {
"height": "189px",
"width": "303.333px"
},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "4.5 Validierung und mehr - Unbalancierte Daten",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "291px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-autonumbering": true,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "298.85px",
"left": "782px",
"right": "20px",
"top": "120px",
"width": "350px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}