refactor: move things around

2026-05-21 14:16:30 +02:00
parent 2fce3281a3
commit 41e15ed275
124 changed files with 404226 additions and 0 deletions
@@ -0,0 +1,212 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "toc": true
+   },
+   "source": [
+    "# WS 14 Random Search CV"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* untersuchen Sie Kombinationen von Parameterwerten bei RandomForestClassifier\n",
+    "* Vorschlag:\n",
+    "  * n_estimators in [50, 100, 150, 200]\n",
+    "  * max_features in [3, 5, 7, 9]\n",
+    "  * criterion in ['gini', 'entropy']\n",
+    "  * min_samples_leaf in [1, 2, 3, 4]\n",
+    "* wenden Sie 5-fache Kreuzvalidierung an\n",
+    "* setzen Sie die Anzahl der zu untersuchenden Kombinationen auf 12\n",
+    "* arbeiten Sie mit gesetztem random_state, damit anschliessend die Ergebnisse verglichen werden können"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## import libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "## load data\n",
+    "datapath = '../../3_data'\n",
+    "from os import chdir; chdir(datapath)\n",
+    "bank_df = pd.read_csv('bank_data_prep.csv')\n",
+    "\n",
+    "## features - target - split\n",
+    "X = bank_df.drop('y', axis=1)\n",
+    "y = bank_df['y']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "best_params_ : {'n_estimators': 50, 'min_samples_leaf': 4, 'max_features': 9, 'criterion': 'entropy'}\n",
+      "best_score_  : 0.8884381338742393\n",
+      "CPU times: total: 3.09 s\n",
+      "Wall time: 43.6 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "## import classes from sklearn\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "\n",
+    "## define parameter grid\n",
+    "parameter_grid = {'n_estimators': [50, 100, 150, 200],\n",
+    "                  'max_features': [3, 5, 7, 9],\n",
+    "                  'criterion': ['gini', 'entropy'],\n",
+    "                  'min_samples_leaf': [1, 2, 3, 4]}\n",
+    "\n",
+    "## define RandomizedSearchCV\n",
+    "rscv = RandomizedSearchCV(\n",
+    "    estimator=RandomForestClassifier(random_state=1234), \n",
+    "    param_distributions=parameter_grid, \n",
+    "    cv=5,\n",
+    "    n_iter=12,\n",
+    "    random_state=1234,\n",
+    "    n_jobs=-1)\n",
+    "\n",
+    "## run RandomizedSearchCV\n",
+    "rscv.fit(X, y)\n",
+    "\n",
+    "## evaluate RandomizedSearchCV\n",
+    "print('best_params_ :', rscv.best_params_)\n",
+    "print('best_score_  :', rscv.best_score_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#rscv.best_estimator_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(9860, 29)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Fazit:**\n",
+    "* n_estimators: [50, 100, 150, 200] -> 50\n",
+    "  * hier müsste der Suchbereich nach unten erweitert werden\n",
+    "* max_features: [3, 5, 7, 9] -> 9\n",
+    "  * hier müsste der Suchbereich nach oben erweitert werden\n",
+    "* criterion: ['gini', 'entropy'] -> entropy\n",
+    "  * Suchbereich ok\n",
+    "* min_samples_leaf: [1, 2, 3, 4] -> 4\n",
+    "  * hier müsste der Suchbereich nach oben erweitert werden"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  },
+  "toc": {
+   "base_numbering": "",
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "WS 17 Validierung - Random Search CV",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {
+    "height": "calc(100% - 180px)",
+    "left": "10px",
+    "top": "150px",
+    "width": "195.867px"
+   },
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "position": {
+    "height": "306.85px",
+    "left": "862px",
+    "right": "20px",
+    "top": "137px",
+    "width": "350px"
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}