feature: add example jupyter notebooks

2026-05-21 13:51:32 +02:00
parent 7ed665a524
commit 2fce3281a3
35 changed files with 24357 additions and 0 deletions
@@ -0,0 +1,222 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**nicht als Notebook verteilen**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Feature Engineering\n",
    "\n",
    "## Feature Engineering - Einführung\n",
    "\n",
    "### Abgrenzungen\n",
    "\n",
    "### CRISP - und die Gliederung des Kurses\n",
    "\n",
    "### Strukturierte Daten\n",
    "\n",
    "#### Aufbau und Organisation eines Data Frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     age            job  marital  duration  campaign    y\n",
      "0   30.0  self-employed   single     245.0         3  yes\n",
      "1   32.0     technician  married     370.0         1   no\n",
      "2   27.0    blue collar   single     623.0         1  yes\n",
      "3    NaN    blue collar   single       9.0         6   no\n",
      "4   27.0         admin.   single     126.0         2  yes\n",
      "5   34.0         admin.   single     548.0         2  yes\n",
      "6   46.0           None  married      86.0         2   no\n",
      "7    NaN        retired  married     707.0         3  yes\n",
      "8   46.0         admin.  married      96.0         6   no\n",
      "9   48.0    blue collar  married     241.0         2   no\n",
      "10  29.0     technician  married     154.0         3   no\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "data = pd.read_csv('../3_data/bank_data.csv', sep=';')\n",
    "#print(data.iloc[0:11, 0:6])\n",
    "\n",
    "## arbitrarily insert missing values for demonstration\n",
    "data.iloc[3, 0] = None\n",
    "data.iloc[6, 1] = None\n",
    "#print(data.iloc[0:11, 0:6])\n",
    "print(data.iloc[0:11, [0,1,2,10,11,20]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Begriffe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Beispieldaten"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "demo_data_class = pd.read_csv('../3_data/demo_data_class.csv')\n",
    "demo_data_regr = pd.read_csv('../3_data/demo_data_regr.csv') "
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "import seaborn as sns\n",
    "iris_data = sns.load_dataset('iris')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "### Anforderungen an die Daten für Machine Learning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Eine typische ML Sequenz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Python Libraries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Feature Engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Die Python Libraries und CRISP-DM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Begleitende Literatur"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "toc": {
   "base_numbering": "1.1",
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "1.1 Feature Engineering - Einführung",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "251px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "toc-autonumbering": true,
  "toc-showcode": false,
  "toc-showmarkdowntxt": false,
  "toc-showtags": false,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "326.85px",
    "left": "910px",
    "right": "20px",
    "top": "120px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
@@ -0,0 +1,521 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Feature Engineering\n",
    "## Einführung\n",
    "## Exploration\n",
    "## Transformation\n",
    "## Konstruktion\n",
    "## Selektion\n",
    "## Implementation\n",
    "### Data Frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.485043Z",
     "start_time": "2020-03-17T11:19:27.999820Z"
    }
   },
   "outputs": [],
   "source": [
    "## preparation: import libraries and read data\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "sns.set()\n",
    "%matplotlib inline\n",
    "\n",
    "datapath = '../3_data'\n",
    "from os import chdir\n",
    "\n",
    "chdir(datapath)\n",
    "\n",
    "data = pd.read_csv('bank_data.csv', sep=';')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E1: Entfernen von Beobachtungen nach Bedingung"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "## remove cases with age >= 100\n",
    "data.drop(data[data.age >= 100].index, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E2: Entfernen von Duplikaten"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## remove duplicates\n",
    "data.drop_duplicates(ignore_index=True, inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E3: Entfernen fragwürdiger Variablen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## alternative ['default', 'poutcome', 'duration']\n",
    "vars_to_drop = ['default', 'poutcome']\n",
    "data = data.drop(vars_to_drop, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E4: Einsetzen von Werten für NAs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## create lists of names of categorical and numerical variables\n",
    "cat_vars = data.select_dtypes(include='object').columns.tolist()\n",
    "num_vars = data.select_dtypes(exclude='object').columns.tolist()\n",
    "\n",
    "## import SimpleImputer class\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "## impute categorical variables with the most frequent value (mode)\n",
    "## index=data.index keeps the imputed rows aligned with `data`, even if\n",
    "## its index is not a plain 0..n-1 range\n",
    "imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
    "data[cat_vars] = pd.DataFrame(imp_mode.fit_transform(data[cat_vars]), columns=cat_vars, index=data.index)\n",
    "\n",
    "## impute numerical variables with the median\n",
    "imp_median = SimpleImputer(missing_values=np.nan, strategy='median')\n",
    "data[num_vars] = pd.DataFrame(imp_median.fit_transform(data[num_vars]), columns=num_vars, index=data.index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Kategoriale Variablen"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E5: Reduzieren der Kardinalität"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.781822Z",
     "start_time": "2020-03-17T11:19:28.769081Z"
    }
   },
   "outputs": [],
   "source": [
    "## education: illiterate : basic.4y\n",
    "data.education = np.where(\n",
    "    data.education == 'illiterate', \n",
    "    'basic.4y',\n",
    "    data.education)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Nummerisieren - Faktorisieren (Platzhalter)\n",
    "hier kein Bedarf"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "## sandbox\n",
    "tmp_data = data.copy()\n",
    "\n",
    "## check before\n",
    "print(tmp_data.job.value_counts())\n",
    "\n",
    "## factorize\n",
    "tmp_data.job = pd.factorize(tmp_data.job)[0]\n",
    "\n",
    "## check after\n",
    "print(tmp_data.job.value_counts())\n",
    "\n",
    "del(tmp_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E6: Nummerisieren - Ordinal Encodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.885362Z",
     "start_time": "2020-03-17T11:19:28.825371Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/30/93p10lq141sd2pvx31bfs77w0000gp/T/ipykernel_6176/854807168.py:33: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
      "  data.replace(replace_nums, inplace=True)\n"
     ]
    }
   ],
   "source": [
    "## education, month, day_of_week: map ordered categories to integers\n",
    "replace_nums = {\n",
    "    'education': {\n",
    "        'basic.4y': 1,\n",
    "        'basic.6y': 2,\n",
    "        'basic.9y': 3,\n",
    "        'professional.course': 4,\n",
    "        'high.school': 5,\n",
    "        'university.degree': 6\n",
    "    },\n",
    "    'month': {\n",
    "        'jan': 1,\n",
    "        'feb': 2,\n",
    "        'mar': 3,\n",
    "        'apr': 4,\n",
    "        'may': 5,\n",
    "        'jun': 6,\n",
    "        'jul': 7,\n",
    "        'aug': 8,\n",
    "        'sep': 9,\n",
    "        'oct': 10,\n",
    "        'nov': 11,\n",
    "        'dec': 12\n",
    "    },\n",
    "    'day_of_week': {\n",
    "        'mon': 1,\n",
    "        'tue': 2,\n",
    "        'wed': 3,\n",
    "        'thu': 4,\n",
    "        'fri': 5\n",
    "    }\n",
    "}\n",
    "## map column by column; values without an entry in the mapping are kept as is.\n",
    "## DataFrame.replace relied on the silent downcasting of object columns, which is\n",
    "## deprecated in pandas (FutureWarning); without the downcast the columns would stay\n",
    "## object and be one-hot encoded later on\n",
    "for col, mapping in replace_nums.items():\n",
    "    data[col] = data[col].map(lambda value, m=mapping: m.get(value, value))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E7: Nummerisieren - Binär Encodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.820652Z",
     "start_time": "2020-03-17T11:19:28.786046Z"
    }
   },
   "outputs": [],
   "source": [
    "## housing : no -> 0 else 1\n",
    "data.housing = np.where(data.housing == 'no', 0, 1)\n",
    "\n",
    "## contact : cellular -> 1 else 0\n",
    "data.contact = np.where(data.contact == 'cellular', 1, 0)\n",
    "## rename\n",
    "data = data.rename(columns={'contact': 'contact_cellular'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E8: Nummerisieren - One-Hot Encodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "## one-hot encoding\n",
    "## apply for all categorical variables except target\n",
    "target = 'y'\n",
    "sel_vars = data.select_dtypes(include=['object']).columns.drop(target)\n",
    "data = pd.get_dummies(data, columns=sel_vars, drop_first=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Numerische Variablen"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E9: Logarithmieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:29.015675Z",
     "start_time": "2020-03-17T11:19:28.972900Z"
    }
   },
   "outputs": [],
   "source": [
    "## duration and campaign\n",
    "## shift each variable so that its minimum becomes 1 (log10 -> 0) before taking the log;\n",
    "## subtracting the minimum keeps the argument positive even for a negative minimum\n",
    "data.duration = np.log10(data.duration - data.duration.min() + 1)\n",
    "data.campaign = np.log10(data.campaign - data.campaign.min() + 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E10: Binär umcodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:29.046736Z",
     "start_time": "2020-03-17T11:19:29.023717Z"
    }
   },
   "outputs": [],
   "source": [
    "## pdays : 999 -> 0, else 1\n",
    "data.pdays = np.where(data.pdays == 999, 0, 1)\n",
    "\n",
    "## previous : > 0 -> 1 else 0\n",
    "data.previous = np.where(data.previous > 0, 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Andere Tätigkeiten"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Konstruktion (Platzhalter)\n",
    "hier kein Bedarf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E11: Bereinigen der Variablennamen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## replace every character that is not a letter, digit or underscore by '_'\n",
    "## (vectorised: all column names are cleaned in one assignment instead of one rename per column)\n",
    "data.columns = data.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Standardisieren (Platzhalter)\n",
    "hier kein Bedarf"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "tags": []
   },
   "source": [
    "## all except target\n",
    "\n",
    "## features - target - split\n",
    "target = 'y'\n",
    "X = data.drop(target, axis=1)\n",
    "y = data.y\n",
    "\n",
    "## scale features (X)\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "scaler = StandardScaler().set_output(transform=\"pandas\")\n",
    "X = scaler.fit_transform(X)\n",
    "\n",
    "## concat target to scaled features\n",
    "new_data = pd.concat([X, y.reindex(X.index)], axis=1)\n",
    "\n",
    "del(new_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E12: Speichern unter neuem Namen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:29.405659Z",
     "start_time": "2020-03-17T11:19:29.080587Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "## as bank_data_prep.csv\n",
    "## parameters\n",
    "##   sep = ',' (default)\n",
    "##   index = False (default True would add an index column on the left)\n",
    "data.to_csv('bank_data_prep.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "toc": {
   "base_numbering": "1.6",
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "1.6 Feature Engineering - Implementation",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "249.6px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "toc-autonumbering": true,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
@@ -0,0 +1,506 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Feature Engineering\n",
    "# Klassifikation\n",
    "## Instanzbasierte Modelle\n",
    "## Regelbasierte Modelle\n",
    "## Mathematische Modelle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:39.858981Z",
     "start_time": "2020-03-17T12:01:37.904657Z"
    }
   },
   "outputs": [],
   "source": [
    "## preparation\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns; sns.set()\n",
    "%matplotlib inline\n",
    "\n",
    "datapath = '../3_data'\n",
    "from os import chdir; chdir(datapath)\n",
    "\n",
    "from bfh_cas_pml import prep_data, prep_demo_data\n",
    "X_train, X_test, y_train, y_test = prep_data('bank_data_prep.csv', 'y', seed = 1234)\n",
    "X_demo, y_demo = prep_demo_data('demo_data_class.csv', 'y')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LinearDiscriminantAnalysis\n",
    "#### Theorie"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "kein Code zu diesem Kapitel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Praxis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:40.035126Z",
     "start_time": "2020-03-17T12:01:39.864400Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8487982963188317\n"
     ]
    }
   ],
   "source": [
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "model = LinearDiscriminantAnalysis()\n",
    "model.fit(X_train, y_train) \n",
    "print(model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:40.051095Z",
     "start_time": "2020-03-17T12:01:40.038394Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'covariance_estimator': None, 'n_components': None, 'priors': None, 'shrinkage': None, 'solver': 'svd', 'store_covariance': False, 'tol': 0.0001}\n"
     ]
    }
   ],
   "source": [
    "print(model.get_params())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### QuadraticDiscriminantAnalysis (eine Variante)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:40.144808Z",
     "start_time": "2020-03-17T12:01:40.054435Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7246729540614543\n"
     ]
    }
   ],
   "source": [
    "from sklearn.discriminant_analysis \\\n",
    "    import QuadraticDiscriminantAnalysis\n",
    "model = QuadraticDiscriminantAnalysis()\n",
    "model.fit(X_train, y_train)\n",
    "print(model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:40.160468Z",
     "start_time": "2020-03-17T12:01:40.149447Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'priors': None, 'reg_param': 0.0, 'store_covariance': False, 'tol': 0.0001}\n"
     ]
    }
   ],
   "source": [
    "print(model.get_params())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "### SVC\n",
    "#### Theorie\n",
    "#### Praxis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:47.205171Z",
     "start_time": "2020-03-17T12:01:40.196843Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7161545482202616\n"
     ]
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "model = SVC()\n",
    "model.fit(X_train, y_train) \n",
    "print(model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:47.221147Z",
     "start_time": "2020-03-17T12:01:47.210935Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}\n"
     ]
    }
   ],
   "source": [
    "print(model.get_params())"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "## with scaled features\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "scaler = StandardScaler()\n",
    "\n",
    "scaler.fit(X_train)\n",
    "X_train_sc = scaler.transform(X_train)\n",
    "X_test_sc = scaler.transform(X_test)\n",
    "\n",
    "model.fit(X_train_sc, y_train) \n",
    "print(model.score(X_test_sc, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GaussianNB\n",
    "in aller Kürze"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Theorie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "classes_ : ['A' 'B']\n",
      "class_prior_ : [0.55555556 0.44444444]\n",
      "\n",
      "theta_ :\n",
      " [[5.58666667]\n",
      " [4.26666667]]\n",
      "\n",
      "var_ :\n",
      " [[0.31182222]\n",
      " [0.23055556]]\n"
     ]
    }
   ],
   "source": [
    "## demo of GaussianNB internals with demo data\n",
    "X_nb_train = X_demo\n",
    "y_nb_train = y_demo\n",
    "\n",
    "X_nb_train = X_nb_train.drop('X2', axis=1)\n",
    "#print(X_train)\n",
    "\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "model = GaussianNB()\n",
    "model.fit(X_nb_train, y_nb_train)\n",
    "\n",
    "## print model attributes\n",
    "print('classes_ :', model.classes_)\n",
    "print('class_prior_ :', model.class_prior_)\n",
    "print('\\ntheta_ :\\n', model.theta_)\n",
    "print('\\nvar_ :\\n', model.var_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Praxis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:47.963126Z",
     "start_time": "2020-03-17T12:01:47.897232Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7337998174627319\n"
     ]
    }
   ],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "model = GaussianNB()\n",
    "model.fit(X_train, y_train) \n",
    "print(model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:48.042848Z",
     "start_time": "2020-03-17T12:01:48.032106Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'priors': None, 'var_smoothing': 1e-09}\n"
     ]
    }
   ],
   "source": [
    "print(model.get_params())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LogisticRegression\n",
    "#### Theorie\n",
    "#### Praxis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T12:01:56.666086Z",
     "start_time": "2020-03-17T12:01:56.130695Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8475813811986614\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "model = LogisticRegression(max_iter=4000)\n",
    "model.fit(X_train, y_train) \n",
    "print(model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C                    : 1.0  \n",
      "class_weight         : None \n",
      "dual                 : False\n",
      "fit_intercept        : True \n",
      "intercept_scaling    : 1    \n",
      "l1_ratio             : None \n",
      "max_iter             : 4000 \n",
      "multi_class          : deprecated\n",
      "n_jobs               : None \n",
      "penalty              : l2   \n",
      "random_state         : None \n",
      "solver               : lbfgs\n",
      "tol                  : 0.0001\n",
      "verbose              : 0    \n",
      "warm_start           : False\n"
     ]
    }
   ],
   "source": [
    "for key, value in model.get_params().items():\n",
    "    print(\"%-20s : %-5s\"  % (key, value))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "teaching",
   "language": "python",
   "name": "teaching"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "toc": {
   "base_numbering": "2.3",
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "2.3 Klassifikation - Mathematische Modelle",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "202.667px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "316.717px",
    "left": "782px",
    "right": "20px",
    "top": "119px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
@@ -0,0 +1,501 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Feature Engineering\n",
    "# Klassifikation\n",
    "# Regression\n",
    "# Validierung und mehr\n",
    "## Sampling und Resampling\n",
    "## Validierungstechniken\n",
    "## Grid Search und Random Search\n",
    "## Performancemetriken\n",
    "## Unbalancierte Daten\n",
    "### Motivation und Vorbereitung "
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "## for scikit-learn 1.4.2, to silence warnings regarding physical cores\n",
    "import os\n",
    "os.environ['LOKY_MAX_CPU_COUNT'] = '4' ## depending on the hardware used"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-15T20:53:02.218161Z",
     "start_time": "2020-04-15T20:53:02.079407Z"
    }
   },
   "outputs": [],
   "source": [
    "## import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns; sns.set()\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:30:48.667936Z",
     "start_time": "2020-04-14T21:30:48.420905Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dim = (41188, 21)\n",
      "y\n",
      "no     0.887346\n",
      "yes    0.112654\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "## read and prepare data\n",
    "datapath = '../3_data'\n",
    "from os import chdir; chdir(datapath)\n",
    "data = pd.read_csv('bank-additional-full.csv', sep=';')\n",
    "print('dim =', data.shape)\n",
    "print(data.y.value_counts(normalize=True)) ## proportion\n",
    "\n",
    "X_full = data.drop('y', axis=1)\n",
    "y_full = data['y']             "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:30:48.714876Z",
     "start_time": "2020-04-14T21:30:48.673886Z"
    }
   },
   "outputs": [],
   "source": [
    "## minimal feature engineering: one-hot encoding for non-numerical features\n",
    "X_full = pd.get_dummies(X_full, drop_first=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:30:48.714876Z",
     "start_time": "2020-04-14T21:30:48.673886Z"
    }
   },
   "outputs": [],
   "source": [
    "## test - train - split\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_full_train, X_full_test, y_full_train, y_full_test, = train_test_split(\n",
    "    X_full,\n",
    "    y_full,\n",
    "    train_size=2/3,\n",
    "    random_state=1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:30:53.858945Z",
     "start_time": "2020-04-14T21:30:48.719365Z"
    }
   },
   "outputs": [],
   "source": [
    "## function to evaluate different sampling methods\n",
    "##   trains a RandomForestClassifier model with the train data\n",
    "##   prints\n",
    "##     internal scorer (accuracy) for test data\n",
    "##     proportion of classes in the (resampled) train target\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "def getResampledRfScore(X_train, y_train, X_test, y_test):\n",
    "    model = RandomForestClassifier(random_state=1234)\n",
    "    model.fit(X_train, y_train)\n",
    "    print('score ', model.score(X_test, y_test))\n",
    "    print(y_train.value_counts(normalize=True)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:30:53.858945Z",
     "start_time": "2020-04-14T21:30:48.719365Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "score  0.912163146394756\n",
      "y\n",
      "no     0.886773\n",
      "yes    0.113227\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "## test call (without resampling)\n",
    "getResampledRfScore(X_full_train, y_full_train, X_full_test, y_full_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random under-sampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: imblearn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.0)\n",
      "Requirement already satisfied: imbalanced-learn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imblearn) (0.13.0)\n",
      "Requirement already satisfied: numpy<3,>=1.24.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (2.2.6)\n",
      "Requirement already satisfied: scipy<2,>=1.10.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.15.3)\n",
      "Requirement already satisfied: scikit-learn<2,>=1.3.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.6.1)\n",
      "Requirement already satisfied: sklearn-compat<1,>=0.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (0.1.3)\n",
      "Requirement already satisfied: joblib<2,>=1.1.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.5.1)\n",
      "Requirement already satisfied: threadpoolctl<4,>=2.0.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (3.6.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install imblearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:30:55.982616Z",
     "start_time": "2020-04-14T21:30:53.863545Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "score  0.847632920611799\n",
      "y\n",
      "no     0.5\n",
      "yes    0.5\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "rus = RandomUnderSampler(random_state=1234)\n",
    "X_resampled_train, y_resampled_train =\\\n",
    "    rus.fit_resample(X_full_train, y_full_train)\n",
    "getResampledRfScore(\n",
    "    X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random over-sampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:31:04.199265Z",
     "start_time": "2020-04-14T21:30:55.985909Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "score  0.9041514930808449\n",
      "y\n",
      "no     0.5\n",
      "yes    0.5\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import\\\n",
    "    RandomOverSampler\n",
    "ros = RandomOverSampler(random_state=1234)\n",
    "X_resampled_train, y_resampled_train =\\\n",
    "    ros.fit_resample(X_full_train, y_full_train)\n",
    "getResampledRfScore(\n",
    "    X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Undersampling mit Tomek Links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:31:11.461134Z",
     "start_time": "2020-04-14T21:31:04.202872Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "score  0.9115076474872542\n",
      "y\n",
      "no     0.883063\n",
      "yes    0.116937\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.under_sampling import TomekLinks\n",
    "tl = TomekLinks()\n",
    "X_resampled_train, y_resampled_train = tl.fit_resample(\n",
    "    X_full_train, y_full_train)\n",
    "getResampledRfScore(\n",
    "    X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Oversampling mit SMOTE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-14T21:31:22.211925Z",
     "start_time": "2020-04-14T21:31:11.466648Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "score  0.9038601602330663\n",
      "y\n",
      "no     0.5\n",
      "yes    0.5\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "sm = SMOTE()\n",
    "X_resampled_train, y_resampled_train = sm.fit_resample(\n",
    "    X_full_train, y_full_train)\n",
    "getResampledRfScore(\n",
    "    X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Weights beim Trainieren"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n",
    "\n",
    "* the formula for class_weights:\n",
    "\n",
    "        n_samples / (n_classes * np.bincount(y))\n",
    "\n",
    "* the weights of y are calculated inversely proportional to the frequencies of the present classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9104151493080845\n"
     ]
    }
   ],
   "source": [
    "## with weights: balanced\n",
    "model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=1234)\n",
    "model.fit(X_full_train, y_full_train)\n",
    "print(model.score(X_full_test, y_full_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5638424575957945 4.415889353489868\n",
      "0.9104151493080845\n"
     ]
    }
   ],
   "source": [
    "## with weights: balanced: manually set\n",
    "n_no = y_full_train.value_counts()['no']\n",
    "n_yes = y_full_train.value_counts()['yes']\n",
    "weight_no = len(y_full_train) / (2 * n_no)\n",
    "weight_yes = len(y_full_train) / (2 * n_yes)\n",
    "print(weight_no, weight_yes)\n",
    "\n",
    "model = RandomForestClassifier(\n",
    "    n_estimators=100,\n",
    "    class_weight={'no': weight_no,\n",
    "                  'yes': weight_yes}, \n",
    "    random_state=1234)\n",
    "\n",
    "model.fit(X_full_train, y_full_train)\n",
    "print(model.score(X_full_test, y_full_test))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "toc": {
   "base_numbering": "4.5",
   "nav_menu": {
    "height": "189px",
    "width": "303.333px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "4.5 Validierung und mehr - Unbalancierte Daten",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "291px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "toc-autonumbering": true,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "298.85px",
    "left": "782px",
    "right": "20px",
    "top": "120px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
@@ -0,0 +1,517 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Feature Engineering\n",
    "# Klassifikation\n",
    "# Regression\n",
    "# Validierung und mehr\n",
    "# Deployment und Abschluss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-08T10:06:24.890328Z",
     "start_time": "2020-04-08T10:06:23.220148Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "## prepare environment\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "datapath = '../3_data'\n",
    "from os import chdir; chdir(datapath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Das finale Modell\n",
    "## Feature Engineering in der Produktion\n",
    "### Missing Values\n",
    "### Neue Kategorien\n",
    "### Protokollieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('bank_data.csv', sep=';')\n",
    "\n",
    "import datetime\n",
    "\n",
    "f = open('fe_prod_log.log','w')\n",
    "f.write(datetime.datetime.now().strftime(\"[%Y-%m-%d %H:%M:%S] (timestamp)\"))\n",
    "\n",
    "f.write('\\n\\n')\n",
    "\n",
    "s = data.isna().sum()\n",
    "f.write('features with NA\\'s\\n')\n",
    "f.write('=======================')\n",
    "\n",
    "f.write('\\n')\n",
    "f.write(s[s > 0].to_string())\n",
    "f.write('\\n\\n')\n",
    "\n",
    "## value counts of non-numeric features\n",
    "f.write('categorical cols levels\\n')\n",
    "f.write('=======================')\n",
    "f.write('\\n')\n",
    "catcolnames = data.select_dtypes(include='object').columns\n",
    "for ccn in catcolnames:\n",
    "    f.write(ccn)\n",
    "    f.write('\\n')\n",
    "    f.write(data[ccn].value_counts().sort_index().to_string())\n",
    "    f.write('\\n\\n')\n",
    "f.close() "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Modellübergabe in die Produktion"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Modelle speichern scikit-learn intern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "## load data\n",
    "from bfh_cas_pml import prep_data\n",
    "X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## three models:\n",
    "\n",
    "## StandardScaler\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "model_sc = StandardScaler().fit(X_train)\n",
    "\n",
    "## LinearRegression\n",
    "from sklearn.linear_model import LinearRegression\n",
    "model_lr = LinearRegression().fit(X_train, y_train)\n",
    "\n",
    "## DecisionTreeRegressor\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "model_dt = DecisionTreeRegressor(max_depth= 2, random_state=1234).fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## save models\n",
    "import pickle \n",
    "with open('model_sc.pkl', 'wb') as pickle_file:\n",
    "    pickle.dump(model_sc, pickle_file)\n",
    "with open('model_lr.pkl', 'wb') as pickle_file:\n",
    "    pickle.dump(model_lr, pickle_file)\n",
    "with open('model_dt.pkl', 'wb') as pickle_file:\n",
    "    pickle.dump(model_dt, pickle_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## reload models\n",
    "with open('model_sc.pkl', 'rb') as pickle_file:\n",
    "    model_sc_2 = pickle.load(pickle_file)\n",
    "with open('model_lr.pkl', 'rb') as pickle_file:\n",
    "    model_lr_2 = pickle.load(pickle_file)\n",
    "with open('model_dt.pkl', 'rb') as pickle_file:\n",
    "    model_dt_2 = pickle.load(pickle_file)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 2.93174034e+00  1.45261784e+00  1.04267493e+01  1.43475779e+00\n",
      "  1.68210732e+00  2.36615297e+00  2.09901978e+00  1.96804559e+03\n",
      "  5.74824662e+00 -3.78091214e+01  1.44997478e+02  7.52039447e+03\n",
      "  6.57396836e-01  1.29098026e-01  9.18284130e-02  2.89430762e-01\n",
      "  3.73511662e-02  3.43010928e-01  1.01125428e-02  2.10650791e-01\n",
      "  7.15274833e+00  2.01656214e+03  4.83925950e+00] \n",
      "\n",
      "[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
      " 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
      " 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
      " 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
      " 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
      " 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
      "\n",
      "[ 2.93174034e+00  1.45261784e+00  1.04267493e+01  1.43475779e+00\n",
      "  1.68210732e+00  2.36615297e+00  2.09901978e+00  1.96804559e+03\n",
      "  5.74824662e+00 -3.78091214e+01  1.44997478e+02  7.52039447e+03\n",
      "  6.57396836e-01  1.29098026e-01  9.18284130e-02  2.89430762e-01\n",
      "  3.73511662e-02  3.43010928e-01  1.01125428e-02  2.10650791e-01\n",
      "  7.15274833e+00  2.01656214e+03  4.83925950e+00] \n",
      "\n",
      "[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
      " 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
      " 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
      " 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
      " 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
      " 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "## compare model_sc\n",
    "print(model_sc.mean_, '\\n')\n",
    "print(model_sc.var_, '\\n')\n",
    "print(model_sc_2.mean_, '\\n')\n",
    "print(model_sc_2.var_, '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-148600153.0644718\n",
      "[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04  1.55233185e+05\n",
      "  4.16082458e+04  8.22944764e+04  2.97716907e+05 -2.76754904e+03\n",
      " -4.92937355e+03 -5.11115932e+05  1.79228159e+05 -1.58740210e+00\n",
      "  7.07758671e+04  7.99854850e+03  3.38953273e+04 -1.73563804e+05\n",
      "  1.07806729e+05  2.39261230e+05  2.81825047e+05 -2.26744902e+05\n",
      "  1.27253408e+03  5.38476338e+04  3.83311293e+03] \n",
      "\n",
      "-148600153.0644718\n",
      "[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04  1.55233185e+05\n",
      "  4.16082458e+04  8.22944764e+04  2.97716907e+05 -2.76754904e+03\n",
      " -4.92937355e+03 -5.11115932e+05  1.79228159e+05 -1.58740210e+00\n",
      "  7.07758671e+04  7.99854850e+03  3.38953273e+04 -1.73563804e+05\n",
      "  1.07806729e+05  2.39261230e+05  2.81825047e+05 -2.26744902e+05\n",
      "  1.27253408e+03  5.38476338e+04  3.83311293e+03]\n"
     ]
    }
   ],
   "source": [
    "## compare model_lr\n",
    "print(model_lr.intercept_)\n",
    "print(model_lr.coef_, '\\n')\n",
    "print(model_lr_2.intercept_)\n",
    "print(model_lr_2.coef_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|--- Rooms <= 3.50\n",
      "|   |--- Type <= 1.50\n",
      "|   |   |--- value: [1077045.50]\n",
      "|   |--- Type >  1.50\n",
      "|   |   |--- value: [682863.38]\n",
      "|--- Rooms >  3.50\n",
      "|   |--- Regionname_Southern_Metropolitan <= 0.50\n",
      "|   |   |--- value: [1163850.50]\n",
      "|   |--- Regionname_Southern_Metropolitan >  0.50\n",
      "|   |   |--- value: [2113002.90]\n",
      "\n",
      "|--- Rooms <= 3.50\n",
      "|   |--- Type <= 1.50\n",
      "|   |   |--- value: [1077045.50]\n",
      "|   |--- Type >  1.50\n",
      "|   |   |--- value: [682863.38]\n",
      "|--- Rooms >  3.50\n",
      "|   |--- Regionname_Southern_Metropolitan <= 0.50\n",
      "|   |   |--- value: [1163850.50]\n",
      "|   |--- Regionname_Southern_Metropolitan >  0.50\n",
      "|   |   |--- value: [2113002.90]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "## compare model_dt\n",
    "from sklearn.tree import export_text\n",
    "print(export_text(\n",
    "    model_dt, feature_names=list(X_train.columns)))\n",
    "print(export_text(\n",
    "    model_dt_2, feature_names=list(X_train.columns)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Modelle speichern extern mit PMML"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Vorbereitungen:**  \n",
    "* Java Runtime muss vorhanden sein\n",
    "* Library installieren direkt in Notebook:  \n",
    "`!pip install sklearn2pmml`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: sklearn2pmml in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.119.1)\n",
      "Requirement already satisfied: dill>=0.3.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (0.4.0)\n",
      "Requirement already satisfied: joblib>=0.13.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.5.1)\n",
      "Requirement already satisfied: pandas>=1.5.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (2.2.3)\n",
      "Requirement already satisfied: scikit-learn>=1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.6.1)\n",
      "Requirement already satisfied: numpy>=1.26.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.2.6)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.9.0.post0)\n",
      "Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
      "Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
      "Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->sklearn2pmml) (1.17.0)\n",
      "Requirement already satisfied: scipy>=1.6.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (1.15.3)\n",
      "Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (3.6.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install sklearn2pmml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## import libraries\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn2pmml import PMMLPipeline, sklearn2pmml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## StandardScaler\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "pipeline = PMMLPipeline([(\"scaler\", StandardScaler())]).fit(X_train)\n",
    "sklearn2pmml(pipeline, \"StandardScaler_melb.pmml\", with_repr = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "## LinearRegression\n",
    "pipeline = PMMLPipeline([('regressor', LinearRegression())]).fit(X_train, y_train)\n",
    "sklearn2pmml(pipeline, \"LinearRegression_melb.pmml\", with_repr = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "## DecisionTreeRegressor\n",
    "pipeline = PMMLPipeline([('regressor', DecisionTreeRegressor(max_depth= 2, random_state=1234))]).fit(X_train, y_train)\n",
    "sklearn2pmml(pipeline, \"DecisionTreeRegressor_melb.pmml\", with_repr = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "* re-import a pmml model\n",
    "  * see https://stackoverflow.com/questions/52393301/use-pmml-models-in-python"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "tags": []
   },
   "source": [
    "## !pip install pypmml\n",
    "## not run: long running time\n",
    "from pypmml import Model\n",
    "new_model = Model.fromFile(\"StandardScaler_melb.pmml\")\n",
    "result = new_model.predict(X_train)\n",
    "print(result.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Der Modellierungsprozess"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(kein Code unter diesem Titel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pyCaret"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "vgl. `5.5 Deployment und Abschluss - pycaret.ipynb`"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "toc": {
   "base_numbering": "5",
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "5 Deployment und Abschluss",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "165px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "321.85px",
    "left": "790px",
    "right": "20px",
    "top": "113px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
@@ -0,0 +1,118 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e0323bfc",
   "metadata": {},
   "source": [
    "# Nachträge"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41f6b0e4",
   "metadata": {},
   "source": [
    "* dieses Dokument kann von Kurstag zu Kurstag ergänzt werden, daher sollte es ebenfalls jeweils neu heruntergeladen werden"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fce9c83",
   "metadata": {},
   "source": [
    "## Extra Themen"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7673663f",
   "metadata": {},
   "source": [
    "* eine Zusammenstellung von Themen aus der Präsentation, welche **nicht** prüfungsrelevant sind, zusätzlich zu den explizit mit (i) gekennzeichneten Folien\n",
    "* die Zusammenstellung wird fortlaufend ergänzt n.n."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "65be8be0",
   "metadata": {},
   "source": [
    "|Kapitel                                                      |Folie(n)\n",
    "|:------------------------------------------------------------|:-------\n",
    "|**1. Feature Engineering**                                   |\n",
    "|1.1.4.5 Begriffe - Homonyme und Synonyme                     |20-21\n",
    "|1.2.3.1 Numerische Variablen - Quantitativ (Nachbemerkung)   |24\n",
    "|1.2.4.5 Lineare Zusammenhänge, etc.                          |49-55"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  },
  "toc": {
   "base_numbering": "9",
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "9 Nachträge Allgemein",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "240.667px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -0,0 +1,193 @@
 """
    Useful functions for example notebooks and workshop solutions
    for the course Practical Machine Learning - Supervised Learning
    Bern University of Applied Sciences (BFH)
 """
 # ========== Packages ==========
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 # ========== Functions ==========
 def prep_data(dataset, target, train_ratio = 2 / 3, seed = None, sep = ','):
    """ read a csv file and split it into a train and a test part
    performs
        read data
        features - target - split
        train - test - split
    Parameters
    ----------
    dataset: name (path) of dataset in csv format, handed to pandas.read_csv
    target: name of target column
    train_ratio (2 / 3): handed to train_test_split as train_size (optional)
    seed (None): random seed, handed to train_test_split as random_state (optional)
    sep (,): separator of csv file (optional)
    Returns
    -------
    the result of train_test_split(X, y, ...), i.e. in this order
    X_train: feature matrix of train set
    X_test: feature matrix of test set
    y_train: target vector of train set
    y_test: target vector of test set
    """
    frame = pd.read_csv(dataset, sep=sep)
    ## every column except the target is a feature
    features = frame.drop(columns=target)
    labels = frame[target]
    ## scikit-learn is only imported at the point where the split happens
    from sklearn.model_selection import train_test_split
    split_options = {'train_size': train_ratio, 'random_state': seed}
    return train_test_split(features, labels, **split_options)
 def prep_demo_data(dataset, target):
    """ read demo data and separate the features from the target
    performs
        read data
        features - target - split
    Parameters
    ----------
    dataset: name (path) of dataset in csv format, ',' separated
    target: name of target column
    Returns
    -------
    X: feature matrix (all columns except the target)
    y: target vector (the target column)
    """
    frame = pd.read_csv(dataset)
    feature_part = frame.drop(columns=target)
    target_part = frame[target]
    return feature_part, target_part
 def inspect_decision_tree_model(model_def, features, target, figsize=(6, 6)):
    """ train a DecisionTreeClassifier and visualize the tree
    prints some model attributes from within the function
    Parameters
    ----------
    model_def: DecisionTreeClassifier object with set parameters;
        it is fitted in place (no copy is made)
    features: feature matrix; column names are used as node labels if the
        object has a 'columns' attribute (e.g. a DataFrame)
    target: target vector
    figsize: size of image, optional, default = (6, 6)
    Returns
    -------
    None
    draws the trained tree into a new matplotlib figure
    prints depth, number of leaves and the score on the given data
    """
    from sklearn.tree import plot_tree
    model = model_def
    model.fit(features, target)
    print('TREE DIAGNOSTICS:')
    print('depth  :', model.get_depth())
    print('leaves :', model.get_n_leaves())
    ## score on the very same data the tree was fitted on
    print('score  :', model.score(features, target))
    ## plot_tree puts the class names into the node texts, therefore
    ## numeric class labels are converted to strings
    class_names = [str(label) for label in model.classes_]
    ## hand over a plain list: some scikit-learn releases reject a pandas Index;
    ## features without column names get the default node labels
    columns = getattr(features, 'columns', None)
    feature_names = None if columns is None else list(columns)
    plt.figure(figsize=figsize)
    plot_tree(
        model,
        feature_names=feature_names,
        class_names=class_names,
        filled=True)
 def test_regression_model(model, X_train, y_train, X_test, y_test, show_plot=True):
    """ shows behavoiur of univariate ML regression on synthetic dataset
    performs
    -   training on train data
    -   prediction on test data
    -   calculate performance measures
    Parameters
    ----------
    model: a parametrized regression model
    X_train, y_train: train data
    X_test, y_test: test data
    show_plot: show scatterplot ov pred vs true, optional, default=True
    Returns
    -------
    shows a scatterplot von X_test vs X_pred with a diagonal line, indicating identity
    prints r2_score and mean_squared_error
    """
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_squared_error
    model = model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('R2 = %0.4f' %(r2_score(y_test, y_pred)))
    if show_plot == True:
        plt.figure(figsize=(6,6))
        ax = sns.scatterplot(x=y_test, y=y_pred)
        ax.set(xlabel='y_test', ylabel='y_pred')
        ls = np.linspace(min(y_test), max(y_test), 100)
        plt.plot(ls, ls, color='black', linestyle='dashed')
        ax.set_title(model.__class__.__name__)
        plt.show()
    return (model)
 def show_pred_on_synth(model, X, y, X_synth, param_str):
    """ shows behaviour of univariate ML regression on synthetic dataset
    fits the model on X, y and draws its predictions for X_synth as a line
    on top of a scatterplot of the data
    Parameters
    ----------
    model: a parametrized regression model, it is fitted in place
    X, y: data for univariate regression; X is a DataFrame, its column 'X'
        is plotted (if there is no such column, the first column is used)
    X_synth: synthetic feature, 2-dimensional array-like with the feature
        values in the first column
    param_str: parameter description for title
    Returns
    -------
    None
    shows a scatterplot of X, y, with the prediction values for X_synth
    """
    ## the model is fitted on the plain values, X_synth is predicted as given
    model.fit(X.to_numpy(), y)
    y_pred = model.predict(X_synth)
    ## prefer the historic column name 'X', but do not insist on it
    x_values = X['X'] if 'X' in X.columns else X.iloc[:, 0]
    ## asarray makes the positional slice work for DataFrames as well
    x_synth_values = np.asarray(X_synth)[:, 0]
    ax = sns.scatterplot(x=x_values, y=y)
    ax = sns.lineplot(x=x_synth_values, y=y_pred, color='orange')
    ax.set_title(model.__class__.__name__ + ' : ' + param_str)
    ax.set(xlabel='X', ylabel='y')
    plt.show()
@@ -0,0 +1,178 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6394031a-f4e6-40b6-8612-9dbcea9cc456",
   "metadata": {},
   "source": [
    "# extra_3.2.1.4_linear_regression_in_data_analytics.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9be95ca5-b3d6-4e6f-9ca6-9d84b1bf726e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "23f2b845-764a-46f4-8a51-e1787c05b878",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-08T10:06:24.890328Z",
     "start_time": "2020-04-08T10:06:23.220148Z"
    }
   },
   "outputs": [],
   "source": [
    "## prepare env and data\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns; sns.set()\n",
    "%matplotlib inline\n",
    "\n",
    "datapath = '../3_data'\n",
    "from os import chdir; chdir(datapath)\n",
    "\n",
    "from bfh_cas_pml import prep_data\n",
    "X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price', seed = 1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "27dd0331-0bdf-43d2-8dcd-29b54ff147e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting statsmodels\n",
      "  Downloading statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.2 kB)\n",
      "Requirement already satisfied: numpy<3,>=1.22.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (2.2.6)\n",
      "Requirement already satisfied: scipy!=1.9.2,>=1.8 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (1.15.3)\n",
      "Requirement already satisfied: pandas!=2.1.0,>=1.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (2.2.3)\n",
      "Collecting patsy>=0.5.6 (from statsmodels)\n",
      "  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)\n",
      "Requirement already satisfied: packaging>=21.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (25.0)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.9.0.post0)\n",
      "Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2025.2)\n",
      "Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2025.2)\n",
      "Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas!=2.1.0,>=1.4->statsmodels) (1.17.0)\n",
      "Downloading statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl (9.9 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.9/9.9 MB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)\n",
      "Installing collected packages: patsy, statsmodels\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2/2\u001b[0m [statsmodels]\u001b[0m [statsmodels]\n",
      "\u001b[1A\u001b[2KSuccessfully installed patsy-1.0.1 statsmodels-0.14.4\n"
     ]
    }
   ],
   "source": [
    "!pip install statsmodels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c6ce688e-aea0-4030-afe7-1dbf56b03ea1",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                  Price   R-squared:                       0.586\n",
      "Model:                            OLS   Adj. R-squared:                  0.585\n",
      "Method:                 Least Squares   F-statistic:                     752.8\n",
      "Date:                Sat, 21 Jun 2025   Prob (F-statistic):               0.00\n",
      "Time:                        22:34:27   Log-Likelihood:            -1.7592e+05\n",
      "No. Observations:               12262   AIC:                         3.519e+05\n",
      "Df Residuals:                   12238   BIC:                         3.521e+05\n",
      "Df Model:                          23                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "=========================================================================================================\n",
      "                                            coef    std err          t      P>|t|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------------------------\n",
      "const                                 -1.055e+08   1.99e+07     -5.300      0.000   -1.45e+08   -6.65e+07\n",
      "Rooms                                  2.454e+05   5402.990     45.416      0.000    2.35e+05    2.56e+05\n",
      "Type                                  -1.414e+05   6342.919    -22.286      0.000   -1.54e+05   -1.29e+05\n",
      "Distance                              -4.038e+04    928.303    -43.503      0.000   -4.22e+04   -3.86e+04\n",
      "Bathroom                               1.613e+05   7004.805     23.032      0.000    1.48e+05    1.75e+05\n",
      "Car                                    4.039e+04   4723.191      8.552      0.000    3.11e+04    4.96e+04\n",
      "logLandsize                             8.33e+04   5149.484     16.177      0.000    7.32e+04    9.34e+04\n",
      "logBuildingArea                        2.738e+05   2.27e+04     12.064      0.000    2.29e+05    3.18e+05\n",
      "YearBuilt                             -2484.2291    162.873    -15.253      0.000   -2803.486   -2164.972\n",
      "CouncilArea                           -4977.2245    674.748     -7.376      0.000   -6299.838   -3654.611\n",
      "Lattitude                              -5.15e+05   7.24e+04     -7.114      0.000   -6.57e+05   -3.73e+05\n",
      "Longtitude                             1.926e+05   5.94e+04      3.241      0.001    7.61e+04    3.09e+05\n",
      "Propertycount                            -1.1982      0.893     -1.342      0.180      -2.948       0.552\n",
      "Method_S                               9.427e+04   1.17e+04      8.065      0.000    7.14e+04    1.17e+05\n",
      "Method_SP                              4.166e+04   1.51e+04      2.760      0.006    1.21e+04    7.13e+04\n",
      "Method_VB                              5.418e+04   1.63e+04      3.315      0.001    2.21e+04    8.62e+04\n",
      "Regionname_Northern_Metropolitan       -1.85e+05   1.62e+04    -11.404      0.000   -2.17e+05   -1.53e+05\n",
      "Regionname_South_Eastern_Metropolitan  8.791e+04   2.69e+04      3.269      0.001    3.52e+04    1.41e+05\n",
      "Regionname_Southern_Metropolitan        2.44e+05   1.54e+04     15.820      0.000    2.14e+05    2.74e+05\n",
      "Regionname_Victoria                    2.656e+05   4.46e+04      5.954      0.000    1.78e+05    3.53e+05\n",
      "Regionname_Western_Metropolitan       -2.317e+05   1.93e+04    -12.030      0.000   -2.69e+05   -1.94e+05\n",
      "month                                  1756.1890   1674.442      1.049      0.294   -1525.982    5038.360\n",
      "year                                   3.116e+04   8889.372      3.505      0.000    1.37e+04    4.86e+04\n",
      "day_of_week                            4708.3047   3427.552      1.374      0.170   -2010.239    1.14e+04\n",
      "==============================================================================\n",
      "Omnibus:                     6530.672   Durbin-Watson:                   2.000\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):            95692.398\n",
      "Skew:                           2.225   Prob(JB):                         0.00\n",
      "Kurtosis:                      15.942   Cond. No.                     4.86e+07\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 4.86e+07. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n"
     ]
    }
   ],
   "source": [
    "import statsmodels.api as sm\n",
    "X_train_ = sm.add_constant(X_train)\n",
    "model = sm.OLS(y_train, X_train_, hasconst=True)\n",
    "results = model.fit()\n",
    "print(results.summary())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -0,0 +1,341 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# extra_3.3.4_weitere_ensemble_regressoren.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "## for scikit-learn 1.4.2, to silence warnings regarding physical cores\n",
    "import os\n",
    "os.environ['LOKY_MAX_CPU_COUNT'] = '4' ## depending on the hardware used"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-08T10:06:24.890328Z",
     "start_time": "2020-04-08T10:06:23.220148Z"
    }
   },
   "outputs": [],
   "source": [
    "## prepare environment and data\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns; sns.set()\n",
    "%matplotlib inline\n",
    "\n",
    "datapath = '../3_data'\n",
    "from os import chdir; chdir(datapath)\n",
    "\n",
    "from bfh_cas_pml import prep_data, prep_demo_data\n",
    "X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price', seed = 1234)\n",
    "\n",
    "from bfh_cas_pml import test_regression_model\n",
    "\n",
    "names = []\n",
    "scores = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**AdaBoostRegressor**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-08T10:06:45.098899Z",
     "start_time": "2020-04-08T10:06:44.257283Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R2 = -0.3023\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import AdaBoostRegressor\n",
    "this_model = test_regression_model(\n",
    "    AdaBoostRegressor(random_state=1234), \n",
    "    X_train, y_train, X_test, y_test,\n",
    "    show_plot=False)\n",
    "names.append(this_model.__class__.__name__)\n",
    "scores.append(this_model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**GradientBoostingRegressor**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-08T10:06:45.952822Z",
     "start_time": "2020-04-08T10:06:45.101810Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R2 = 0.7250\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "this_model = test_regression_model(\n",
    "    GradientBoostingRegressor(random_state=1234), \n",
    "    X_train, y_train, X_test, y_test,\n",
    "    show_plot=False)\n",
    "names.append(this_model.__class__.__name__)\n",
    "scores.append(this_model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**HistGradientBoostingRegressor**\n",
    "\n",
    "\"This estimator is much faster than GradientBoostingRegressor for big datasets (n_samples >= 10 000).\" [https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-08T10:06:48.544788Z",
     "start_time": "2020-04-08T10:06:45.955387Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R2 = 0.7846\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import HistGradientBoostingRegressor\n",
    "this_model = test_regression_model(\n",
    "    HistGradientBoostingRegressor(), \n",
    "    X_train, y_train, X_test, y_test,\n",
    "    show_plot=False)\n",
    "names.append(this_model.__class__.__name__)\n",
    "scores.append(this_model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**CatBoostRegressor**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R2 = 0.8003\n"
     ]
    }
   ],
   "source": [
    "from catboost import CatBoostRegressor\n",
    "this_model = test_regression_model(\n",
    "    CatBoostRegressor(logging_level='Silent'), \n",
    "    X_train, y_train, X_test, y_test,\n",
    "    show_plot=False)\n",
    "names.append(this_model.__class__.__name__)\n",
    "scores.append(this_model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**LGBMRegressor**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000113 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 1630\n",
      "[LightGBM] [Info] Number of data points in the train set: 12262, number of used features: 23\n",
      "[LightGBM] [Info] Start training from score 1055902.695237\n",
      "R2 = 0.7882\n"
     ]
    }
   ],
   "source": [
    "from lightgbm import LGBMRegressor\n",
    "this_model = test_regression_model(\n",
    "    LGBMRegressor(), \n",
    "    X_train, y_train, X_test, y_test,\n",
    "    show_plot=False)\n",
    "names.append(this_model.__class__.__name__)\n",
    "scores.append(this_model.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                          models    scores\n",
      "0              AdaBoostRegressor -0.302314\n",
      "1      GradientBoostingRegressor  0.724983\n",
      "2  HistGradientBoostingRegressor  0.784615\n",
      "3              CatBoostRegressor  0.800349\n",
      "4                  LGBMRegressor  0.788166\n"
     ]
    }
   ],
   "source": [
    "## synthesis\n",
    "print(pd.DataFrame({'models': names, 'scores': scores}))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "toc": {
   "base_numbering": "3.3",
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "3.3 Regression - ML Methoden",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "186.867px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "toc-autonumbering": true,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "321.85px",
    "left": "787px",
    "right": "20px",
    "top": "115px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }