{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Feature Engineering\n",
    "## Einfuehrung\n",
    "## Exploration\n",
    "## Transformation\n",
    "## Konstruktion\n",
    "## Selektion\n",
    "## Implementation\n",
    "### Data Frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.485043Z",
     "start_time": "2020-03-17T11:19:27.999820Z"
    }
   },
   "outputs": [],
   "source": [
    "## preparation: import libraries and read data\n",
    "## (all imports of the notebook live in this cell)\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "sns.set()\n",
    "%matplotlib inline\n",
    "\n",
    "## the data directory is used as an explicit path for reading and writing;\n",
    "## the working directory of the kernel is left untouched\n",
    "datapath = Path('../3_data')\n",
    "\n",
    "data = pd.read_csv(datapath / 'bank_data.csv', sep=';')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E1: Entfernen von Beobachtungen nach Bedingung"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "## remove cases with age >= 100\n",
    "## the negated mask keeps rows whose age is missing (NaN >= 100 is False)\n",
    "data = data.loc[~(data.age >= 100)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E2: Entfernen von Duplikaten"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## remove duplicates and renumber the remaining rows 0..n-1\n",
    "data = data.drop_duplicates(ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E3: Entfernen fragwürdiger Variablen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## alternative ['default', 'poutcome', 'duration']\n",
    "vars_to_drop = ['default', 'poutcome']\n",
    "data = data.drop(columns=vars_to_drop)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E4: Einsetzen von Werten für NAs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## create lists of names of categorical and numerical variables\n",
    "## NOTE(review): cat_vars still contains the target 'y', so missing target\n",
    "## values are imputed as well - confirm that this is intended\n",
    "cat_vars = data.select_dtypes(include='object').columns.tolist()\n",
    "num_vars = data.select_dtypes(exclude='object').columns.tolist()\n",
    "\n",
    "## impute categorical variables with the most frequent value\n",
    "## index=data.index keeps the imputed rows aligned with `data` on assignment,\n",
    "## independent of whether the index was reset before\n",
    "imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
    "data[cat_vars] = pd.DataFrame(\n",
    "    imp_mode.fit_transform(data[cat_vars]),\n",
    "    columns=cat_vars,\n",
    "    index=data.index,\n",
    ")\n",
    "\n",
    "## impute numerical variables with the median\n",
    "imp_median = SimpleImputer(missing_values=np.nan, strategy='median')\n",
    "data[num_vars] = pd.DataFrame(\n",
    "    imp_median.fit_transform(data[num_vars]),\n",
    "    columns=num_vars,\n",
    "    index=data.index,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Kategoriale Variablen"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E5: Reduzieren der Kardinalität"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.781822Z",
     "start_time": "2020-03-17T11:19:28.769081Z"
    }
   },
   "outputs": [],
   "source": [
    "## education: illiterate -> basic.4y\n",
    "data['education'] = np.where(\n",
    "    data['education'] == 'illiterate',\n",
    "    'basic.4y',\n",
    "    data['education'],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Nummerisieren - Faktorisieren (Platzhalter)\n",
    "hier kein Bedarf"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "## sandbox\n",
    "tmp_data = data.copy()\n",
    "\n",
    "## check before\n",
    "print(tmp_data.job.value_counts())\n",
    "\n",
    "## factorize\n",
    "tmp_data.job = pd.factorize(tmp_data.job)[0]\n",
    "\n",
    "## check after\n",
    "print(tmp_data.job.value_counts())\n",
    "\n",
    "del(tmp_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E6: Nummerisieren - Ordinal Encodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.885362Z",
     "start_time": "2020-03-17T11:19:28.825371Z"
    }
   },
   "outputs": [],
   "source": [
    "## education, day_of_week, month\n",
    "replace_nums = {\n",
    "    'education': {\n",
    "        'basic.4y': 1,\n",
    "        'basic.6y': 2,\n",
    "        'basic.9y': 3,\n",
    "        'professional.course': 4,\n",
    "        'high.school': 5,\n",
    "        'university.degree': 6\n",
    "    },\n",
    "    'month': {\n",
    "        'jan': 1,\n",
    "        'feb': 2,\n",
    "        'mar': 3,\n",
    "        'apr': 4,\n",
    "        'may': 5,\n",
    "        'jun': 6,\n",
    "        'jul': 7,\n",
    "        'aug': 8,\n",
    "        'sep': 9,\n",
    "        'oct': 10,\n",
    "        'nov': 11,\n",
    "        'dec': 12\n",
    "    },\n",
    "    'day_of_week': {\n",
    "        'mon': 1,\n",
    "        'tue': 2,\n",
    "        'wed': 3,\n",
    "        'thu': 4,\n",
    "        'fri': 5\n",
    "    }\n",
    "}\n",
    "\n",
    "\n",
    "def encode_ordinal(column, mapping):\n",
    "    \"\"\"Replace every value of `column` found in `mapping` by its rank.\n",
    "\n",
    "    Values without an entry in `mapping` are kept unchanged. Series.map is\n",
    "    used instead of DataFrame.replace because replace emits a FutureWarning\n",
    "    about silent downcasting of object columns.\n",
    "    \"\"\"\n",
    "    return column.map(lambda value: mapping.get(value, value))\n",
    "\n",
    "\n",
    "for var, mapping in replace_nums.items():\n",
    "    data[var] = encode_ordinal(data[var], mapping)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E7: Nummerisieren - Binär Encodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:28.820652Z",
     "start_time": "2020-03-17T11:19:28.786046Z"
    }
   },
   "outputs": [],
   "source": [
    "## housing : no -> 0 else 1 (every value other than 'no' becomes 1)\n",
    "data['housing'] = np.where(data['housing'] == 'no', 0, 1)\n",
    "\n",
    "## contact : cellular -> 1 else 0\n",
    "data['contact'] = np.where(data['contact'] == 'cellular', 1, 0)\n",
    "## rename so that the column name states what the value 1 means\n",
    "data = data.rename(columns={'contact': 'contact_cellular'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E8: Nummerisieren - One-Hot Encodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "## one-hot encoding\n",
    "## apply for all categorical variables except target\n",
    "target = 'y'\n",
    "sel_vars = data.select_dtypes(include=['object']).columns.drop(target)\n",
    "## drop_first=True omits the first level of every variable\n",
    "data = pd.get_dummies(data, columns=sel_vars, drop_first=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Numerische Variablen"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E9: Logarithmieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:29.015675Z",
     "start_time": "2020-03-17T11:19:28.972900Z"
    }
   },
   "outputs": [],
   "source": [
    "## duration and campaign\n",
    "## subtract the minimum before adding 1, so that the smallest value maps to\n",
    "## log10(1) = 0 and the argument of the logarithm is never below 1\n",
    "for var in ['duration', 'campaign']:\n",
    "    data[var] = np.log10(data[var] - data[var].min() + 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E10: Binär umcodieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:29.046736Z",
     "start_time": "2020-03-17T11:19:29.023717Z"
    }
   },
   "outputs": [],
   "source": [
    "## pdays : 999 -> 0, else 1\n",
    "data['pdays'] = np.where(data['pdays'] == 999, 0, 1)\n",
    "\n",
    "## previous : > 0 -> 1 else 0\n",
    "data['previous'] = np.where(data['previous'] > 0, 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Andere Tätigkeiten"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Konstruktion (Platzhalter)\n",
    "hier kein Bedarf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### E11: Bereinigen der Variablennamen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## replace every character that is not a letter, digit or underscore by '_'\n",
    "## (one vectorized assignment instead of renaming column by column)\n",
    "data.columns = data.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Standardisieren (Platzhalter)\n",
    "hier kein Bedarf"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "tags": []
   },
   "source": [
    "## all except target\n",
    "\n",
    "## features - target - split\n",
    "target = 'y'\n",
    "X = data.drop(target, axis=1)\n",
    "y = data.y\n",
    "\n",
    "## scale features (X)\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "scaler = StandardScaler().set_output(transform=\"pandas\")\n",
    "X = scaler.fit_transform(X)\n",
    "\n",
    "## concat target to scaled features\n",
    "new_data = pd.concat([X, y.reindex(X.index)], axis=1)\n",
    "\n",
    "del(new_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### E12: Speichern unter neuem Namen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-17T11:19:29.405659Z",
     "start_time": "2020-03-17T11:19:29.080587Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "## as bank_data_prep.csv in the data directory\n",
    "## parameters\n",
    "## sep = ',' (default)\n",
    "## index = False (default True would add an index column on the left)\n",
    "data.to_csv(datapath / 'bank_data_prep.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "toc": {
   "base_numbering": "1.6",
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "1.6 Feature Engineering - Implementation",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "249.6px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "toc-autonumbering": true,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}