Files
cas-pml/SL/aufgaben/template/2_Code/1.6 Feature Engineering - Implementation.ipynb
T
2026-05-21 14:16:30 +02:00

522 lines
12 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"## Einfuehrung\n",
"## Exploration\n",
"## Transformation\n",
"## Konstruktion\n",
"## Selektion\n",
"## Implementation\n",
"### Data Frame"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.485043Z",
"start_time": "2020-03-17T11:19:27.999820Z"
}
},
"outputs": [],
"source": [
"## preparation: import libraries and read data\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"sns.set()\n",
"%matplotlib inline\n",
"\n",
"datapath = '../3_data'\n",
"from os import chdir\n",
"\n",
"chdir(datapath)\n",
"\n",
"data = pd.read_csv('bank_data.csv', sep=';')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E1: Entfernen von Beobachtungen nach Bedingung"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"## remove case for age > 100\n",
"data.drop(data[data.age >= 100].index, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E2: Entfernen von Duplikaten"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"## remove duplicates\n",
"data.drop_duplicates(ignore_index=True, inplace = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E3: Entfernen fragwürdiger Variablen"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"## alternative ['default', 'poutcome', 'duration']\n",
"vars_to_drop = ['default', 'poutcome']\n",
"data = data.drop(vars_to_drop, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E4: Einsetzen von Werten für NAs"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## create lists of names of of categorical and numerical variables\n",
"cat_vars = data.select_dtypes(include='object').columns.tolist()\n",
"num_vars = data.select_dtypes(exclude='object').columns.tolist()\n",
"\n",
"## import SimpleImputer class\n",
"from sklearn.impute import SimpleImputer\n",
"\n",
"## imput for categorical variables\n",
"imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
"data[cat_vars] = pd.DataFrame(imp_mode.fit_transform(data[cat_vars]), columns=data[cat_vars].columns)\n",
"\n",
"## imput for numerical variables\n",
"imp_median = SimpleImputer(missing_values=np.nan, strategy='median')\n",
"data[num_vars] = pd.DataFrame(imp_median.fit_transform(data[num_vars]), columns=data[num_vars].columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Kategoriale Variablen"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E5: Reduzieren der Kardinalität"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.781822Z",
"start_time": "2020-03-17T11:19:28.769081Z"
}
},
"outputs": [],
"source": [
"## education: illiterate : basic.4y\n",
"data.education = np.where(\n",
" data.education == 'illiterate', \n",
" 'basic.4y',\n",
" data.education)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Nummerisiren - Faktorisieren (Platzhalter)\n",
"hier kein Bedarf"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## sandbox\n",
"tmp_data = data.copy()\n",
"\n",
"## check before\n",
"print(tmp_data.job.value_counts())\n",
"\n",
"## factorize\n",
"tmp_data.job = pd.factorize(tmp_data.job)[0]\n",
"\n",
"## check after\n",
"print(tmp_data.job.value_counts())\n",
"\n",
"del(tmp_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E6: Nummerisiren - Ordial Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.885362Z",
"start_time": "2020-03-17T11:19:28.825371Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/30/93p10lq141sd2pvx31bfs77w0000gp/T/ipykernel_6176/854807168.py:33: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" data.replace(replace_nums, inplace=True)\n"
]
}
],
"source": [
"## education, day_of_week, month\n",
"replace_nums = {\n",
" 'education': {\n",
" 'basic.4y': 1,\n",
" 'basic.6y': 2,\n",
" 'basic.9y': 3,\n",
" 'professional.course': 4,\n",
" 'high.school': 5,\n",
" 'university.degree': 6\n",
" },\n",
" 'month': {\n",
" 'jan': 1,\n",
" 'feb': 2,\n",
" 'mar': 3,\n",
" 'apr': 4,\n",
" 'may': 5,\n",
" 'jun': 6,\n",
" 'jul': 7,\n",
" 'aug': 8,\n",
" 'sep': 9,\n",
" 'oct': 10,\n",
" 'nov': 11,\n",
" 'dec': 12\n",
" },\n",
" 'day_of_week': {\n",
" 'mon': 1,\n",
" 'tue': 2,\n",
" 'wed': 3,\n",
" 'thu': 4,\n",
" 'fri': 5\n",
" }\n",
"}\n",
"data.replace(replace_nums, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E7: Nummerisieren - Binär Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.820652Z",
"start_time": "2020-03-17T11:19:28.786046Z"
}
},
"outputs": [],
"source": [
"## housing : no -> 0 else 1\n",
"data.housing = np.where(data.housing == 'no', 0, 1)\n",
"\n",
"## contact : celular -> 1 else 0\n",
"data.contact = np.where(data.contact == 'cellular', 1, 0)\n",
"## rename\n",
"data = data.rename(columns={'contact': 'contact_cellular'})"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E8: Nummerisieren - Ordinal Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"## one-hot encoding\n",
"## apply for all categorical variables except target\n",
"target = 'y'\n",
"sel_vars = data.select_dtypes(include=['object']).columns.drop(target)\n",
"data = pd.get_dummies(data, columns=sel_vars, drop_first=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Numerische Variablen"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E9: Logarithmieren"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:29.015675Z",
"start_time": "2020-03-17T11:19:28.972900Z"
}
},
"outputs": [],
"source": [
"## duration and campaign\n",
"data.duration = np.log10(data.duration + data.duration.min() + 1)\n",
"data.campaign = np.log10(data.campaign + data.campaign.min() + 1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E10: Binär umcodieren"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:29.046736Z",
"start_time": "2020-03-17T11:19:29.023717Z"
}
},
"outputs": [],
"source": [
"## pdays : 999 -> 0, else 1\n",
"data.pdays = np.where(data.pdays == 999, 0, 1)\n",
"\n",
"## previous : > 0 -> 1 else 0\n",
"data.previous = np.where(data.previous > 0, 1, 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Andere Tätigkeiten"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Konstruktion (Platzhalter)\n",
"hier kein Bedarf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E11: Bereinigen der Variablennamen"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"old_names = data.columns\n",
"new_names = old_names.str.replace('[^a-zA-Z0-9_]', '_', regex=True)\n",
"for i in range(len(old_names)):\n",
" data.rename(columns={old_names[i]:new_names[i]}, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Standardisieren (Platzhalter)\n",
"hier kein Bedarf"
]
},
{
"cell_type": "raw",
"metadata": {
"tags": []
},
"source": [
"## all except target\n",
"\n",
"## features - target - split\n",
"target = 'y'\n",
"X = data.drop(target, axis=1)\n",
"y = data.y\n",
"\n",
"## scale features (X)\n",
"from sklearn.preprocessing import StandardScaler\n",
"scaler = StandardScaler().set_output(transform=\"pandas\")\n",
"X = scaler.fit_transform(X)\n",
"\n",
"## concat target to scaled features\n",
"new_data = pd.concat([X, y.reindex(X.index)], axis=1)\n",
"\n",
"del(new_data)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E12: Speichern unter neuem Namen"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:29.405659Z",
"start_time": "2020-03-17T11:19:29.080587Z"
},
"tags": []
},
"outputs": [],
"source": [
"## as bank_data_prep.csv\n",
"## parameters\n",
"## sep = ',' (default)\n",
"## index = False (default True would add an index column on the left)\n",
"data.to_csv('bank_data_prep.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "1.6",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "1.6 Feature Engineering - Implementation",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "249.6px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-autonumbering": true,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}