522 lines
12 KiB
Plaintext
522 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Feature Engineering\n",
|
|
"## Einfuehrung\n",
|
|
"## Exploration\n",
|
|
"## Transformation\n",
|
|
"## Konstruktion\n",
|
|
"## Selektion\n",
|
|
"## Implementation\n",
|
|
"### Data Frame"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-03-17T11:19:28.485043Z",
|
|
"start_time": "2020-03-17T11:19:27.999820Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## preparation: import libraries and read data\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"\n",
|
|
"sns.set()\n",
|
|
"%matplotlib inline\n",
|
|
"\n",
|
|
"datapath = '../3_data'\n",
|
|
"from os import chdir\n",
|
|
"\n",
|
|
"chdir(datapath)\n",
|
|
"\n",
|
|
"data = pd.read_csv('bank_data.csv', sep=';')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E1: Entfernen von Beobachtungen nach Bedingung"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## remove cases with age >= 100\n",
|
|
"data.drop(data[data.age >= 100].index, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E2: Entfernen von Duplikaten"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## remove duplicates\n",
|
|
"data.drop_duplicates(ignore_index=True, inplace = True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E3: Entfernen fragwürdiger Variablen"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## alternative ['default', 'poutcome', 'duration']\n",
|
|
"vars_to_drop = ['default', 'poutcome']\n",
|
|
"data = data.drop(vars_to_drop, axis=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E4: Einsetzen von Werten für NAs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## create lists of names of categorical and numerical variables\n",
|
|
"cat_vars = data.select_dtypes(include='object').columns.tolist()\n",
|
|
"num_vars = data.select_dtypes(exclude='object').columns.tolist()\n",
|
|
"\n",
|
|
"## import SimpleImputer class\n",
|
|
"from sklearn.impute import SimpleImputer\n",
|
|
"\n",
|
|
"## impute categorical variables with the most frequent value (keep data's index so the assignment aligns row by row)\n",
|
|
"imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
|
|
"data[cat_vars] = pd.DataFrame(imp_mode.fit_transform(data[cat_vars]), columns=cat_vars, index=data.index)\n",
|
|
"\n",
|
|
"## impute numerical variables with the median (keep data's index so the assignment aligns row by row)\n",
|
|
"imp_median = SimpleImputer(missing_values=np.nan, strategy='median')\n",
|
|
"data[num_vars] = pd.DataFrame(imp_median.fit_transform(data[num_vars]), columns=num_vars, index=data.index)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Kategoriale Variablen"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E5: Reduzieren der Kardinalität"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-03-17T11:19:28.781822Z",
|
|
"start_time": "2020-03-17T11:19:28.769081Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## education: illiterate : basic.4y\n",
|
|
"data.education = np.where(\n",
|
|
" data.education == 'illiterate', \n",
|
|
" 'basic.4y',\n",
|
|
" data.education)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Nummerisieren - Faktorisieren (Platzhalter)\n",
|
|
"hier kein Bedarf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "raw",
|
|
"metadata": {},
|
|
"source": [
|
|
"## sandbox\n",
|
|
"tmp_data = data.copy()\n",
|
|
"\n",
|
|
"## check before\n",
|
|
"print(tmp_data.job.value_counts())\n",
|
|
"\n",
|
|
"## factorize\n",
|
|
"tmp_data.job = pd.factorize(tmp_data.job)[0]\n",
|
|
"\n",
|
|
"## check after\n",
|
|
"print(tmp_data.job.value_counts())\n",
|
|
"\n",
|
|
"del(tmp_data)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E6: Nummerisieren - Ordinal Encodieren"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-03-17T11:19:28.885362Z",
|
|
"start_time": "2020-03-17T11:19:28.825371Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/var/folders/30/93p10lq141sd2pvx31bfs77w0000gp/T/ipykernel_6176/854807168.py:33: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|
" data.replace(replace_nums, inplace=True)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"## education, day_of_week, month\n",
|
|
"replace_nums = {\n",
|
|
" 'education': {\n",
|
|
" 'basic.4y': 1,\n",
|
|
" 'basic.6y': 2,\n",
|
|
" 'basic.9y': 3,\n",
|
|
" 'professional.course': 4,\n",
|
|
" 'high.school': 5,\n",
|
|
" 'university.degree': 6\n",
|
|
" },\n",
|
|
" 'month': {\n",
|
|
" 'jan': 1,\n",
|
|
" 'feb': 2,\n",
|
|
" 'mar': 3,\n",
|
|
" 'apr': 4,\n",
|
|
" 'may': 5,\n",
|
|
" 'jun': 6,\n",
|
|
" 'jul': 7,\n",
|
|
" 'aug': 8,\n",
|
|
" 'sep': 9,\n",
|
|
" 'oct': 10,\n",
|
|
" 'nov': 11,\n",
|
|
" 'dec': 12\n",
|
|
" },\n",
|
|
" 'day_of_week': {\n",
|
|
" 'mon': 1,\n",
|
|
" 'tue': 2,\n",
|
|
" 'wed': 3,\n",
|
|
" 'thu': 4,\n",
|
|
" 'fri': 5\n",
|
|
" }\n",
|
|
"}\n",
|
|
"## map per column; values without a code are kept unchanged, as replace() did\n",
|
|
"for col, codes in replace_nums.items():\n",
|
|
"    data[col] = data[col].map(lambda value: codes.get(value, value))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E7: Nummerisieren - Binär Encodieren"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-03-17T11:19:28.820652Z",
|
|
"start_time": "2020-03-17T11:19:28.786046Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## housing : no -> 0 else 1\n",
|
|
"data.housing = np.where(data.housing == 'no', 0, 1)\n",
|
|
"\n",
|
|
"## contact : cellular -> 1 else 0\n",
|
|
"data.contact = np.where(data.contact == 'cellular', 1, 0)\n",
|
|
"## rename\n",
|
|
"data = data.rename(columns={'contact': 'contact_cellular'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"#### E8: Nummerisieren - One-Hot Encodieren"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## one-hot encoding\n",
|
|
"## apply for all categorical variables except target\n",
|
|
"target = 'y'\n",
|
|
"sel_vars = data.select_dtypes(include=['object']).columns.drop(target)\n",
|
|
"data = pd.get_dummies(data, columns=sel_vars, drop_first=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Numerische Variablen"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"#### E9: Logarithmieren"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-03-17T11:19:29.015675Z",
|
|
"start_time": "2020-03-17T11:19:28.972900Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## duration and campaign: shift so the minimum becomes 1, then take log10\n",
|
|
"data.duration = np.log10(data.duration - data.duration.min() + 1)\n",
|
|
"data.campaign = np.log10(data.campaign - data.campaign.min() + 1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"#### E10: Binär umcodieren"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-03-17T11:19:29.046736Z",
|
|
"start_time": "2020-03-17T11:19:29.023717Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## pdays : 999 -> 0, else 1\n",
|
|
"data.pdays = np.where(data.pdays == 999, 0, 1)\n",
|
|
"\n",
|
|
"## previous : > 0 -> 1 else 0\n",
|
|
"data.previous = np.where(data.previous > 0, 1, 0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Andere Tätigkeiten"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Konstruktion (Platzhalter)\n",
|
|
"hier kein Bedarf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### E11: Bereinigen der Variablennamen"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## replace every character outside [a-zA-Z0-9_] in the column names with '_'\n",
|
|
"data.columns = data.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Standardisieren (Platzhalter)\n",
|
|
"hier kein Bedarf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "raw",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## all except target\n",
|
|
"\n",
|
|
"## features - target - split\n",
|
|
"target = 'y'\n",
|
|
"X = data.drop(target, axis=1)\n",
|
|
"y = data.y\n",
|
|
"\n",
|
|
"## scale features (X)\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"scaler = StandardScaler().set_output(transform=\"pandas\")\n",
|
|
"X = scaler.fit_transform(X)\n",
|
|
"\n",
|
|
"## concat target to scaled features\n",
|
|
"new_data = pd.concat([X, y.reindex(X.index)], axis=1)\n",
|
|
"\n",
|
|
"del(new_data)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"#### E12: Speichern unter neuem Namen"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-03-17T11:19:29.405659Z",
|
|
"start_time": "2020-03-17T11:19:29.080587Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## as bank_data_prep.csv\n",
|
|
"## parameters\n",
|
|
"## sep = ',' (default)\n",
|
|
"## index = False (default True would add an index column on the left)\n",
|
|
"data.to_csv('bank_data_prep.csv', index=False)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.0"
|
|
},
|
|
"toc": {
|
|
"base_numbering": "1.6",
|
|
"nav_menu": {},
|
|
"number_sections": true,
|
|
"sideBar": true,
|
|
"skip_h1_title": false,
|
|
"title_cell": "1.6 Feature Engineering - Implementation",
|
|
"title_sidebar": "Contents",
|
|
"toc_cell": true,
|
|
"toc_position": {
|
|
"height": "calc(100% - 180px)",
|
|
"left": "10px",
|
|
"top": "150px",
|
|
"width": "249.6px"
|
|
},
|
|
"toc_section_display": true,
|
|
"toc_window_display": true
|
|
},
|
|
"toc-autonumbering": true,
|
|
"varInspector": {
|
|
"cols": {
|
|
"lenName": 16,
|
|
"lenType": 16,
|
|
"lenVar": 40
|
|
},
|
|
"kernels_config": {
|
|
"python": {
|
|
"delete_cmd_postfix": "",
|
|
"delete_cmd_prefix": "del ",
|
|
"library": "var_list.py",
|
|
"varRefreshCmd": "print(var_dic_list())"
|
|
},
|
|
"r": {
|
|
"delete_cmd_postfix": ") ",
|
|
"delete_cmd_prefix": "rm(",
|
|
"library": "var_list.r",
|
|
"varRefreshCmd": "cat(var_dic_list()) "
|
|
}
|
|
},
|
|
"types_to_exclude": [
|
|
"module",
|
|
"function",
|
|
"builtin_function_or_method",
|
|
"instance",
|
|
"_Feature"
|
|
],
|
|
"window_display": false
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|