feature: add example jupyter notebooks

This commit is contained in:
2026-05-21 13:51:32 +02:00
parent 7ed665a524
commit 2fce3281a3
35 changed files with 24357 additions and 0 deletions
@@ -0,0 +1,222 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**nicht als Notebook verteilen**"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"\n",
"## Feature Engineering - Einführung\n",
"\n",
"### Abgrenzungen\n",
"\n",
"### CRISP - und die Gliederung des Kurses\n",
"\n",
"### Strukturierte Daten\n",
"\n",
"#### Aufbau und Organisation eines Data Frame"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" age job marital duration campaign y\n",
"0 30.0 self-employed single 245.0 3 yes\n",
"1 32.0 technician married 370.0 1 no\n",
"2 27.0 blue collar single 623.0 1 yes\n",
"3 NaN blue collar single 9.0 6 no\n",
"4 27.0 admin. single 126.0 2 yes\n",
"5 34.0 admin. single 548.0 2 yes\n",
"6 46.0 None married 86.0 2 no\n",
"7 NaN retired married 707.0 3 yes\n",
"8 46.0 admin. married 96.0 6 no\n",
"9 48.0 blue collar married 241.0 2 no\n",
"10 29.0 technician married 154.0 3 no\n"
]
}
],
"source": [
"import pandas as pd\n",
"data = pd.read_csv('../3_data/bank_data.csv', sep=';')\n",
"#print(data.iloc[0:11, 0:6])\n",
"\n",
"## arbitrarily insert missing values for demonstration\n",
"data.iloc[3, 0] = None\n",
"data.iloc[6, 1] = None\n",
"#print(data.iloc[0:11, 0:6])\n",
"print(data.iloc[0:11, [0,1,2,10,11,20]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Begriffe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Beispieldaten"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"import pandas as pd\n",
"demo_data_class = pd.read_csv('../3_data/demo_data_class.csv')\n",
"demo_data_regr = pd.read_csv('../3_data/demo_data_regr.csv') "
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"import seaborn as sns\n",
"iris_data = sns.load_dataset('iris')"
]
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"### Anforderungen an die Daten für Machine Learning"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Eine typische ML Sequenz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python Libraries"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Feature Engineering"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Die Python Libraries und CRISP-DM"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Begleitende Literatur"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "1.1",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "1.1 Feature Engineering - Einführung",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "251px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-autonumbering": true,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "326.85px",
"left": "910px",
"right": "20px",
"top": "120px",
"width": "350px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,521 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"## Einfuehrung\n",
"## Exploration\n",
"## Transformation\n",
"## Konstruktion\n",
"## Selektion\n",
"## Implementation\n",
"### Data Frame"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.485043Z",
"start_time": "2020-03-17T11:19:27.999820Z"
}
},
"outputs": [],
"source": [
"## preparation: import libraries and read data\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"sns.set()\n",
"%matplotlib inline\n",
"\n",
"datapath = '../3_data'\n",
"from os import chdir\n",
"\n",
"chdir(datapath)\n",
"\n",
"data = pd.read_csv('bank_data.csv', sep=';')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E1: Entfernen von Beobachtungen nach Bedingung"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"## remove cases with age >= 100\n",
"data.drop(data[data.age >= 100].index, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E2: Entfernen von Duplikaten"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"## remove duplicates\n",
"data.drop_duplicates(ignore_index=True, inplace = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E3: Entfernen fragwürdiger Variablen"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"## alternative ['default', 'poutcome', 'duration']\n",
"vars_to_drop = ['default', 'poutcome']\n",
"data = data.drop(vars_to_drop, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E4: Einsetzen von Werten für NAs"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## create lists of names of categorical and numerical variables\n",
"cat_vars = data.select_dtypes(include='object').columns.tolist()\n",
"num_vars = data.select_dtypes(exclude='object').columns.tolist()\n",
"\n",
"## import SimpleImputer class\n",
"from sklearn.impute import SimpleImputer\n",
"\n",
"## impute categorical variables with the most frequent value\n",
"## index=data.index keeps the imputed rows aligned with data on assignment,\n",
"## even if the index of data is not 0..n-1\n",
"imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
"data[cat_vars] = pd.DataFrame(imp_mode.fit_transform(data[cat_vars]), columns=cat_vars, index=data.index)\n",
"\n",
"## impute numerical variables with the median\n",
"imp_median = SimpleImputer(missing_values=np.nan, strategy='median')\n",
"data[num_vars] = pd.DataFrame(imp_median.fit_transform(data[num_vars]), columns=num_vars, index=data.index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Kategoriale Variablen"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E5: Reduzieren der Kardinalität"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.781822Z",
"start_time": "2020-03-17T11:19:28.769081Z"
}
},
"outputs": [],
"source": [
"## education: illiterate : basic.4y\n",
"data.education = np.where(\n",
" data.education == 'illiterate', \n",
" 'basic.4y',\n",
" data.education)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Nummerisieren - Faktorisieren (Platzhalter)\n",
"hier kein Bedarf"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## sandbox\n",
"tmp_data = data.copy()\n",
"\n",
"## check before\n",
"print(tmp_data.job.value_counts())\n",
"\n",
"## factorize\n",
"tmp_data.job = pd.factorize(tmp_data.job)[0]\n",
"\n",
"## check after\n",
"print(tmp_data.job.value_counts())\n",
"\n",
"del(tmp_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E6: Nummerisieren - Ordinal Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.885362Z",
"start_time": "2020-03-17T11:19:28.825371Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/30/93p10lq141sd2pvx31bfs77w0000gp/T/ipykernel_6176/854807168.py:33: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" data.replace(replace_nums, inplace=True)\n"
]
}
],
"source": [
"## education, day_of_week, month\n",
"replace_nums = {\n",
" 'education': {\n",
" 'basic.4y': 1,\n",
" 'basic.6y': 2,\n",
" 'basic.9y': 3,\n",
" 'professional.course': 4,\n",
" 'high.school': 5,\n",
" 'university.degree': 6\n",
" },\n",
" 'month': {\n",
" 'jan': 1,\n",
" 'feb': 2,\n",
" 'mar': 3,\n",
" 'apr': 4,\n",
" 'may': 5,\n",
" 'jun': 6,\n",
" 'jul': 7,\n",
" 'aug': 8,\n",
" 'sep': 9,\n",
" 'oct': 10,\n",
" 'nov': 11,\n",
" 'dec': 12\n",
" },\n",
" 'day_of_week': {\n",
" 'mon': 1,\n",
" 'tue': 2,\n",
" 'wed': 3,\n",
" 'thu': 4,\n",
" 'fri': 5\n",
" }\n",
"}\n",
"data.replace(replace_nums, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E7: Nummerisieren - Binär Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:28.820652Z",
"start_time": "2020-03-17T11:19:28.786046Z"
}
},
"outputs": [],
"source": [
"## housing : no -> 0 else 1\n",
"data.housing = np.where(data.housing == 'no', 0, 1)\n",
"\n",
"## contact : cellular -> 1 else 0\n",
"data.contact = np.where(data.contact == 'cellular', 1, 0)\n",
"## rename\n",
"data = data.rename(columns={'contact': 'contact_cellular'})"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E8: Nummerisieren - One-Hot Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"## one-hot encoding\n",
"## apply for all categorical variables except target\n",
"target = 'y'\n",
"sel_vars = data.select_dtypes(include=['object']).columns.drop(target)\n",
"data = pd.get_dummies(data, columns=sel_vars, drop_first=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Numerische Variablen"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E9: Logarithmieren"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:29.015675Z",
"start_time": "2020-03-17T11:19:28.972900Z"
}
},
"outputs": [],
"source": [
"## duration and campaign: shift by the minimum so that the smallest value maps to log10(1) = 0\n",
"data.duration = np.log10(data.duration - data.duration.min() + 1)\n",
"data.campaign = np.log10(data.campaign - data.campaign.min() + 1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E10: Binär umcodieren"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:29.046736Z",
"start_time": "2020-03-17T11:19:29.023717Z"
}
},
"outputs": [],
"source": [
"## pdays : 999 -> 0, else 1\n",
"data.pdays = np.where(data.pdays == 999, 0, 1)\n",
"\n",
"## previous : > 0 -> 1 else 0\n",
"data.previous = np.where(data.previous > 0, 1, 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Andere Tätigkeiten"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Konstruktion (Platzhalter)\n",
"hier kein Bedarf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### E11: Bereinigen der Variablennamen"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"old_names = data.columns\n",
"new_names = old_names.str.replace('[^a-zA-Z0-9_]', '_', regex=True)\n",
"for i in range(len(old_names)):\n",
" data.rename(columns={old_names[i]:new_names[i]}, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Standardisieren (Platzhalter)\n",
"hier kein Bedarf"
]
},
{
"cell_type": "raw",
"metadata": {
"tags": []
},
"source": [
"## all except target\n",
"\n",
"## features - target - split\n",
"target = 'y'\n",
"X = data.drop(target, axis=1)\n",
"y = data.y\n",
"\n",
"## scale features (X)\n",
"from sklearn.preprocessing import StandardScaler\n",
"scaler = StandardScaler().set_output(transform=\"pandas\")\n",
"X = scaler.fit_transform(X)\n",
"\n",
"## concat target to scaled features\n",
"new_data = pd.concat([X, y.reindex(X.index)], axis=1)\n",
"\n",
"del(new_data)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"#### E12: Speichern unter neuem Namen"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T11:19:29.405659Z",
"start_time": "2020-03-17T11:19:29.080587Z"
},
"tags": []
},
"outputs": [],
"source": [
"## as bank_data_prep.csv\n",
"## parameters\n",
"## sep = ',' (default)\n",
"## index = False (default True would add an index column on the left)\n",
"data.to_csv('bank_data_prep.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "1.6",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "1.6 Feature Engineering - Implementation",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "249.6px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-autonumbering": true,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,506 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"# Klassifikation\n",
"## Instanzbasierte Modelle\n",
"## Regelbasierte Modelle\n",
"## Mathematische Modelle"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('./')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:39.858981Z",
"start_time": "2020-03-17T12:01:37.904657Z"
}
},
"outputs": [],
"source": [
"## preparation\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns; sns.set()\n",
"%matplotlib inline\n",
"\n",
"datapath = '../3_data'\n",
"from os import chdir; chdir(datapath)\n",
"\n",
"from bfh_cas_pml import prep_data, prep_demo_data\n",
"X_train, X_test, y_train, y_test = prep_data('bank_data_prep.csv', 'y', seed = 1234)\n",
"X_demo, y_demo = prep_demo_data('demo_data_class.csv', 'y')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### LinearDiscriminantAnalysis\n",
"#### Theorie"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"kein Code zu diesem Kapitel"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Praxis"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:40.035126Z",
"start_time": "2020-03-17T12:01:39.864400Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8487982963188317\n"
]
}
],
"source": [
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
"model = LinearDiscriminantAnalysis()\n",
"model.fit(X_train, y_train) \n",
"print(model.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:40.051095Z",
"start_time": "2020-03-17T12:01:40.038394Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'covariance_estimator': None, 'n_components': None, 'priors': None, 'shrinkage': None, 'solver': 'svd', 'store_covariance': False, 'tol': 0.0001}\n"
]
}
],
"source": [
"print(model.get_params())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### QuadraticDiscriminantAnalysis (eine Variante)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:40.144808Z",
"start_time": "2020-03-17T12:01:40.054435Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7246729540614543\n"
]
}
],
"source": [
"from sklearn.discriminant_analysis \\\n",
" import QuadraticDiscriminantAnalysis\n",
"model = QuadraticDiscriminantAnalysis()\n",
"model.fit(X_train, y_train)\n",
"print(model.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:40.160468Z",
"start_time": "2020-03-17T12:01:40.149447Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'priors': None, 'reg_param': 0.0, 'store_covariance': False, 'tol': 0.0001}\n"
]
}
],
"source": [
"print(model.get_params())"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"### SVC\n",
"#### Theorie\n",
"#### Praxis"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:47.205171Z",
"start_time": "2020-03-17T12:01:40.196843Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7161545482202616\n"
]
}
],
"source": [
"from sklearn.svm import SVC\n",
"model = SVC()\n",
"model.fit(X_train, y_train) \n",
"print(model.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:47.221147Z",
"start_time": "2020-03-17T12:01:47.210935Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}\n"
]
}
],
"source": [
"print(model.get_params())"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## with scaled features\n",
"from sklearn.preprocessing import StandardScaler\n",
"scaler = StandardScaler()\n",
"\n",
"scaler.fit(X_train)\n",
"X_train_sc = scaler.transform(X_train)\n",
"X_test_sc = scaler.transform(X_test)\n",
"\n",
"model.fit(X_train_sc, y_train) \n",
"print(model.score(X_test_sc, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### GaussianNB\n",
"in aller Kürze"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Theorie"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"classes_ : ['A' 'B']\n",
"class_prior_ : [0.55555556 0.44444444]\n",
"\n",
"theta_ :\n",
" [[5.58666667]\n",
" [4.26666667]]\n",
"\n",
"var_ :\n",
" [[0.31182222]\n",
" [0.23055556]]\n"
]
}
],
"source": [
"## demo of GaussianNB interna with demo data\n",
"X_nb_train = X_demo\n",
"y_nb_train = y_demo\n",
"\n",
"X_nb_train = X_nb_train.drop('X2', axis=1)\n",
"#print(X_train)\n",
"\n",
"from sklearn.naive_bayes import GaussianNB\n",
"model = GaussianNB()\n",
"model.fit(X_nb_train, y_nb_train)\n",
"\n",
"## print model attributes\n",
"print('classes_ :', model.classes_)\n",
"print('class_prior_ :', model.class_prior_)\n",
"print('\\ntheta_ :\\n', model.theta_)\n",
"print('\\nvar_ :\\n', model.var_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Praxis"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:47.963126Z",
"start_time": "2020-03-17T12:01:47.897232Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7337998174627319\n"
]
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"model = GaussianNB()\n",
"model.fit(X_train, y_train) \n",
"print(model.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:48.042848Z",
"start_time": "2020-03-17T12:01:48.032106Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'priors': None, 'var_smoothing': 1e-09}\n"
]
}
],
"source": [
"print(model.get_params())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### LogisticRegression\n",
"#### Theorie\n",
"#### Praxis"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-17T12:01:56.666086Z",
"start_time": "2020-03-17T12:01:56.130695Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8475813811986614\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"model = LogisticRegression(max_iter=4000)\n",
"model.fit(X_train, y_train) \n",
"print(model.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C : 1.0 \n",
"class_weight : None \n",
"dual : False\n",
"fit_intercept : True \n",
"intercept_scaling : 1 \n",
"l1_ratio : None \n",
"max_iter : 4000 \n",
"multi_class : deprecated\n",
"n_jobs : None \n",
"penalty : l2 \n",
"random_state : None \n",
"solver : lbfgs\n",
"tol : 0.0001\n",
"verbose : 0 \n",
"warm_start : False\n"
]
}
],
"source": [
"for key, value in model.get_params().items():\n",
" print(\"%-20s : %-5s\" % (key, value))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "teaching",
"language": "python",
"name": "teaching"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "2.3",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "2.3 Klassifikation - Mathematische Modelle",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "202.667px"
},
"toc_section_display": true,
"toc_window_display": true
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "316.717px",
"left": "782px",
"right": "20px",
"top": "119px",
"width": "350px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,501 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"# Klassifikation\n",
"# Regression\n",
"# Validierung und mehr\n",
"## Sampling und Resampling\n",
"## Validierungstechniken\n",
"## Grid Search und Random Search\n",
"## Performancemetriken\n",
"## Unbalancierte Daten\n",
"### Motivation und Vorbereitung "
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## for scikit-learn 1.4.2, to silence warnings regarding physical cores\n",
"import os\n",
"os.environ['LOKY_MAX_CPU_COUNT'] = '4' ## depending on the hardware used"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-15T20:53:02.218161Z",
"start_time": "2020-04-15T20:53:02.079407Z"
}
},
"outputs": [],
"source": [
"## import libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns; sns.set()\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:48.667936Z",
"start_time": "2020-04-14T21:30:48.420905Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dim = (41188, 21)\n",
"y\n",
"no 0.887346\n",
"yes 0.112654\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## read and prepare data\n",
"datapath = '../3_data'\n",
"from os import chdir; chdir(datapath)\n",
"data = pd.read_csv('bank-additional-full.csv', sep=';')\n",
"print('dim =', data.shape)\n",
"print(data.y.value_counts(normalize=True)) ## proportion\n",
"\n",
"X_full = data.drop('y', axis=1)\n",
"y_full = data['y'] "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:48.714876Z",
"start_time": "2020-04-14T21:30:48.673886Z"
}
},
"outputs": [],
"source": [
"## minimal feature engineering: one-hot encoding of the non-numerical features\n",
"X_full = pd.get_dummies(X_full, drop_first=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:48.714876Z",
"start_time": "2020-04-14T21:30:48.673886Z"
}
},
"outputs": [],
"source": [
"## test - train - split\n",
"from sklearn.model_selection import train_test_split\n",
"X_full_train, X_full_test, y_full_train, y_full_test, = train_test_split(\n",
" X_full,\n",
" y_full,\n",
" train_size=2/3,\n",
" random_state=1234)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:53.858945Z",
"start_time": "2020-04-14T21:30:48.719365Z"
}
},
"outputs": [],
"source": [
"## function to evaluate different sampling methods\n",
"## trains a RandomForestClassifier model with the train data\n",
"## prints\n",
"##   internal scorer (accuracy) for the test data\n",
"##   proportion of classes in the (resampled) train data\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"def getResampledRfScore(X_train, y_train, X_test, y_test):\n",
" model = RandomForestClassifier(random_state=1234)\n",
" model.fit(X_train, y_train)\n",
" print('score ', model.score(X_test, y_test))\n",
" print(y_train.value_counts(normalize=True)) "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:53.858945Z",
"start_time": "2020-04-14T21:30:48.719365Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.912163146394756\n",
"y\n",
"no 0.886773\n",
"yes 0.113227\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"## test call (without resampling)\n",
"getResampledRfScore(X_full_train, y_full_train, X_full_test, y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Random under-sampling"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: imblearn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.0)\n",
"Requirement already satisfied: imbalanced-learn in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imblearn) (0.13.0)\n",
"Requirement already satisfied: numpy<3,>=1.24.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (2.2.6)\n",
"Requirement already satisfied: scipy<2,>=1.10.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.15.3)\n",
"Requirement already satisfied: scikit-learn<2,>=1.3.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.6.1)\n",
"Requirement already satisfied: sklearn-compat<1,>=0.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (0.1.3)\n",
"Requirement already satisfied: joblib<2,>=1.1.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (1.5.1)\n",
"Requirement already satisfied: threadpoolctl<4,>=2.0.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from imbalanced-learn->imblearn) (3.6.0)\n"
]
}
],
"source": [
"%pip install imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:30:55.982616Z",
"start_time": "2020-04-14T21:30:53.863545Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.847632920611799\n",
"y\n",
"no 0.5\n",
"yes 0.5\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"rus = RandomUnderSampler(random_state=1234)\n",
"X_resampled_train, y_resampled_train =\\\n",
" rus.fit_resample(X_full_train, y_full_train)\n",
"getResampledRfScore(\n",
" X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Random over-sampling"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:31:04.199265Z",
"start_time": "2020-04-14T21:30:55.985909Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.9041514930808449\n",
"y\n",
"no 0.5\n",
"yes 0.5\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"from imblearn.over_sampling import\\\n",
" RandomOverSampler\n",
"ros = RandomOverSampler(random_state=1234)\n",
"X_resampled_train, y_resampled_train =\\\n",
" ros.fit_resample(X_full_train, y_full_train)\n",
"getResampledRfScore(\n",
" X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Undersampling mit Tomek Links"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:31:11.461134Z",
"start_time": "2020-04-14T21:31:04.202872Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.9115076474872542\n",
"y\n",
"no 0.883063\n",
"yes 0.116937\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"from imblearn.under_sampling import TomekLinks\n",
"tl = TomekLinks()\n",
"X_resampled_train, y_resampled_train = tl.fit_resample(\n",
" X_full_train, y_full_train)\n",
"getResampledRfScore(\n",
" X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Oversampling mit SMOTE"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-14T21:31:22.211925Z",
"start_time": "2020-04-14T21:31:11.466648Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score 0.9038601602330663\n",
"y\n",
"no 0.5\n",
"yes 0.5\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"from imblearn.over_sampling import SMOTE\n",
"## fixed seed, as for the other samplers, so that the synthetic samples are reproducible\n",
"sm = SMOTE(random_state=1234)\n",
"X_resampled_train, y_resampled_train = sm.fit_resample(\n",
"    X_full_train, y_full_train)\n",
"getResampledRfScore(\n",
"    X_resampled_train, y_resampled_train, X_full_test, y_full_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Weights beim Trainieren"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n",
"\n",
"* the formula for class_weights:\n",
"\n",
" n_samples / (n_classes * np.bincount(y))\n",
"\n",
"* the weights of y are calculated inversely proportional to the frequencies of the present classes"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9104151493080845\n"
]
}
],
"source": [
"## with weights: balanced\n",
"model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=1234)\n",
"model.fit(X_full_train, y_full_train)\n",
"print(model.score(X_full_test, y_full_test))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5638424575957945 4.415889353489868\n",
"0.9104151493080845\n"
]
}
],
"source": [
"## with weights: balanced: manually set\n",
"n_no = y_full_train.value_counts()['no']\n",
"n_yes = y_full_train.value_counts()['yes']\n",
"weight_no = len(y_full_train) / (2 * n_no)\n",
"weight_yes = len(y_full_train) / (2 * n_yes)\n",
"print(weight_no, weight_yes)\n",
"\n",
"model = RandomForestClassifier(\n",
" n_estimators=100,\n",
" class_weight={'no': weight_no,\n",
" 'yes': weight_yes}, \n",
" random_state=1234)\n",
"\n",
"model.fit(X_full_train, y_full_train)\n",
"print(model.score(X_full_test, y_full_test))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "4.5",
"nav_menu": {
"height": "189px",
"width": "303.333px"
},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "4.5 Validierung und mehr - Unbalancierte Daten",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "291px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-autonumbering": true,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "298.85px",
"left": "782px",
"right": "20px",
"top": "120px",
"width": "350px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,517 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"# Klassifikation\n",
"# Regression\n",
"# Validierung und mehr\n",
"# Deployment und Abschluss"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('./')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-08T10:06:24.890328Z",
"start_time": "2020-04-08T10:06:23.220148Z"
},
"tags": []
},
"outputs": [],
"source": [
"## prepare environment\n",
"import pandas as pd\n",
"import numpy as np\n",
"datapath = '../3_data'\n",
"from os import chdir; chdir(datapath)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Das finale Modell\n",
"## Feature Engineering in der Produktion\n",
"### Missing Values\n",
"### Neue Kategorien\n",
"### Protokollieren"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('bank_data.csv', sep=';')\n",
"\n",
"import datetime\n",
"\n",
"f = open('fe_prod_log.log','w')\n",
"f.write(datetime.datetime.now().strftime(\"[%Y-%m-%d %H:%M:%S] (timestamp)\"))\n",
"\n",
"f.write('\\n\\n')\n",
"\n",
"s = data.isna().sum()\n",
"f.write('features with NA\\'s\\n')\n",
"f.write('=======================')\n",
"\n",
"f.write('\\n')\n",
"f.write(s[s > 0].to_string())\n",
"f.write('\\n\\n')\n",
"\n",
"## value counts of not numeric features\n",
"f.write('categorical cols levels\\n')\n",
"f.write('=======================')\n",
"f.write('\\n')\n",
"catcolnames = data.select_dtypes(include='object').columns\n",
"for ccn in catcolnames:\n",
" f.write(ccn)\n",
" f.write('\\n')\n",
" f.write(data[ccn].value_counts().sort_index().to_string())\n",
" f.write('\\n\\n')\n",
"f.close() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modellübergabe in die Produktion"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Modelle speichern scikit-learn intern"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"## load data\n",
"from bfh_cas_pml import prep_data\n",
"X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## three models:\n",
"\n",
"## StandardScaler\n",
"from sklearn.preprocessing import StandardScaler\n",
"model_sc = StandardScaler().fit(X_train)\n",
"\n",
"## LinearRegression\n",
"from sklearn.linear_model import LinearRegression\n",
"model_lr = LinearRegression().fit(X_train, y_train)\n",
"\n",
"## DecisionTreeRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"model_dt = DecisionTreeRegressor(max_depth= 2, random_state=1234).fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"## save models\n",
"import pickle \n",
"with open('model_sc.pkl', 'wb') as pickle_file:\n",
" pickle.dump(model_sc, pickle_file)\n",
"with open('model_lr.pkl', 'wb') as pickle_file:\n",
" pickle.dump(model_lr, pickle_file)\n",
"with open('model_dt.pkl', 'wb') as pickle_file:\n",
" pickle.dump(model_dt, pickle_file)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"## reload models\n",
"with open('model_sc.pkl', 'rb') as pickle_file:\n",
" model_sc_2 = pickle.load(pickle_file)\n",
"with open('model_lr.pkl', 'rb') as pickle_file:\n",
" model_lr_2 = pickle.load(pickle_file)\n",
"with open('model_dt.pkl', 'rb') as pickle_file:\n",
" model_dt_2 = pickle.load(pickle_file) "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
"\n",
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
"\n",
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
"\n",
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
"\n"
]
}
],
"source": [
"## compare model_sc\n",
"print(model_sc.mean_, '\\n')\n",
"print(model_sc.var_, '\\n')\n",
"print(model_sc_2.mean_, '\\n')\n",
"print(model_sc_2.var_, '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-148600153.0644718\n",
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
" 1.27253408e+03 5.38476338e+04 3.83311293e+03] \n",
"\n",
"-148600153.0644718\n",
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
" 1.27253408e+03 5.38476338e+04 3.83311293e+03]\n"
]
}
],
"source": [
"## compare model_lr\n",
"print(model_lr.intercept_)\n",
"print(model_lr.coef_, '\\n')\n",
"print(model_lr_2.intercept_)\n",
"print(model_lr_2.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"|--- Rooms <= 3.50\n",
"| |--- Type <= 1.50\n",
"| | |--- value: [1077045.50]\n",
"| |--- Type > 1.50\n",
"| | |--- value: [682863.38]\n",
"|--- Rooms > 3.50\n",
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
"| | |--- value: [1163850.50]\n",
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
"| | |--- value: [2113002.90]\n",
"\n",
"|--- Rooms <= 3.50\n",
"| |--- Type <= 1.50\n",
"| | |--- value: [1077045.50]\n",
"| |--- Type > 1.50\n",
"| | |--- value: [682863.38]\n",
"|--- Rooms > 3.50\n",
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
"| | |--- value: [1163850.50]\n",
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
"| | |--- value: [2113002.90]\n",
"\n"
]
}
],
"source": [
"## compare model_dt\n",
"from sklearn.tree import export_text\n",
"print(export_text(\n",
" model_dt, feature_names=list(X_train.columns)))\n",
"print(export_text(\n",
" model_dt_2, feature_names=list(X_train.columns)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Modelle speichern extern mit PMML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Vorbereitungen:** \n",
"* Java Runtime muss vorhanden sein\n",
"* Library installieren direkt in Notebook: \n",
"`!pip install sklearn2pmml`"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: sklearn2pmml in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.119.1)\n",
"Requirement already satisfied: dill>=0.3.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (0.4.0)\n",
"Requirement already satisfied: joblib>=0.13.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.5.1)\n",
"Requirement already satisfied: pandas>=1.5.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (2.2.3)\n",
"Requirement already satisfied: scikit-learn>=1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.6.1)\n",
"Requirement already satisfied: numpy>=1.26.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.2.6)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->sklearn2pmml) (1.17.0)\n",
"Requirement already satisfied: scipy>=1.6.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (1.15.3)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (3.6.0)\n"
]
}
],
"source": [
"!pip install sklearn2pmml"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"## import libraries\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn2pmml import PMMLPipeline, sklearn2pmml"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"## StandardScaler\n",
"from sklearn.preprocessing import StandardScaler\n",
"pipeline = PMMLPipeline([(\"scaler\", StandardScaler())]).fit(X_train)\n",
"sklearn2pmml(pipeline, \"StandardScaler_melb.pmml\", with_repr = True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"## LinearRegression\n",
"pipeline = PMMLPipeline([('regressor', LinearRegression())]).fit(X_train, y_train)\n",
"sklearn2pmml(pipeline, \"LinearRegression_melb.pmml\", with_repr = True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"## DecisionTreeRegressor\n",
"pipeline = PMMLPipeline([('regressor', DecisionTreeRegressor(max_depth= 2, random_state=1234))]).fit(X_train, y_train)\n",
"sklearn2pmml(pipeline, \"DecisionTreeRegressor_melb.pmml\", with_repr = True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"* re-import a pmml model\n",
" * see https://stackoverflow.com/questions/52393301/use-pmml-models-in-python"
]
},
{
"cell_type": "raw",
"metadata": {
"tags": []
},
"source": [
"## !pip install pypmml\n",
"## not run: long running time\n",
"from pypmml import Model\n",
"new_model = Model.fromFile(\"StandardScaler_melb.pmml\")\n",
"result = new_model.predict(X_train)\n",
"print(result.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Der Modellierungsprozess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"(kein Code unter diesem Titel)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## pyCaret"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"vgl. `5.5 Deployment und Abschluss - pycaret.ipynb`"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "5",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "5 Deployment und Abschluss",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "165px"
},
"toc_section_display": true,
"toc_window_display": true
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "321.85px",
"left": "790px",
"right": "20px",
"top": "113px",
"width": "350px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,118 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e0323bfc",
"metadata": {},
"source": [
"# Nachträge"
]
},
{
"cell_type": "markdown",
"id": "41f6b0e4",
"metadata": {},
"source": [
"* dieses Dokument kann von Kurstag zu Kurstag ergänzt werden, daher sollte es ebenfalls jeweils neu heruntergeladen werden"
]
},
{
"cell_type": "markdown",
"id": "0fce9c83",
"metadata": {},
"source": [
"## Extra Themen"
]
},
{
"cell_type": "markdown",
"id": "7673663f",
"metadata": {},
"source": [
"* eine Zusammenstellung von Themen aus der Präsentation, welche **nicht** prüfungsrelevant sind, zusätzlich zu den explizit mit (i) gekennzeichneten Folien\n",
"* die Zusammenstellung wird fortlaufend ergänzt n.n."
]
},
{
"cell_type": "markdown",
"id": "65be8be0",
"metadata": {},
"source": [
"|Kapitel |Folie(n)\n",
"|:------------------------------------------------------------|:-------\n",
"|**1. Feature Engineering** |\n",
"|1.1.4.5 Begriffe - Homonyme und Synonyme |20-21\n",
"|1.2.3.1 Numerische Variablen - Quantitativ (Nachbemerkung) |24\n",
"|1.2.4.5 Lineare Zusammenhänge, etc. |49-55"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
},
"toc": {
"base_numbering": "9",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "9 Nachträge Allgemein",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "240.667px"
},
"toc_section_display": true,
"toc_window_display": true
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+193
View File
@@ -0,0 +1,193 @@
"""
Useful functions for the example notebooks and workshop solutions
of the course Practical Machine Learning - Supervised Learning
Bern University of Applied Sciences (BFH)
"""
# ========== Packages ==========
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# ========== Functions ==========
def prep_data(dataset, target, train_ratio = 2 / 3, seed = None, sep = ','):
    """Load a CSV dataset and split it into train and test partitions.

    Steps performed:
      1. read the CSV file with pandas
      2. separate the feature columns from the target column
      3. split features and target into a train part and a test part

    Parameters
    ----------
    dataset : name/path of the CSV file, handed to ``pd.read_csv``
    target : name of the target column
    train_ratio : forwarded as ``train_size`` to ``train_test_split``
        (optional, default 2 / 3)
    seed : forwarded as ``random_state`` to ``train_test_split``
        (optional, default None)
    sep : column separator of the CSV file (optional, default ',')

    Returns
    -------
    X_train : feature matrix of the train set
    X_test : feature matrix of the test set
    y_train : target vector of the train set
    y_test : target vector of the test set
    """
    frame = pd.read_csv(dataset, sep=sep)

    # everything except the target column is treated as a feature
    predictors = frame.drop(columns=target)
    labels = frame[target]

    # local import: scikit-learn is only needed once this function is called
    from sklearn.model_selection import train_test_split
    return train_test_split(
        predictors,
        labels,
        train_size=train_ratio,
        random_state=seed)
def prep_demo_data(dataset, target):
    """Load a comma-separated demo dataset and separate features from target.

    Parameters
    ----------
    dataset : name/path of the CSV file (',' separated), handed to ``pd.read_csv``
    target : name of the target column

    Returns
    -------
    X : feature matrix (all columns except ``target``)
    y : target vector (the ``target`` column)
    """
    frame = pd.read_csv(dataset)
    predictors = frame.drop(columns=target)
    labels = frame[target]
    return predictors, labels
def inspect_decision_tree_model(model_def, features, target, figsize=(6, 6)):
    """Train a DecisionTreeClassifier, print diagnostics and visualize the tree.

    The passed ``model_def`` object itself is fitted (no copy is made).

    Parameters
    ----------
    model_def : DecisionTreeClassifier object with set parameters
    features : feature matrix; its ``columns`` attribute supplies the
        feature names shown in the plot
    target : target vector
    figsize : size of the image (optional, default (6, 6))

    Returns
    -------
    None. Prints depth, number of leaves and the score on the given
    (training) data, and draws the fitted tree on a new matplotlib figure.
    """
    from sklearn.tree import plot_tree

    model = model_def
    model.fit(features, target)

    print('TREE DIAGNOSTICS:')
    print('depth :', model.get_depth())
    print('leaves :', model.get_n_leaves())
    print('score :', model.score(features, target))

    plt.figure(figsize=figsize)
    plot_tree(
        model,
        # plain list instead of a pandas Index, as the notebooks do for export_text
        feature_names=list(features.columns),
        # class labels may be numeric; plot_tree needs them as strings
        class_names=[str(label) for label in model.classes_],
        filled=True)
def test_regression_model(model, X_train, y_train, X_test, y_test, show_plot=True):
    """Fit a regression model, report R2 on the test data and optionally plot it.

    Performs
      - training on the train data
      - prediction on the test data
      - calculation and printing of the R2 score on the test data

    Parameters
    ----------
    model : a parametrized regression model (fitted in place)
    X_train, y_train : train data
    X_test, y_test : test data
    show_plot : show a scatterplot of predicted vs. true values
        (optional, default True)

    Returns
    -------
    the fitted ``model`` (the same object that was passed in)

    Prints the R2 score; if ``show_plot`` is truthy, shows a scatterplot of
    y_test vs. y_pred with a dashed diagonal line indicating identity.
    """
    from sklearn.metrics import r2_score

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('R2 = %0.4f' % (r2_score(y_test, y_pred)))

    if show_plot:
        plt.figure(figsize=(6, 6))
        ax = sns.scatterplot(x=y_test, y=y_pred)
        ax.set(xlabel='y_test', ylabel='y_pred')
        # identity line spanning the range of the true values
        ls = np.linspace(min(y_test), max(y_test), 100)
        plt.plot(ls, ls, color='black', linestyle='dashed')
        ax.set_title(model.__class__.__name__)
        plt.show()
    return model
def show_pred_on_synth(model, X, y, X_synth, param_str):
    """Fit a univariate regression model and plot its predictions on synthetic inputs.

    Parameters
    ----------
    model : a parametrized regression model providing ``fit`` and ``predict``
    X, y : data for the univariate regression; ``X`` must offer ``to_numpy()``
        and a column named 'X'
    X_synth : synthetic feature; its first column (``X_synth[:, 0]``) is
        used as x axis of the prediction line
    param_str : parameter description appended to the plot title

    Returns
    -------
    None. Shows a scatterplot of X['X'] vs. y together with an orange line
    of the values predicted for X_synth.
    """
    # the model is trained on the bare array, not on the data frame
    model.fit(X.to_numpy(), y)
    synth_pred = model.predict(X_synth)

    ax = sns.scatterplot(x=X['X'], y=y)
    ax = sns.lineplot(x=X_synth[:, 0], y=synth_pred, color='orange')

    plot_title = model.__class__.__name__ + ' : ' + param_str
    ax.set_title(plot_title)
    ax.set(xlabel='X', ylabel='y')
    plt.show()
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6394031a-f4e6-40b6-8612-9dbcea9cc456",
"metadata": {},
"source": [
"# extra_3.2.1.4_linear_regression_in_data_analytics.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9be95ca5-b3d6-4e6f-9ca6-9d84b1bf726e",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('./')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "23f2b845-764a-46f4-8a51-e1787c05b878",
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-08T10:06:24.890328Z",
"start_time": "2020-04-08T10:06:23.220148Z"
}
},
"outputs": [],
"source": [
"## prepare env and data\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns; sns.set()\n",
"%matplotlib inline\n",
"\n",
"datapath = '../3_data'\n",
"from os import chdir; chdir(datapath)\n",
"\n",
"from bfh_cas_pml import prep_data\n",
"X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price', seed = 1234)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "27dd0331-0bdf-43d2-8dcd-29b54ff147e9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting statsmodels\n",
" Downloading statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.2 kB)\n",
"Requirement already satisfied: numpy<3,>=1.22.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (2.2.6)\n",
"Requirement already satisfied: scipy!=1.9.2,>=1.8 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (1.15.3)\n",
"Requirement already satisfied: pandas!=2.1.0,>=1.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (2.2.3)\n",
"Collecting patsy>=0.5.6 (from statsmodels)\n",
" Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)\n",
"Requirement already satisfied: packaging>=21.3 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from statsmodels) (25.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas!=2.1.0,>=1.4->statsmodels) (1.17.0)\n",
"Downloading statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl (9.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.9/9.9 MB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)\n",
"Installing collected packages: patsy, statsmodels\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2/2\u001b[0m [statsmodels]\u001b[0m [statsmodels]\n",
"\u001b[1A\u001b[2KSuccessfully installed patsy-1.0.1 statsmodels-0.14.4\n"
]
}
],
"source": [
"!pip install statsmodels"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c6ce688e-aea0-4030-afe7-1dbf56b03ea1",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: Price R-squared: 0.586\n",
"Model: OLS Adj. R-squared: 0.585\n",
"Method: Least Squares F-statistic: 752.8\n",
"Date: Sat, 21 Jun 2025 Prob (F-statistic): 0.00\n",
"Time: 22:34:27 Log-Likelihood: -1.7592e+05\n",
"No. Observations: 12262 AIC: 3.519e+05\n",
"Df Residuals: 12238 BIC: 3.521e+05\n",
"Df Model: 23 \n",
"Covariance Type: nonrobust \n",
"=========================================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------------------------\n",
"const -1.055e+08 1.99e+07 -5.300 0.000 -1.45e+08 -6.65e+07\n",
"Rooms 2.454e+05 5402.990 45.416 0.000 2.35e+05 2.56e+05\n",
"Type -1.414e+05 6342.919 -22.286 0.000 -1.54e+05 -1.29e+05\n",
"Distance -4.038e+04 928.303 -43.503 0.000 -4.22e+04 -3.86e+04\n",
"Bathroom 1.613e+05 7004.805 23.032 0.000 1.48e+05 1.75e+05\n",
"Car 4.039e+04 4723.191 8.552 0.000 3.11e+04 4.96e+04\n",
"logLandsize 8.33e+04 5149.484 16.177 0.000 7.32e+04 9.34e+04\n",
"logBuildingArea 2.738e+05 2.27e+04 12.064 0.000 2.29e+05 3.18e+05\n",
"YearBuilt -2484.2291 162.873 -15.253 0.000 -2803.486 -2164.972\n",
"CouncilArea -4977.2245 674.748 -7.376 0.000 -6299.838 -3654.611\n",
"Lattitude -5.15e+05 7.24e+04 -7.114 0.000 -6.57e+05 -3.73e+05\n",
"Longtitude 1.926e+05 5.94e+04 3.241 0.001 7.61e+04 3.09e+05\n",
"Propertycount -1.1982 0.893 -1.342 0.180 -2.948 0.552\n",
"Method_S 9.427e+04 1.17e+04 8.065 0.000 7.14e+04 1.17e+05\n",
"Method_SP 4.166e+04 1.51e+04 2.760 0.006 1.21e+04 7.13e+04\n",
"Method_VB 5.418e+04 1.63e+04 3.315 0.001 2.21e+04 8.62e+04\n",
"Regionname_Northern_Metropolitan -1.85e+05 1.62e+04 -11.404 0.000 -2.17e+05 -1.53e+05\n",
"Regionname_South_Eastern_Metropolitan 8.791e+04 2.69e+04 3.269 0.001 3.52e+04 1.41e+05\n",
"Regionname_Southern_Metropolitan 2.44e+05 1.54e+04 15.820 0.000 2.14e+05 2.74e+05\n",
"Regionname_Victoria 2.656e+05 4.46e+04 5.954 0.000 1.78e+05 3.53e+05\n",
"Regionname_Western_Metropolitan -2.317e+05 1.93e+04 -12.030 0.000 -2.69e+05 -1.94e+05\n",
"month 1756.1890 1674.442 1.049 0.294 -1525.982 5038.360\n",
"year 3.116e+04 8889.372 3.505 0.000 1.37e+04 4.86e+04\n",
"day_of_week 4708.3047 3427.552 1.374 0.170 -2010.239 1.14e+04\n",
"==============================================================================\n",
"Omnibus: 6530.672 Durbin-Watson: 2.000\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 95692.398\n",
"Skew: 2.225 Prob(JB): 0.00\n",
"Kurtosis: 15.942 Cond. No. 4.86e+07\n",
"==============================================================================\n",
"\n",
"Notes:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The condition number is large, 4.86e+07. This might indicate that there are\n",
"strong multicollinearity or other numerical problems.\n"
]
}
],
"source": [
"import statsmodels.api as sm\n",
"X_train_ = sm.add_constant(X_train)\n",
"model = sm.OLS(y_train, X_train_, hasconst=True)\n",
"results = model.fit()\n",
"print(results.summary())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -0,0 +1,341 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# extra_3.3.4_weitere_ensemble_regressoren.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('./')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"## for scikit-learn 1.4.2, to silence warnings regarding physical cores\n",
"import os\n",
"os.environ['LOKY_MAX_CPU_COUNT'] = '4' ## depending on the hardware used"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-08T10:06:24.890328Z",
"start_time": "2020-04-08T10:06:23.220148Z"
}
},
"outputs": [],
"source": [
"## prepare environment and data\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns; sns.set()\n",
"%matplotlib inline\n",
"\n",
"datapath = '../3_data'\n",
"from os import chdir; chdir(datapath)\n",
"\n",
"from bfh_cas_pml import prep_data, prep_demo_data\n",
"X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price', seed = 1234)\n",
"\n",
"from bfh_cas_pml import test_regression_model\n",
"\n",
"names = []\n",
"scores = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**AdaBoostRegressor**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-08T10:06:45.098899Z",
"start_time": "2020-04-08T10:06:44.257283Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R2 = -0.3023\n"
]
}
],
"source": [
"from sklearn.ensemble import AdaBoostRegressor\n",
"this_model = test_regression_model(\n",
" AdaBoostRegressor(random_state=1234), \n",
" X_train, y_train, X_test, y_test,\n",
" show_plot=False)\n",
"names.append(this_model.__class__.__name__)\n",
"scores.append(this_model.score(X_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**GradientBoostingRegressor**"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-08T10:06:45.952822Z",
"start_time": "2020-04-08T10:06:45.101810Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R2 = 0.7250\n"
]
}
],
"source": [
"from sklearn.ensemble import GradientBoostingRegressor\n",
"this_model = test_regression_model(\n",
" GradientBoostingRegressor(random_state=1234), \n",
" X_train, y_train, X_test, y_test,\n",
" show_plot=False)\n",
"names.append(this_model.__class__.__name__)\n",
"scores.append(this_model.score(X_test, y_test))"
]
},
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**HistGradientBoostingRegressor**\n",
    "\n",
    "\"This estimator is much faster than GradientBoostingRegressor for big datasets (n_samples >= 10 000).\" ([scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor))"
   ]
  },
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-08T10:06:48.544788Z",
"start_time": "2020-04-08T10:06:45.955387Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R2 = 0.7846\n"
]
}
],
"source": [
"from sklearn.ensemble import HistGradientBoostingRegressor\n",
"this_model = test_regression_model(\n",
" HistGradientBoostingRegressor(), \n",
" X_train, y_train, X_test, y_test,\n",
" show_plot=False)\n",
"names.append(this_model.__class__.__name__)\n",
"scores.append(this_model.score(X_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**CatBoostRegressor**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R2 = 0.8003\n"
]
}
],
"source": [
"from catboost import CatBoostRegressor\n",
"this_model = test_regression_model(\n",
" CatBoostRegressor(logging_level='Silent'), \n",
" X_train, y_train, X_test, y_test,\n",
" show_plot=False)\n",
"names.append(this_model.__class__.__name__)\n",
"scores.append(this_model.score(X_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**LGBMRegressor**"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000113 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 1630\n",
"[LightGBM] [Info] Number of data points in the train set: 12262, number of used features: 23\n",
"[LightGBM] [Info] Start training from score 1055902.695237\n",
"R2 = 0.7882\n"
]
}
],
"source": [
"from lightgbm import LGBMRegressor\n",
"this_model = test_regression_model(\n",
" LGBMRegressor(), \n",
" X_train, y_train, X_test, y_test,\n",
" show_plot=False)\n",
"names.append(this_model.__class__.__name__)\n",
"scores.append(this_model.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" models scores\n",
"0 AdaBoostRegressor -0.302314\n",
"1 GradientBoostingRegressor 0.724983\n",
"2 HistGradientBoostingRegressor 0.784615\n",
"3 CatBoostRegressor 0.800349\n",
"4 LGBMRegressor 0.788166\n"
]
}
],
"source": [
"## synthesis\n",
"print(pd.DataFrame({'models': names, 'scores': scores}))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "3.3",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "3.3 Regression - ML Methoden",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "186.867px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-autonumbering": true,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "321.85px",
"left": "787px",
"right": "20px",
"top": "115px",
"width": "350px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long