refactor: move things around

This commit is contained in:
2026-05-21 14:16:30 +02:00
parent 2fce3281a3
commit 41e15ed275
124 changed files with 404226 additions and 0 deletions
@@ -0,0 +1,517 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Feature Engineering\n",
"# Klassifikation\n",
"# Regression\n",
"# Validierung und mehr\n",
"# Deployment und Abschluss"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('./')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-04-08T10:06:24.890328Z",
"start_time": "2020-04-08T10:06:23.220148Z"
},
"tags": []
},
"outputs": [],
"source": [
"## prepare environment\n",
"import pandas as pd\n",
"import numpy as np\n",
"datapath = '../3_data'\n",
"from os import chdir; chdir(datapath)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Das finale Modell\n",
"## Feature Engineering in der Produktion\n",
"### Missing Values\n",
"### Neue Kategorien\n",
"### Protokollieren"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"\n",
"data = pd.read_csv('bank_data.csv', sep=';')\n",
"\n",
"## number of NA's per feature\n",
"s = data.isna().sum()\n",
"## names of the not numeric (object dtype) features\n",
"catcolnames = data.select_dtypes(include='object').columns\n",
"\n",
"## write the protocol; the with-block closes the log file even if a write raises\n",
"with open('fe_prod_log.log', 'w') as f:\n",
"    f.write(datetime.datetime.now().strftime(\"[%Y-%m-%d %H:%M:%S] (timestamp)\"))\n",
"    f.write('\\n\\n')\n",
"\n",
"    ## only features that actually contain NA's are listed\n",
"    f.write('features with NA\\'s\\n')\n",
"    f.write('=======================')\n",
"    f.write('\\n')\n",
"    f.write(s[s > 0].to_string())\n",
"    f.write('\\n\\n')\n",
"\n",
"    ## value counts of not numeric features, sorted by level\n",
"    f.write('categorical cols levels\\n')\n",
"    f.write('=======================')\n",
"    f.write('\\n')\n",
"    for ccn in catcolnames:\n",
"        f.write(ccn)\n",
"        f.write('\\n')\n",
"        f.write(data[ccn].value_counts().sort_index().to_string())\n",
"        f.write('\\n\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modellübergabe in die Produktion"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Modelle speichern scikit-learn intern"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"## load data\n",
"from bfh_cas_pml import prep_data\n",
"X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## three models, each fitted on the training data\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"## StandardScaler (fitted on the features only)\n",
"model_sc = StandardScaler()\n",
"model_sc.fit(X_train)\n",
"\n",
"## LinearRegression\n",
"model_lr = LinearRegression()\n",
"model_lr.fit(X_train, y_train)\n",
"\n",
"## DecisionTreeRegressor (shallow tree, fixed seed)\n",
"model_dt = DecisionTreeRegressor(max_depth=2, random_state=1234)\n",
"model_dt.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"## save models: one pickle file per fitted model\n",
"import pickle\n",
"\n",
"models_to_save = {\n",
"    'model_sc.pkl': model_sc,\n",
"    'model_lr.pkl': model_lr,\n",
"    'model_dt.pkl': model_dt,\n",
"}\n",
"for pickle_path, fitted_model in models_to_save.items():\n",
"    with open(pickle_path, 'wb') as pickle_file:\n",
"        pickle.dump(fitted_model, pickle_file)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"## reload models: read the three pickle files back in the order they were saved\n",
"reloaded_models = []\n",
"for pickle_path in ('model_sc.pkl', 'model_lr.pkl', 'model_dt.pkl'):\n",
"    with open(pickle_path, 'rb') as pickle_file:\n",
"        reloaded_models.append(pickle.load(pickle_file))\n",
"\n",
"model_sc_2, model_lr_2, model_dt_2 = reloaded_models"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
"\n",
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
"\n",
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
"\n",
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
"\n"
]
}
],
"source": [
"## compare model_sc: mean_ and var_ of the original scaler, then of the reloaded copy\n",
"for scaler in (model_sc, model_sc_2):\n",
"    print(scaler.mean_, '\\n')\n",
"    print(scaler.var_, '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-148600153.0644718\n",
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
" 1.27253408e+03 5.38476338e+04 3.83311293e+03] \n",
"\n",
"-148600153.0644718\n",
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
" 1.27253408e+03 5.38476338e+04 3.83311293e+03]\n"
]
}
],
"source": [
"## compare model_lr\n",
"print(model_lr.intercept_)\n",
"print(model_lr.coef_, '\\n')\n",
"print(model_lr_2.intercept_)\n",
"print(model_lr_2.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"|--- Rooms <= 3.50\n",
"| |--- Type <= 1.50\n",
"| | |--- value: [1077045.50]\n",
"| |--- Type > 1.50\n",
"| | |--- value: [682863.38]\n",
"|--- Rooms > 3.50\n",
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
"| | |--- value: [1163850.50]\n",
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
"| | |--- value: [2113002.90]\n",
"\n",
"|--- Rooms <= 3.50\n",
"| |--- Type <= 1.50\n",
"| | |--- value: [1077045.50]\n",
"| |--- Type > 1.50\n",
"| | |--- value: [682863.38]\n",
"|--- Rooms > 3.50\n",
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
"| | |--- value: [1163850.50]\n",
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
"| | |--- value: [2113002.90]\n",
"\n"
]
}
],
"source": [
"## compare model_dt: text rendering of the original tree, then of the reloaded copy\n",
"from sklearn.tree import export_text\n",
"\n",
"feature_names = list(X_train.columns)\n",
"for tree_model in (model_dt, model_dt_2):\n",
"    print(export_text(tree_model, feature_names=feature_names))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Modelle speichern extern mit PMML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Vorbereitungen:** \n",
"* Java Runtime muss vorhanden sein\n",
"* Library installieren direkt in Notebook: \n",
"`!pip install sklearn2pmml`"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: sklearn2pmml in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.119.1)\n",
"Requirement already satisfied: dill>=0.3.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (0.4.0)\n",
"Requirement already satisfied: joblib>=0.13.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.5.1)\n",
"Requirement already satisfied: pandas>=1.5.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (2.2.3)\n",
"Requirement already satisfied: scikit-learn>=1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.6.1)\n",
"Requirement already satisfied: numpy>=1.26.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.2.6)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->sklearn2pmml) (1.17.0)\n",
"Requirement already satisfied: scipy>=1.6.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (1.15.3)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (3.6.0)\n"
]
}
],
"source": [
"%pip install sklearn2pmml"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"## import libraries\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn2pmml import PMMLPipeline, sklearn2pmml"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"## StandardScaler\n",
"from sklearn.preprocessing import StandardScaler\n",
"pipeline = PMMLPipeline([(\"scaler\", StandardScaler())]).fit(X_train)\n",
"sklearn2pmml(pipeline, \"StandardScaler_melb.pmml\", with_repr = True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"## LinearRegression\n",
"pipeline = PMMLPipeline([('regressor', LinearRegression())]).fit(X_train, y_train)\n",
"sklearn2pmml(pipeline, \"LinearRegression_melb.pmml\", with_repr = True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"## DecisionTreeRegressor\n",
"pipeline = PMMLPipeline([('regressor', DecisionTreeRegressor(max_depth= 2, random_state=1234))]).fit(X_train, y_train)\n",
"sklearn2pmml(pipeline, \"DecisionTreeRegressor_melb.pmml\", with_repr = True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"* re-import a pmml model\n",
" * see https://stackoverflow.com/questions/52393301/use-pmml-models-in-python"
]
},
{
"cell_type": "raw",
"metadata": {
"tags": []
},
"source": [
"## !pip install pypmml\n",
"## not run: long running time\n",
"from pypmml import Model\n",
"new_model = Model.fromFile(\"StandardScaler_melb.pmml\")\n",
"result = new_model.predict(X_train)\n",
"print(result.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Der Modellierungsprozess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"(kein Code unter diesem Titel)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## pyCaret"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"vgl. `5.5 Deployment und Abschluss - pycaret.ipynb`"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"toc": {
"base_numbering": "5",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "5 Deployment und Abschluss",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "165px"
},
"toc_section_display": true,
"toc_window_display": true
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "321.85px",
"left": "790px",
"right": "20px",
"top": "113px",
"width": "350px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}