518 lines
15 KiB
Plaintext
518 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Feature Engineering\n",
|
|
"# Klassifikation\n",
|
|
"# Regression\n",
|
|
"# Validierung und mehr\n",
|
|
"# Deployment und Abschluss"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import sys\n",
|
|
"sys.path.append('./')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-04-08T10:06:24.890328Z",
|
|
"start_time": "2020-04-08T10:06:23.220148Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## prepare environment\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"datapath = '../3_data'\n",
|
|
"from os import chdir; chdir(datapath)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Das finale Modell\n",
|
|
"## Feature Engineering in der Produktion\n",
|
|
"### Missing Values\n",
|
|
"### Neue Kategorien\n",
|
|
"### Protokollieren"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data = pd.read_csv('bank_data.csv', sep=';')\n",
|
|
"\n",
|
|
"import datetime\n",
|
|
"\n",
|
|
"f = open('fe_prod_log.log','w')\n",
|
|
"f.write(datetime.datetime.now().strftime(\"[%Y-%m-%d %H:%M:%S] (timestamp)\"))\n",
|
|
"\n",
|
|
"f.write('\\n\\n')\n",
|
|
"\n",
|
|
"s = data.isna().sum()\n",
|
|
"f.write('features with NA\\'s\\n')\n",
|
|
"f.write('=======================')\n",
|
|
"\n",
|
|
"f.write('\\n')\n",
|
|
"f.write(s[s > 0].to_string())\n",
|
|
"f.write('\\n\\n')\n",
|
|
"\n",
|
|
"## value counts of non-numeric features\n",
|
|
"f.write('categorical cols levels\\n')\n",
|
|
"f.write('=======================')\n",
|
|
"f.write('\\n')\n",
|
|
"catcolnames = data.select_dtypes(include='object').columns\n",
|
|
"for ccn in catcolnames:\n",
|
|
" f.write(ccn)\n",
|
|
" f.write('\\n')\n",
|
|
" f.write(data[ccn].value_counts().sort_index().to_string())\n",
|
|
" f.write('\\n\\n')\n",
|
|
"f.close() "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Modellübergabe in die Produktion"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Modelle speichern scikit-learn intern"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## load data\n",
|
|
"from bfh_cas_pml import prep_data\n",
|
|
"X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## three models:\n",
|
|
"\n",
|
|
"## StandardScaler\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"model_sc = StandardScaler().fit(X_train)\n",
|
|
"\n",
|
|
"## LinearRegression\n",
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"model_lr = LinearRegression().fit(X_train, y_train)\n",
|
|
"\n",
|
|
"## DecisionTreeRegressor\n",
|
|
"from sklearn.tree import DecisionTreeRegressor\n",
|
|
"model_dt = DecisionTreeRegressor(max_depth= 2, random_state=1234).fit(X_train, y_train)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## save models\n",
|
|
"import pickle \n",
|
|
"with open('model_sc.pkl', 'wb') as pickle_file:\n",
|
|
" pickle.dump(model_sc, pickle_file)\n",
|
|
"with open('model_lr.pkl', 'wb') as pickle_file:\n",
|
|
" pickle.dump(model_lr, pickle_file)\n",
|
|
"with open('model_dt.pkl', 'wb') as pickle_file:\n",
|
|
" pickle.dump(model_dt, pickle_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## reload models\n",
|
|
"with open('model_sc.pkl', 'rb') as pickle_file:\n",
|
|
" model_sc_2 = pickle.load(pickle_file)\n",
|
|
"with open('model_lr.pkl', 'rb') as pickle_file:\n",
|
|
" model_lr_2 = pickle.load(pickle_file)\n",
|
|
"with open('model_dt.pkl', 'rb') as pickle_file:\n",
|
|
" model_dt_2 = pickle.load(pickle_file) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
|
|
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
|
|
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
|
|
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
|
|
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
|
|
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
|
|
"\n",
|
|
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
|
|
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
|
|
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
|
|
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
|
|
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
|
|
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
|
|
"\n",
|
|
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
|
|
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
|
|
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
|
|
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
|
|
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
|
|
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
|
|
"\n",
|
|
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
|
|
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
|
|
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
|
|
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
|
|
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
|
|
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"## compare model_sc\n",
|
|
"print(model_sc.mean_, '\\n')\n",
|
|
"print(model_sc.var_, '\\n')\n",
|
|
"print(model_sc_2.mean_, '\\n')\n",
|
|
"print(model_sc_2.var_, '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"-148600153.0644718\n",
|
|
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
|
|
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
|
|
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
|
|
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
|
|
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
|
|
" 1.27253408e+03 5.38476338e+04 3.83311293e+03] \n",
|
|
"\n",
|
|
"-148600153.0644718\n",
|
|
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
|
|
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
|
|
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
|
|
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
|
|
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
|
|
" 1.27253408e+03 5.38476338e+04 3.83311293e+03]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"## compare model_lr\n",
|
|
"print(model_lr.intercept_)\n",
|
|
"print(model_lr.coef_, '\\n')\n",
|
|
"print(model_lr_2.intercept_)\n",
|
|
"print(model_lr_2.coef_)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"|--- Rooms <= 3.50\n",
|
|
"| |--- Type <= 1.50\n",
|
|
"| | |--- value: [1077045.50]\n",
|
|
"| |--- Type > 1.50\n",
|
|
"| | |--- value: [682863.38]\n",
|
|
"|--- Rooms > 3.50\n",
|
|
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
|
|
"| | |--- value: [1163850.50]\n",
|
|
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
|
|
"| | |--- value: [2113002.90]\n",
|
|
"\n",
|
|
"|--- Rooms <= 3.50\n",
|
|
"| |--- Type <= 1.50\n",
|
|
"| | |--- value: [1077045.50]\n",
|
|
"| |--- Type > 1.50\n",
|
|
"| | |--- value: [682863.38]\n",
|
|
"|--- Rooms > 3.50\n",
|
|
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
|
|
"| | |--- value: [1163850.50]\n",
|
|
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
|
|
"| | |--- value: [2113002.90]\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"## compare model_dt\n",
|
|
"from sklearn.tree import export_text\n",
|
|
"print(export_text(\n",
|
|
" model_dt, feature_names=list(X_train.columns)))\n",
|
|
"print(export_text(\n",
|
|
" model_dt_2, feature_names=list(X_train.columns)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Modelle speichern extern mit PMML"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Vorbereitungen:** \n",
|
|
"* Java Runtime muss vorhanden sein\n",
|
|
"* Library installieren direkt in Notebook: \n",
|
|
"`!pip install sklearn2pmml`"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Requirement already satisfied: sklearn2pmml in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.119.1)\n",
|
|
"Requirement already satisfied: dill>=0.3.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (0.4.0)\n",
|
|
"Requirement already satisfied: joblib>=0.13.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.5.1)\n",
|
|
"Requirement already satisfied: pandas>=1.5.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (2.2.3)\n",
|
|
"Requirement already satisfied: scikit-learn>=1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.6.1)\n",
|
|
"Requirement already satisfied: numpy>=1.26.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.2.6)\n",
|
|
"Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.9.0.post0)\n",
|
|
"Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
|
|
"Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
|
|
"Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->sklearn2pmml) (1.17.0)\n",
|
|
"Requirement already satisfied: scipy>=1.6.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (1.15.3)\n",
|
|
"Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (3.6.0)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install sklearn2pmml"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## import libraries\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn2pmml import PMMLPipeline, sklearn2pmml"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"## StandardScaler\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"pipeline = PMMLPipeline([(\"scaler\", StandardScaler())]).fit(X_train)\n",
|
|
"sklearn2pmml(pipeline, \"StandardScaler_melb.pmml\", with_repr = True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## LinearRegression\n",
|
|
"pipeline = PMMLPipeline([('regressor', LinearRegression())]).fit(X_train, y_train)\n",
|
|
"sklearn2pmml(pipeline, \"LinearRegression_melb.pmml\", with_repr = True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## DecisionTreeRegressor\n",
|
|
"pipeline = PMMLPipeline([('regressor', DecisionTreeRegressor(max_depth= 2, random_state=1234))]).fit(X_train, y_train)\n",
|
|
"sklearn2pmml(pipeline, \"DecisionTreeRegressor_melb.pmml\", with_repr = True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"* re-import a pmml model\n",
|
|
" * see https://stackoverflow.com/questions/52393301/use-pmml-models-in-python"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "raw",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## !pip install pypmml\n",
|
|
"## not run: long running time\n",
|
|
"from pypmml import Model\n",
|
|
"new_model = Model.fromFile(\"StandardScaler_melb.pmml\")\n",
|
|
"result = new_model.predict(X_train)\n",
|
|
"print(result.describe())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Der Modellierungsprozess"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"(kein Code unter diesem Titel)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## pyCaret"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"vgl. `5.5 Deployment und Abschluss - pycaret.ipynb`"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.0"
|
|
},
|
|
"toc": {
|
|
"base_numbering": "5",
|
|
"nav_menu": {},
|
|
"number_sections": true,
|
|
"sideBar": true,
|
|
"skip_h1_title": false,
|
|
"title_cell": "5 Deployment und Abschluss",
|
|
"title_sidebar": "Contents",
|
|
"toc_cell": true,
|
|
"toc_position": {
|
|
"height": "calc(100% - 180px)",
|
|
"left": "10px",
|
|
"top": "150px",
|
|
"width": "165px"
|
|
},
|
|
"toc_section_display": true,
|
|
"toc_window_display": true
|
|
},
|
|
"varInspector": {
|
|
"cols": {
|
|
"lenName": 16,
|
|
"lenType": 16,
|
|
"lenVar": 40
|
|
},
|
|
"kernels_config": {
|
|
"python": {
|
|
"delete_cmd_postfix": "",
|
|
"delete_cmd_prefix": "del ",
|
|
"library": "var_list.py",
|
|
"varRefreshCmd": "print(var_dic_list())"
|
|
},
|
|
"r": {
|
|
"delete_cmd_postfix": ") ",
|
|
"delete_cmd_prefix": "rm(",
|
|
"library": "var_list.r",
|
|
"varRefreshCmd": "cat(var_dic_list()) "
|
|
}
|
|
},
|
|
"position": {
|
|
"height": "321.85px",
|
|
"left": "790px",
|
|
"right": "20px",
|
|
"top": "113px",
|
|
"width": "350px"
|
|
},
|
|
"types_to_exclude": [
|
|
"module",
|
|
"function",
|
|
"builtin_function_or_method",
|
|
"instance",
|
|
"_Feature"
|
|
],
|
|
"window_display": false
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|