refactor: move things around
This commit is contained in:
@@ -0,0 +1,517 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# Feature Engineering\n",
|
||||
"# Klassifikation\n",
|
||||
"# Regression\n",
|
||||
"# Validierung und mehr\n",
|
||||
"# Deployment und Abschluss"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append('./')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2020-04-08T10:06:24.890328Z",
|
||||
"start_time": "2020-04-08T10:06:23.220148Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## prepare environment\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"datapath = '../3_data'\n",
|
||||
"from os import chdir; chdir(datapath)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Das finale Modell\n",
|
||||
"## Feature Engineering in der Produktion\n",
|
||||
"### Missing Values\n",
|
||||
"### Neue Kategorien\n",
|
||||
"### Protokollieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = pd.read_csv('bank_data.csv', sep=';')\n",
|
||||
"\n",
|
||||
"import datetime\n",
|
||||
"\n",
|
||||
"f = open('fe_prod_log.log','w')\n",
|
||||
"f.write(datetime.datetime.now().strftime(\"[%Y-%m-%d %H:%M:%S] (timestamp)\"))\n",
|
||||
"\n",
|
||||
"f.write('\\n\\n')\n",
|
||||
"\n",
|
||||
"s = data.isna().sum()\n",
|
||||
"f.write('features with NA\\'s\\n')\n",
|
||||
"f.write('=======================')\n",
|
||||
"\n",
|
||||
"f.write('\\n')\n",
|
||||
"f.write(s[s > 0].to_string())\n",
|
||||
"f.write('\\n\\n')\n",
|
||||
"\n",
|
||||
"## value counts of non-numeric features\n",
|
||||
"f.write('categorical cols levels\\n')\n",
|
||||
"f.write('=======================')\n",
|
||||
"f.write('\\n')\n",
|
||||
"catcolnames = data.select_dtypes(include='object').columns\n",
|
||||
"for ccn in catcolnames:\n",
|
||||
" f.write(ccn)\n",
|
||||
" f.write('\\n')\n",
|
||||
" f.write(data[ccn].value_counts().sort_index().to_string())\n",
|
||||
" f.write('\\n\\n')\n",
|
||||
"f.close() "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Modellübergabe in die Produktion"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Modelle speichern scikit-learn intern"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## load data\n",
|
||||
"from bfh_cas_pml import prep_data\n",
|
||||
"X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## three models:\n",
|
||||
"\n",
|
||||
"## StandardScaler\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"model_sc = StandardScaler().fit(X_train)\n",
|
||||
"\n",
|
||||
"## LinearRegression\n",
|
||||
"from sklearn.linear_model import LinearRegression\n",
|
||||
"model_lr = LinearRegression().fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"## DecisionTreeRegressor\n",
|
||||
"from sklearn.tree import DecisionTreeRegressor\n",
|
||||
"model_dt = DecisionTreeRegressor(max_depth= 2, random_state=1234).fit(X_train, y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## save models\n",
|
||||
"import pickle \n",
|
||||
"with open('model_sc.pkl', 'wb') as pickle_file:\n",
|
||||
" pickle.dump(model_sc, pickle_file)\n",
|
||||
"with open('model_lr.pkl', 'wb') as pickle_file:\n",
|
||||
" pickle.dump(model_lr, pickle_file)\n",
|
||||
"with open('model_dt.pkl', 'wb') as pickle_file:\n",
|
||||
" pickle.dump(model_dt, pickle_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## reload models\n",
|
||||
"with open('model_sc.pkl', 'rb') as pickle_file:\n",
|
||||
" model_sc_2 = pickle.load(pickle_file)\n",
|
||||
"with open('model_lr.pkl', 'rb') as pickle_file:\n",
|
||||
" model_lr_2 = pickle.load(pickle_file)\n",
|
||||
"with open('model_dt.pkl', 'rb') as pickle_file:\n",
|
||||
" model_dt_2 = pickle.load(pickle_file) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
|
||||
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
|
||||
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
|
||||
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
|
||||
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
|
||||
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
|
||||
"\n",
|
||||
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
|
||||
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
|
||||
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
|
||||
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
|
||||
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
|
||||
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
|
||||
"\n",
|
||||
"[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
|
||||
" 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
|
||||
" 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
|
||||
" 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n",
|
||||
" 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n",
|
||||
" 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n",
|
||||
"\n",
|
||||
"[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n",
|
||||
" 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n",
|
||||
" 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n",
|
||||
" 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n",
|
||||
" 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n",
|
||||
" 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## compare model_sc\n",
|
||||
"print(model_sc.mean_, '\\n')\n",
|
||||
"print(model_sc.var_, '\\n')\n",
|
||||
"print(model_sc_2.mean_, '\\n')\n",
|
||||
"print(model_sc_2.var_, '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"-148600153.0644718\n",
|
||||
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
|
||||
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
|
||||
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
|
||||
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
|
||||
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
|
||||
" 1.27253408e+03 5.38476338e+04 3.83311293e+03] \n",
|
||||
"\n",
|
||||
"-148600153.0644718\n",
|
||||
"[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n",
|
||||
" 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n",
|
||||
" -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n",
|
||||
" 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n",
|
||||
" 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n",
|
||||
" 1.27253408e+03 5.38476338e+04 3.83311293e+03]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## compare model_lr\n",
|
||||
"print(model_lr.intercept_)\n",
|
||||
"print(model_lr.coef_, '\\n')\n",
|
||||
"print(model_lr_2.intercept_)\n",
|
||||
"print(model_lr_2.coef_)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"|--- Rooms <= 3.50\n",
|
||||
"| |--- Type <= 1.50\n",
|
||||
"| | |--- value: [1077045.50]\n",
|
||||
"| |--- Type > 1.50\n",
|
||||
"| | |--- value: [682863.38]\n",
|
||||
"|--- Rooms > 3.50\n",
|
||||
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
|
||||
"| | |--- value: [1163850.50]\n",
|
||||
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
|
||||
"| | |--- value: [2113002.90]\n",
|
||||
"\n",
|
||||
"|--- Rooms <= 3.50\n",
|
||||
"| |--- Type <= 1.50\n",
|
||||
"| | |--- value: [1077045.50]\n",
|
||||
"| |--- Type > 1.50\n",
|
||||
"| | |--- value: [682863.38]\n",
|
||||
"|--- Rooms > 3.50\n",
|
||||
"| |--- Regionname_Southern_Metropolitan <= 0.50\n",
|
||||
"| | |--- value: [1163850.50]\n",
|
||||
"| |--- Regionname_Southern_Metropolitan > 0.50\n",
|
||||
"| | |--- value: [2113002.90]\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## compare model_dt\n",
|
||||
"from sklearn.tree import export_text\n",
|
||||
"print(export_text(\n",
|
||||
" model_dt, feature_names=list(X_train.columns)))\n",
|
||||
"print(export_text(\n",
|
||||
" model_dt_2, feature_names=list(X_train.columns)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Modelle speichern extern mit PMML"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Vorbereitungen:** \n",
|
||||
"* Java Runtime muss vorhanden sein\n",
|
||||
"* Library direkt im Notebook installieren: \n",
|
||||
"`!pip install sklearn2pmml`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: sklearn2pmml in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.119.1)\n",
|
||||
"Requirement already satisfied: dill>=0.3.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (0.4.0)\n",
|
||||
"Requirement already satisfied: joblib>=0.13.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.5.1)\n",
|
||||
"Requirement already satisfied: pandas>=1.5.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (2.2.3)\n",
|
||||
"Requirement already satisfied: scikit-learn>=1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.6.1)\n",
|
||||
"Requirement already satisfied: numpy>=1.26.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.2.6)\n",
|
||||
"Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.9.0.post0)\n",
|
||||
"Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
|
||||
"Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n",
|
||||
"Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->sklearn2pmml) (1.17.0)\n",
|
||||
"Requirement already satisfied: scipy>=1.6.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (1.15.3)\n",
|
||||
"Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (3.6.0)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install sklearn2pmml"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## import libraries\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"from sklearn2pmml import PMMLPipeline, sklearn2pmml"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## StandardScaler\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"pipeline = PMMLPipeline([(\"scaler\", StandardScaler())]).fit(X_train)\n",
|
||||
"sklearn2pmml(pipeline, \"StandardScaler_melb.pmml\", with_repr = True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## LinearRegression\n",
|
||||
"pipeline = PMMLPipeline([('regressor', LinearRegression())]).fit(X_train, y_train)\n",
|
||||
"sklearn2pmml(pipeline, \"LinearRegression_melb.pmml\", with_repr = True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## DecisionTreeRegressor\n",
|
||||
"pipeline = PMMLPipeline([('regressor', DecisionTreeRegressor(max_depth= 2, random_state=1234))]).fit(X_train, y_train)\n",
|
||||
"sklearn2pmml(pipeline, \"DecisionTreeRegressor_melb.pmml\", with_repr = True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"* re-import a pmml model\n",
|
||||
" * see https://stackoverflow.com/questions/52393301/use-pmml-models-in-python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## !pip install pypmml\n",
|
||||
"## not run: long running time\n",
|
||||
"from pypmml import Model\n",
|
||||
"new_model = Model.fromFile(\"StandardScaler_melb.pmml\")\n",
|
||||
"result = new_model.predict(X_train)\n",
|
||||
"print(result.describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Der Modellierungsprozess"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(kein Code unter diesem Titel)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## pyCaret"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"vgl. `5.5 Deployment und Abschluss - pycaret.ipynb`"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.0"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": "5",
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "5 Deployment und Abschluss",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": true,
|
||||
"toc_position": {
|
||||
"height": "calc(100% - 180px)",
|
||||
"left": "10px",
|
||||
"top": "150px",
|
||||
"width": "165px"
|
||||
},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": true
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
"lenName": 16,
|
||||
"lenType": 16,
|
||||
"lenVar": 40
|
||||
},
|
||||
"kernels_config": {
|
||||
"python": {
|
||||
"delete_cmd_postfix": "",
|
||||
"delete_cmd_prefix": "del ",
|
||||
"library": "var_list.py",
|
||||
"varRefreshCmd": "print(var_dic_list())"
|
||||
},
|
||||
"r": {
|
||||
"delete_cmd_postfix": ") ",
|
||||
"delete_cmd_prefix": "rm(",
|
||||
"library": "var_list.r",
|
||||
"varRefreshCmd": "cat(var_dic_list()) "
|
||||
}
|
||||
},
|
||||
"position": {
|
||||
"height": "321.85px",
|
||||
"left": "790px",
|
||||
"right": "20px",
|
||||
"top": "113px",
|
||||
"width": "350px"
|
||||
},
|
||||
"types_to_exclude": [
|
||||
"module",
|
||||
"function",
|
||||
"builtin_function_or_method",
|
||||
"instance",
|
||||
"_Feature"
|
||||
],
|
||||
"window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Reference in New Issue
Block a user