{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": { "tags": [] },
   "source": [
    "# Feature Engineering\n",
    "# Klassifikation\n",
    "# Regression\n",
    "# Validierung und mehr\n",
    "# Deployment und Abschluss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-08T10:06:24.890328Z",
     "start_time": "2020-04-08T10:06:23.220148Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "## prepare environment\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "datapath = '../3_data'\n",
    "from os import chdir; chdir(datapath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Das finale Modell\n",
    "## Feature Engineering in der Produktion\n",
    "### Missing Values\n",
    "### Neue Kategorien\n",
    "### Protokollieren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('bank_data.csv', sep=';')\n",
    "\n",
    "import datetime\n",
    "\n",
    "## write the log inside a with-block so the file is closed even if a write fails\n",
    "with open('fe_prod_log.log','w') as f:\n",
    "    f.write(datetime.datetime.now().strftime(\"[%Y-%m-%d %H:%M:%S] (timestamp)\"))\n",
    "\n",
    "    f.write('\\n\\n')\n",
    "\n",
    "    ## number of missing values per feature; only features with at least one NA are logged\n",
    "    s = data.isna().sum()\n",
    "    f.write('features with NA\\'s\\n')\n",
    "    f.write('=======================')\n",
    "\n",
    "    f.write('\\n')\n",
    "    f.write(s[s > 0].to_string())\n",
    "    f.write('\\n\\n')\n",
    "\n",
    "    ## value counts of not numeric features\n",
    "    f.write('categorical cols levels\\n')\n",
    "    f.write('=======================')\n",
    "    f.write('\\n')\n",
    "    catcolnames = data.select_dtypes(include='object').columns\n",
    "    for ccn in catcolnames:\n",
    "        f.write(ccn)\n",
    "        f.write('\\n')\n",
    "        f.write(data[ccn].value_counts().sort_index().to_string())\n",
    "        f.write('\\n\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Modellübergabe in die Produktion"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Modelle speichern 
scikit-learn intern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": { "tags": [] },
   "outputs": [],
   "source": [
    "## load train and test data\n",
    "from bfh_cas_pml import prep_data\n",
    "X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## fit the three models that are saved and reloaded in the next cells\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "## StandardScaler (fitted on the features only)\n",
    "model_sc = StandardScaler()\n",
    "model_sc.fit(X_train)\n",
    "\n",
    "## LinearRegression\n",
    "model_lr = LinearRegression()\n",
    "model_lr.fit(X_train, y_train)\n",
    "\n",
    "## DecisionTreeRegressor\n",
    "model_dt = DecisionTreeRegressor(max_depth=2, random_state=1234)\n",
    "model_dt.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## save models: one pickle file per fitted model\n",
    "import pickle\n",
    "for fitted_model, pkl_name in [(model_sc, 'model_sc.pkl'),\n",
    "                               (model_lr, 'model_lr.pkl'),\n",
    "                               (model_dt, 'model_dt.pkl')]:\n",
    "    with open(pkl_name, 'wb') as pickle_file:\n",
    "        pickle.dump(fitted_model, pickle_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## reload models into new variables (suffix _2) so they can be compared with the originals\n",
    "reloaded = []\n",
    "for pkl_name in ['model_sc.pkl', 'model_lr.pkl', 'model_dt.pkl']:\n",
    "    with open(pkl_name, 'rb') as pickle_file:\n",
    "        reloaded.append(pickle.load(pickle_file))\n",
    "model_sc_2, model_lr_2, model_dt_2 = reloaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n",
      " 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n",
      " 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n",
      " 6.57396836e-01 
1.29098026e-01 9.18284130e-02 2.89430762e-01\n", " 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n", " 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n", "\n", "[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n", " 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n", " 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n", " 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n", " 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n", " 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n", "\n", "[ 2.93174034e+00 1.45261784e+00 1.04267493e+01 1.43475779e+00\n", " 1.68210732e+00 2.36615297e+00 2.09901978e+00 1.96804559e+03\n", " 5.74824662e+00 -3.78091214e+01 1.44997478e+02 7.52039447e+03\n", " 6.57396836e-01 1.29098026e-01 9.18284130e-02 2.89430762e-01\n", " 3.73511662e-02 3.43010928e-01 1.01125428e-02 2.10650791e-01\n", " 7.15274833e+00 2.01656214e+03 4.83925950e+00] \n", "\n", "[9.08813135e-01 4.68436712e-01 3.68004376e+01 4.24996430e-01\n", " 7.32413501e-01 7.11803450e-01 3.34340977e-02 6.45488299e+02\n", " 3.80781143e+01 5.53828210e-03 9.28052881e-03 2.02869238e+07\n", " 2.25226236e-01 1.12431726e-01 8.33959556e-02 2.05660596e-01\n", " 3.59560566e-02 2.25354431e-01 1.00102793e-02 1.66277035e-01\n", " 6.21684084e+00 2.46138222e-01 1.23097215e+00] \n", "\n" ] } ], "source": [ "## compare model_sc\n", "print(model_sc.mean_, '\\n')\n", "print(model_sc.var_, '\\n')\n", "print(model_sc_2.mean_, '\\n')\n", "print(model_sc_2.var_, '\\n')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-148600153.0644718\n", "[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n", " 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n", " -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n", " 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n", " 1.07806729e+05 2.39261230e+05 2.81825047e+05 
-2.26744902e+05\n", " 1.27253408e+03 5.38476338e+04 3.83311293e+03] \n", "\n", "-148600153.0644718\n", "[ 2.38730932e+05 -1.44127046e+05 -4.11587415e+04 1.55233185e+05\n", " 4.16082458e+04 8.22944764e+04 2.97716907e+05 -2.76754904e+03\n", " -4.92937355e+03 -5.11115932e+05 1.79228159e+05 -1.58740210e+00\n", " 7.07758671e+04 7.99854850e+03 3.38953273e+04 -1.73563804e+05\n", " 1.07806729e+05 2.39261230e+05 2.81825047e+05 -2.26744902e+05\n", " 1.27253408e+03 5.38476338e+04 3.83311293e+03]\n" ] } ], "source": [ "## compare model_lr\n", "print(model_lr.intercept_)\n", "print(model_lr.coef_, '\\n')\n", "print(model_lr_2.intercept_)\n", "print(model_lr_2.coef_)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "|--- Rooms <= 3.50\n", "| |--- Type <= 1.50\n", "| | |--- value: [1077045.50]\n", "| |--- Type > 1.50\n", "| | |--- value: [682863.38]\n", "|--- Rooms > 3.50\n", "| |--- Regionname_Southern_Metropolitan <= 0.50\n", "| | |--- value: [1163850.50]\n", "| |--- Regionname_Southern_Metropolitan > 0.50\n", "| | |--- value: [2113002.90]\n", "\n", "|--- Rooms <= 3.50\n", "| |--- Type <= 1.50\n", "| | |--- value: [1077045.50]\n", "| |--- Type > 1.50\n", "| | |--- value: [682863.38]\n", "|--- Rooms > 3.50\n", "| |--- Regionname_Southern_Metropolitan <= 0.50\n", "| | |--- value: [1163850.50]\n", "| |--- Regionname_Southern_Metropolitan > 0.50\n", "| | |--- value: [2113002.90]\n", "\n" ] } ], "source": [ "## compare model_dt\n", "from sklearn.tree import export_text\n", "print(export_text(\n", " model_dt, feature_names=list(X_train.columns)))\n", "print(export_text(\n", " model_dt_2, feature_names=list(X_train.columns)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Modelle speichern extern mit PMML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Vorbereitungen:** \n", "* Java Runtime muss vorhanden sein\n", "* Library installieren direkt in Notebook: \n", 
"`!pip install sklearn2pmml`" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: sklearn2pmml in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (0.119.1)\n", "Requirement already satisfied: dill>=0.3.4 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (0.4.0)\n", "Requirement already satisfied: joblib>=0.13.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.5.1)\n", "Requirement already satisfied: pandas>=1.5.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (2.2.3)\n", "Requirement already satisfied: scikit-learn>=1.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from sklearn2pmml) (1.6.1)\n", "Requirement already satisfied: numpy>=1.26.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.2.6)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from pandas>=1.5.0->sklearn2pmml) (2025.2)\n", "Requirement already satisfied: six>=1.5 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->sklearn2pmml) (1.17.0)\n", "Requirement already satisfied: scipy>=1.6.0 in /Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (1.15.3)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in 
/Users/vgv1/.pyenv/versions/teaching/lib/python3.13/site-packages (from scikit-learn>=1.0->sklearn2pmml) (3.6.0)\n" ] } ], "source": [ "!pip install sklearn2pmml" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "## import libraries\n", "from sklearn.pipeline import Pipeline\n", "from sklearn2pmml import PMMLPipeline, sklearn2pmml" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## StandardScaler\n", "from sklearn.preprocessing import StandardScaler\n", "pipeline = PMMLPipeline([(\"scaler\", StandardScaler())]).fit(X_train)\n", "sklearn2pmml(pipeline, \"StandardScaler_melb.pmml\", with_repr = True)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "## LinearRegression\n", "pipeline = PMMLPipeline([('regressor', LinearRegression())]).fit(X_train, y_train)\n", "sklearn2pmml(pipeline, \"LinearRegression_melb.pmml\", with_repr = True)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "## DecisionTreeRegressor\n", "pipeline = PMMLPipeline([('regressor', DecisionTreeRegressor(max_depth= 2, random_state=1234))]).fit(X_train, y_train)\n", "sklearn2pmml(pipeline, \"DecisionTreeRegressor_melb.pmml\", with_repr = True)" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "* re-import a pmml model\n", " * see https://stackoverflow.com/questions/52393301/use-pmml-models-in-python" ] }, { "cell_type": "raw", "metadata": { "tags": [] }, "source": [ "## !pip install pypmml\n", "## not run: long running time\n", "from pypmml import Model\n", "new_model = Model.fromFile(\"StandardScaler_melb.pmml\")\n", "result = new_model.predict(X_train)\n", "print(result.describe())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Der Modellierungsprozess" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(kein Code unter diesem Titel)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "## pyCaret" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "vgl. `5.5 Deployment und Abschluss - pycaret.ipynb`" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" }, "toc": { "base_numbering": "5", "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "5 Deployment und Abschluss", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "165px" }, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "position": { "height": "321.85px", "left": "790px", "right": "20px", "top": "113px", "width": "350px" }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }