{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Deployment und Abschluss - pycaret" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## Install\n", "\n", "Attention: PyCaret does not run natively on Apple Silicon; in that case, using Docker is recommended instead.\n", "\n", "ref: https://pycaret.gitbook.io/docs/get-started/installation\n", "* You can install PyCaret with Python's pip package manager:" ] }, { "cell_type": "raw", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "!pip install pycaret" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and prep Data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2021-12-25T16:56:08.445356Z", "start_time": "2021-12-25T16:56:06.863957Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(9860, 30)\n" ] } ], "source": [ "## load data\n", "import numpy as np\n", "import pandas as pd\n", "\n", "datapath = '../3_data'\n", "from os import chdir; chdir(datapath)\n", "dataset = pd.read_csv('bank_data_prep.csv')\n", "print(dataset.shape)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "## remove duration\n", "dataset = dataset.drop(\"duration\", axis = 1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Data for Modeling: (8874, 29)\n", "Unseen Data For Predictions: (986, 29)\n" ] } ], "source": [ "## train - test - split\n", "from sklearn.model_selection import train_test_split\n", "data, data_unseen = train_test_split(dataset, train_size=0.9, random_state=1234)\n", "\n", "print('Data for Modeling: ' + str(data.shape))\n", "print('Unseen Data For Predictions: ' + str(data_unseen.shape))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run a Classification Experiment" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "### Init setup" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 DescriptionValue
0Session id1234
1Targety
2Target typeBinary
3Target mappingno: 0, yes: 1
4Original data shape(8874, 29)
5Transformed data shape(8874, 29)
6Transformed train set shape(6211, 29)
7Transformed test set shape(2663, 29)
8Numeric features14
9PreprocessTrue
10Imputation typesimple
11Numeric imputationmean
12Categorical imputationmode
13Fold GeneratorStratifiedKFold
14Fold Number5
15CPU Jobs-1
16Use GPUFalse
17Log ExperimentFalse
18Experiment Nameclf-default-name
19USI9ce1
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from pycaret.classification import *\n", "s = setup(\n", " data = data, \n", " target = 'y', \n", " fold = 5, ## default = 10\n", " session_id=1234) ## random seed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Show available Models (for Classification)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameReferenceTurbo
ID
lrLogistic Regressionsklearn.linear_model._logistic.LogisticRegressionTrue
knnK Neighbors Classifiersklearn.neighbors._classification.KNeighborsCl...True
nbNaive Bayessklearn.naive_bayes.GaussianNBTrue
dtDecision Tree Classifiersklearn.tree._classes.DecisionTreeClassifierTrue
svmSVM - Linear Kernelsklearn.linear_model._stochastic_gradient.SGDC...True
rbfsvmSVM - Radial Kernelsklearn.svm._classes.SVCFalse
gpcGaussian Process Classifiersklearn.gaussian_process._gpc.GaussianProcessC...False
mlpMLP Classifiersklearn.neural_network._multilayer_perceptron....False
ridgeRidge Classifiersklearn.linear_model._ridge.RidgeClassifierTrue
rfRandom Forest Classifiersklearn.ensemble._forest.RandomForestClassifierTrue
qdaQuadratic Discriminant Analysissklearn.discriminant_analysis.QuadraticDiscrim...True
adaAda Boost Classifiersklearn.ensemble._weight_boosting.AdaBoostClas...True
gbcGradient Boosting Classifiersklearn.ensemble._gb.GradientBoostingClassifierTrue
ldaLinear Discriminant Analysissklearn.discriminant_analysis.LinearDiscrimina...True
etExtra Trees Classifiersklearn.ensemble._forest.ExtraTreesClassifierTrue
lightgbmLight Gradient Boosting Machinelightgbm.sklearn.LGBMClassifierTrue
catboostCatBoost Classifiercatboost.core.CatBoostClassifierTrue
dummyDummy Classifiersklearn.dummy.DummyClassifierTrue
\n", "
" ], "text/plain": [ " Name \\\n", "ID \n", "lr Logistic Regression \n", "knn K Neighbors Classifier \n", "nb Naive Bayes \n", "dt Decision Tree Classifier \n", "svm SVM - Linear Kernel \n", "rbfsvm SVM - Radial Kernel \n", "gpc Gaussian Process Classifier \n", "mlp MLP Classifier \n", "ridge Ridge Classifier \n", "rf Random Forest Classifier \n", "qda Quadratic Discriminant Analysis \n", "ada Ada Boost Classifier \n", "gbc Gradient Boosting Classifier \n", "lda Linear Discriminant Analysis \n", "et Extra Trees Classifier \n", "lightgbm Light Gradient Boosting Machine \n", "catboost CatBoost Classifier \n", "dummy Dummy Classifier \n", "\n", " Reference Turbo \n", "ID \n", "lr sklearn.linear_model._logistic.LogisticRegression True \n", "knn sklearn.neighbors._classification.KNeighborsCl... True \n", "nb sklearn.naive_bayes.GaussianNB True \n", "dt sklearn.tree._classes.DecisionTreeClassifier True \n", "svm sklearn.linear_model._stochastic_gradient.SGDC... True \n", "rbfsvm sklearn.svm._classes.SVC False \n", "gpc sklearn.gaussian_process._gpc.GaussianProcessC... False \n", "mlp sklearn.neural_network._multilayer_perceptron.... False \n", "ridge sklearn.linear_model._ridge.RidgeClassifier True \n", "rf sklearn.ensemble._forest.RandomForestClassifier True \n", "qda sklearn.discriminant_analysis.QuadraticDiscrim... True \n", "ada sklearn.ensemble._weight_boosting.AdaBoostClas... True \n", "gbc sklearn.ensemble._gb.GradientBoostingClassifier True \n", "lda sklearn.discriminant_analysis.LinearDiscrimina... 
True \n", "et sklearn.ensemble._forest.ExtraTreesClassifier True \n", "lightgbm lightgbm.sklearn.LGBMClassifier True \n", "catboost catboost.core.CatBoostClassifier True \n", "dummy sklearn.dummy.DummyClassifier True " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "models()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Model Training and Selection" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", "
 ModelAccuracyAUCRecallPrec.F1KappaMCCTT (Sec)
gbcGradient Boosting Classifier0.76170.80200.76170.77300.75670.51440.52900.4640
catboostCatBoost Classifier0.75820.79960.75820.76810.75340.50740.52073.4800
lightgbmLight Gradient Boosting Machine0.75430.79540.75430.76320.74980.49980.51200.3200
adaAda Boost Classifier0.74920.79570.74920.76160.74320.48820.50440.2180
rfRandom Forest Classifier0.74210.79020.74210.74510.73940.47710.48260.4160
lrLogistic Regression0.73930.78520.73930.74350.73610.47090.47782.8060
ridgeRidge Classifier0.73260.78510.73260.73530.72980.45780.46300.0740
ldaLinear Discriminant Analysis0.73240.78510.73240.73510.72960.45750.46270.0780
etExtra Trees Classifier0.72240.77320.72240.72320.72060.43870.44140.4140
qdaQuadratic Discriminant Analysis0.72210.77130.72210.73790.71320.43130.45200.0900
nbNaive Bayes0.71500.76370.71500.71690.71220.42240.42690.0640
knnK Neighbors Classifier0.71420.74540.71420.71440.71290.42300.42472.5480
dtDecision Tree Classifier0.66140.66080.66140.66180.66150.32060.32070.0840
dummyDummy Classifier0.53200.50000.53200.28300.36940.00000.00000.0620
svmSVM - Linear Kernel0.51180.70840.51180.44630.35270.01240.04130.1960
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing: 0%| | 0/65 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 ModelAccuracyAUCRecallPrec.F1KappaMCC
0Gradient Boosting Classifier0.74950.79900.74950.76260.74570.49720.5109
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " age education housing contact_cellular month day_of_week \\\n", "2809 36.0 6 0 1 5 2 \n", "4052 34.0 6 1 1 8 3 \n", "658 36.0 6 1 0 5 2 \n", "786 40.0 5 0 0 5 3 \n", "6675 36.0 6 0 1 11 3 \n", "\n", " campaign pdays previous emp_var_rate ... job_student \\\n", "2809 0.477121 0 0 -1.8 ... False \n", "4052 0.698970 0 0 1.4 ... False \n", "658 0.903090 0 0 -1.8 ... False \n", "786 0.698970 0 0 1.1 ... False \n", "6675 0.477121 0 0 -0.1 ... False \n", "\n", " job_technician job_unemployed marital_married marital_single \\\n", "2809 False False False True \n", "4052 True False False True \n", "658 False False False True \n", "786 False False True False \n", "6675 False False False True \n", "\n", " loan_unknown loan_yes y prediction_label prediction_score \n", "2809 False False yes no 0.5681 \n", "4052 False True yes no 0.6703 \n", "658 False False yes no 0.8444 \n", "786 False False no no 0.7768 \n", "6675 False False no no 0.6379 \n", "\n", "[5 rows x 31 columns]\n" ] } ], "source": [ "predictions = predict_model(best_model, data=data_unseen)\n", "print(predictions.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save best Model Pipeline" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "save_model(best_model, 'best_model_pipeline')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tune a specific Model\n", "ref: https://pycaret.gitbook.io/docs/get-started/functions/optimize" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### rf: Random Forest Classifier" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 AccuracyAUCRecallPrec.F1KappaMCC
Fold       
00.74090.79390.74090.74470.73790.47440.4808
10.75360.79610.75360.75830.75050.49980.5072
20.73510.77100.73510.73920.73170.46210.4691
30.73110.79670.73110.73290.72870.45530.4594
40.74960.79310.74960.75060.74810.49400.4966
Mean0.74210.79020.74210.74510.73940.47710.4826
Std0.00850.00970.00850.00880.00870.01740.0175
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing: 0%| | 0/4 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 AccuracyAUCRecallPrec.F1KappaMCC
Fold       
00.75620.80530.75620.76540.75170.50370.5161
10.76890.81340.76890.78250.76350.52850.5456
20.75440.78970.75440.76720.74850.49880.5154
30.76170.80210.76170.77090.75720.51470.5272
40.76890.80710.76890.77590.76550.53050.5402
Mean0.76200.80350.76200.77240.75730.51530.5289
Std0.00610.00790.00610.00620.00660.01270.0123
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing: 0%| | 0/7 [00:00