{ "cells": [ { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "# Feature Engineering\n", "# Klassifikation\n", "## Instanzbasierte Modelle\n", "## Regelbasierte Modelle\n", "## Mathematische Modelle" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append('./')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:39.858981Z", "start_time": "2020-03-17T12:01:37.904657Z" } }, "outputs": [], "source": [ "## preparation\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns; sns.set()\n", "%matplotlib inline\n", "\n", "datapath = '../3_data'\n", "from os import chdir; chdir(datapath)\n", "\n", "from bfh_cas_pml import prep_data, prep_demo_data\n", "X_train, X_test, y_train, y_test = prep_data('bank_data_prep.csv', 'y', seed = 1234)\n", "X_demo, y_demo = prep_demo_data('demo_data_class.csv', 'y')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LinearDiscriminantAnalysis\n", "#### Theorie" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "kein Code zu diesem Kapitel" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Praxis" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:40.035126Z", "start_time": "2020-03-17T12:01:39.864400Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8487982963188317\n" ] } ], "source": [ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "model = LinearDiscriminantAnalysis()\n", "model.fit(X_train, y_train) \n", "print(model.score(X_test, y_test))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:40.051095Z", "start_time": "2020-03-17T12:01:40.038394Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"{'covariance_estimator': None, 'n_components': None, 'priors': None, 'shrinkage': None, 'solver': 'svd', 'store_covariance': False, 'tol': 0.0001}\n" ] } ], "source": [ "print(model.get_params())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### QuadraticDiscriminantAnalysis (eine Variante)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:40.144808Z", "start_time": "2020-03-17T12:01:40.054435Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7246729540614543\n" ] } ], "source": [ "from sklearn.discriminant_analysis \\\n", " import QuadraticDiscriminantAnalysis\n", "model = QuadraticDiscriminantAnalysis()\n", "model.fit(X_train, y_train)\n", "print(model.score(X_test, y_test))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:40.160468Z", "start_time": "2020-03-17T12:01:40.149447Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'priors': None, 'reg_param': 0.0, 'store_covariance': False, 'tol': 0.0001}\n" ] } ], "source": [ "print(model.get_params())" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### SVC\n", "#### Theorie\n", "#### Praxis" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:47.205171Z", "start_time": "2020-03-17T12:01:40.196843Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7161545482202616\n" ] } ], "source": [ "from sklearn.svm import SVC\n", "model = SVC()\n", "model.fit(X_train, y_train) \n", "print(model.score(X_test, y_test))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:47.221147Z", "start_time": "2020-03-17T12:01:47.210935Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'C': 1.0, 'break_ties': False, 'cache_size': 200, 
'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}\n" ] } ], "source": [ "print(model.get_params())" ] },
 { "cell_type": "raw", "metadata": {}, "source": [ "## with scaled features\n", "from sklearn.preprocessing import StandardScaler\n", "scaler = StandardScaler()\n", "\n", "scaler.fit(X_train)\n", "X_train_sc = scaler.transform(X_train)\n", "X_test_sc = scaler.transform(X_test)\n", "\n", "model.fit(X_train_sc, y_train) \n", "print(model.score(X_test_sc, y_test))" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### GaussianNB\n", "in aller Kürze" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "#### Theorie" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "classes_ : ['A' 'B']\n", "class_prior_ : [0.55555556 0.44444444]\n", "\n", "theta_ :\n", " [[5.58666667]\n", " [4.26666667]]\n", "\n", "var_ :\n", " [[0.31182222]\n", " [0.23055556]]\n" ] } ], "source": [
  "## demo of GaussianNB internals with demo data\n",
  "from sklearn.naive_bayes import GaussianNB\n",
  "\n",
  "## work on the demo set without column X2; X_demo itself is left untouched\n",
  "X_nb_train = X_demo.drop(columns='X2')\n",
  "y_nb_train = y_demo\n",
  "\n",
  "model = GaussianNB()\n",
  "model.fit(X_nb_train, y_nb_train)\n",
  "\n",
  "## print model attributes\n",
  "fitted_attributes = (\n",
  "    ('classes_ :', model.classes_),\n",
  "    ('class_prior_ :', model.class_prior_),\n",
  "    ('\\ntheta_ :\\n', model.theta_),\n",
  "    ('\\nvar_ :\\n', model.var_),\n",
  ")\n",
  "for label, fitted_value in fitted_attributes:\n",
  "    print(label, fitted_value)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "#### Praxis" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:47.963126Z", "start_time": "2020-03-17T12:01:47.897232Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7337998174627319\n"
] } ], "source": [
  "from sklearn.naive_bayes import GaussianNB\n",
  "\n",
  "## fit on the training split, then report the score on the test split\n",
  "model = GaussianNB()\n",
  "model.fit(X_train, y_train)\n",
  "test_score = model.score(X_test, y_test)\n",
  "print(test_score)" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:48.042848Z", "start_time": "2020-03-17T12:01:48.032106Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'priors': None, 'var_smoothing': 1e-09}\n" ] } ], "source": [ "print(model.get_params())" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### LogisticRegression\n", "#### Theorie\n", "#### Praxis" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-03-17T12:01:56.666086Z", "start_time": "2020-03-17T12:01:56.130695Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8475813811986614\n" ] } ], "source": [
  "from sklearn.linear_model import LogisticRegression\n",
  "\n",
  "## max_iter is set explicitly to 4000 (presumably so the solver converges on\n",
  "## these features - TODO confirm); everything else stays at its default\n",
  "model = LogisticRegression(max_iter=4000)\n",
  "model.fit(X_train, y_train)\n",
  "test_score = model.score(X_test, y_test)\n",
  "print(test_score)" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C : 1.0 \n", "class_weight : None \n", "dual : False\n", "fit_intercept : True \n", "intercept_scaling : 1 \n", "l1_ratio : None \n", "max_iter : 4000 \n", "multi_class : deprecated\n", "n_jobs : None \n", "penalty : l2 \n", "random_state : None \n", "solver : lbfgs\n", "tol : 0.0001\n", "verbose : 0 \n", "warm_start : False\n" ] } ], "source": [
  "## aligned listing of all hyperparameters, one per line:\n",
  "## name left-justified to 20 characters, value (as text) to at least 5\n",
  "for name, setting in model.get_params().items():\n",
  "    print(f\"{name:<20} : {setting!s:<5}\")" ] }
 ], "metadata": { "kernelspec": { "display_name": "teaching", "language": "python", "name": "teaching" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer":
"ipython3", "version": "3.13.0" }, "toc": { "base_numbering": "2.3", "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "2.3 Klassifikation - Mathematische Modelle", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "202.667px" }, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "position": { "height": "316.717px", "left": "782px", "right": "20px", "top": "119px", "width": "350px" }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }