{ "cells": [ { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "# Feature Engineering\n", "# Klassifikation\n", "# Regression\n", "# Validierung und mehr\n", "## Sampling und Resampling\n", "## Validierungstechniken" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-04-15T20:53:02.218161Z", "start_time": "2020-04-15T20:53:02.079407Z" } }, "outputs": [], "source": [ "## preparation\n", "\n", "## import libraries\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns; sns.set()\n", "%matplotlib inline\n", "\n", "## load data\n", "## the file is addressed by its path instead of chdir(datapath): changing the\n", "## working directory would make this cell fail when it is run a second time\n", "datapath = '../3_data'\n", "data = pd.read_csv(Path(datapath) / 'bank_data_prep.csv')\n", "data.shape ## sanity check (only displayed when it is the last expression of a cell)\n", "\n", "## features - target - split\n", "X = data.drop('y', axis=1)\n", "y = data['y']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Holdout Validierung" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## repetition from classification and regression\n", "\n", "## train - test - split\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X,\n", "    y,\n", "    train_size=2/3,\n", "    random_state=1234)\n", "\n", "## train and score\n", "## fixed random_state: the forest is randomized, without a seed the score differs on every run\n", "from sklearn.ensemble import RandomForestClassifier\n", "model = RandomForestClassifier(random_state=1234)\n", "model.fit(X_train, y_train)\n", "print(model.score(X_test, y_test))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Kreuzvalidierung" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "ref: https://scikit-learn.org/stable/modules/cross_validation.html" ] }, { "cell_type": "raw", "metadata": { "tags": [] }, "source": [ "## \"manually\" with sklearn.model_selection.KFold\n", "from sklearn.ensemble import 
RandomForestClassifier\n", "from sklearn.model_selection import KFold\n", "kf = KFold(n_splits=5, shuffle=True, random_state=1234)\n", "\n", "scores = []\n", "for train, test in kf.split(X):\n", "    ## kf.split yields positional indices, so rows are selected with .iloc;\n", "    ## y[train] would look the numbers up as index labels instead\n", "    X_train = X.iloc[train, :]\n", "    X_test = X.iloc[test, :]\n", "    y_train = y.iloc[train]\n", "    y_test = y.iloc[test]\n", "    ## seeded forest, so the fold scores are reproducible\n", "    model = RandomForestClassifier(random_state=1234)\n", "    model.fit(X_train, y_train)\n", "    score = model.score(X_test, y_test)\n", "    scores.append(score)\n", "    print(score)\n", "\n", "## results\n", "print(scores)\n", "print('mean:', np.mean(scores))\n", "print('std: ', np.std(scores))\n", "sns.boxplot(x=scores);" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:29:33.905316Z", "start_time": "2020-04-14T21:29:32.438063Z" } }, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import cross_val_score\n", "\n", "model = RandomForestClassifier(random_state=1234)\n", "\n", "## cross validation\n", "## an integer cv with a classifier means unshuffled StratifiedKFold\n", "## (for binary/multiclass targets, see the scikit-learn docs of cross_val_score)\n", "kfold = 10 ## default: 5\n", "scores = cross_val_score(model, X, y, cv=kfold)\n", "\n", "## combine call\n", "#scores = cross_val_score(RandomForestClassifier(random_state=1234), X, y, cv=kfold)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-04-14T21:29:33.920862Z", "start_time": "2020-04-14T21:29:33.909541Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mean: 0.8855983772819472\n", "std: 0.011729681375092045\n" ] }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAg8AAAGgCAYAAAAth4QEAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjMsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvZiW1igAAAAlwSFlzAAAPYQAAD2EBqD+naQAAFpRJREFUeJzt3QuMXHW9wPH/bgsUodgCgqioiJYK8iiFgrwUMCr4CGoEBFRQeVyEIHgxiAavAcGIQlEChBiJEdAmAqLBN+KbysMbkGcpL5GnfdFSWvrYc/P7m9lsl9bbn3R3ZnY/n6Quu3P2nPnPf2fmO2fOGXuapmkKAMBa6l3bBQEAgngAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBlbhkB87lRf35o/e6q3t+ffXj5SGffoMRrHHIx7dBmN4x7JY+7t7Sk9PT3ti4e4YefNW7z6DY7tLRMnblQWLny+rFjRV0YL4x494x6NYw7Gbdwj3Ugf86abblTGjFm7ePC2BQCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAlLG5xekUc+fOKc89t6h0ujFjesvcuePKokVLy8qVfWU0WJdj3njj8WWzzTZfZ9cNYF0QD10aDmee+d9l+fJl7b4qDLH11lu/nHvu1wUE0FHEQxeKPQ4RDuNetWfpXX+Tdl+dUanvhYVl6ZMzy7it9iy9GwzNHPQtW1iWPjGzzrd4ADqJeOhiEQ5jNty03VdjVItwMAfAaOOASQAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDADCy42HRooXtvgoAMKp1VTw888zT5TOf+a/6FUa6vuXP16/z589r91UB6N54WLLk+dI0Tf0KI17fivpl6dIl7b4mAN0bDwBA+4kHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACljc4sDAKvT19dXZs26ryxYsKBMmDChTJo0ufT29nbdNtaGeACAl+j2228pM2ZcVebM+Wf/zzbf/BXlsMOOLFOnTuuabawt8QAAL/FJ/ZJLLio77zylHH/8SeXVr966PP74Y+WGG66vPz/xxFNe8pP7cGwjwzEPAPAS3kaYMeOq+qR+0kmnlW23fVMZN25c/Rrfx8/
j8liuk7cxKvY8PPnk46XbjBnTW+bOHVcWLVpaVq7sG3XjBxiJZs26r76NEHsDBh97EN8ffPD7y7nn/k9dbvLk7Tt2G6MiHi6//JJ2XwUAKHHgYoi3EVan9fPWcp26jVERD8cdd2LZaqtXl27b8zB+/Lrb8yCgANpvwoQJ9WscfxBvIwwWPx+4XKduY1TEQ4TD6163TekmY8f2lokTNyrz5y8uK1YM3/tSAAydSZMm1zMe4sDFOP5g4NsKcQzCT3/643p5LNfJ28hywCQA/Id6e3vrqZJ33PG/5eKLLyizZ88qS5YsqV/j+/h5XP5SPothOLYxKvY8AECnmDp1Wj1VMs54iAMXW2JvwLo6hXI4tpEhHgDgJZo6dVqZMmW3If30x+HYxtoSDwCwDvT29g75qZLDsY21uh7tvgIAQHcRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABGbjxsuOHLSk9PT/0KI17v2Ppl3LgN231NAFbxr0enLrHFFluW6dMvLePHb9LuqwJDrne9f0XyxImbtvuqAHTvnocgHACgvbouHgCA9hIPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkjM0tTifpW7aw3Vdh1Op7YeEqX4dkG+YX6FDioQttvPH4st5665elT8xs91UZ9ZY+ObRzEPMc8w3QScRDF9pss83Lued+vTz33KLS6caM6S3jx48rixYtLStX9pXRYF2OOcIh5hugk4iHLhVPKN3wpDJ2bG+ZOHGjMn/+4rJixeiIh9E4ZmB0ccAkAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkCIeAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAECKeAAAUsQDAJAiHgCAFPEAAKSIBwAgRTwAACniAQBIEQ8AQIp4AABSxAMAkNLTNE1T1rFYZV/fmlc7ZkxvWbmyr4w2xj16jMYxB+MeXUbjuEfymHt7e0pPT0/74gEAGLm8bQEApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAIEU8AAAp4gEASBEPAMDQxkNfX1/55je/Wfbdd9+yyy67lGOPPbY89thja1x+7ty55bOf/WzZc889yx577FFOPfXU8vTTT/dfvt12263x3xNPPFGXeeGFF8qXv/zl8ta3vrVMmTKlrm/evHlluLRjzLfffvtqL//LX/5SunXc4YYbbijvfe97y84771w
OPvjg8qMf/WiVy+fPn1/Xsfvuu5dp06bVeV+yZEkZTu0Y949//OPVzvc//vGP0q3jjvVdccUV5V3vele9337sYx8rd9111yrriPEdf/zxZddddy377LNPmT59elm5cmUZyWO+9NJLVzvXwyk77kceeaQcd9xxZbfddiv77bdf/d0VK1asssxVV11VDjzwwLLTTjuVI444otxzzz0dNdftGvelHTDfQ6JJ+ta3vtXssccezU033dTce++9zSc+8Ynmne98Z/PCCy+sdvmjjjqqOfzww5t77rmnufvuu5tDDz20+dCHPtR/+TPPPLPKvwceeKCu/3Of+1z/MmeccUbzjne8o7n11lubO+64oznkkEOaI488shku7RjzVVddVcc8eNk1bbMbxn3zzTc322+/ffP973+/+fvf/95ceeWVzeTJk5vf/va3q6wjfueuu+5q/vznPzf777//KrfLSB331772tbqewfO9YsWKplvHfdlllzVvectbmquvvrp56KGH6vp33nnn5sEHH6yXL1u2rK7/uOOOa+6///7mV7/6VTNt2rTmoosuGrFjDqecckpz+umnv2iuh1Nm3AsWLGj22muvOva4X8bj8Lvf/e7m85//fP8y1157bbPTTjs1119/fX08i/HFXM6dO7dj5rod4+6U+R4KqXiIG3jKlCn1ia3l2WefrTfeT37ykxctH5dNmjSpufHGG/t/9utf/7r+bP78+avdxsknn1wnqDWZTz311IseaONOGev461//2gy1dow5fOlLX2pOOOGEpl2GYtznnHNO84EPfGCV34sQPPvss+t/x3zG8rNnz+6//A9/+EOz3Xbb1b+DkTru8KlPfWqV74fbUIx7t912a84///xVfu/oo4+uLwZCrDeeaONBuuUHP/hBs+uuuw5LJLdjzOGggw5qrrjiiqZdsuOO67rLLrus8oR422231XE/9thj9ft4Ao4Ablm+fHnztre9rcZUJ8x1u8bdCfM9VFJvW9x3331l8eLF9e2Dlk022aRsv/325dZbb33R8uPGjSsbbbRR3UX73HPP1X/XX3992WabbervDfbHP/6x/PKXvyxnn312WX/99ft334fYTdgSv7/llluudpvrWjvGHO6///6y7bbblnYZinFvttlm5YEHHigzZ86MaK1vwTz44IN1d1+47bbbyite8YpVxh1vXfT09PT/HYzEcY/E+Y63FRcuXFh39w705je/udxyyy39873DDjuUl7/85f2Xx/081nXvvfeWkTjmZcuW1V3hb3jDG0q7ZMf96KOP1uu76aab9v8slm3NYbyVE2MauL6xY8fW26G1vnbPdbvGvawD5nuojM0s/NRTT9WvW2211So/32KLLfovGyieDL/61a+Ws846q96g8SQQy1555ZWlt/fF3XLBBRfU944G3vni/cSJEyeWDTbYYK22ua61Y8whnmxi3B/84AfrbTBp0qT6/urAJ5xuG/dHP/rRcuedd5aPf/zjZcyYMfX9zhNOOKG8//3vr5fHOAdvL9Y7YcKE8uSTT5aROu5nn322jj0ekK6++up63EfM8+mnn16fmLpx3PEkEcu0juFpefzxx/uPV4r1vvKVr3zR9kLMdxwfMtLGPHv27Dr/v/jFL8pXvvKVejxXHN8Tc90a+1DLjjt+/swzz9TrHX+/rTGFeAL9d+uLJ+xOmOvWdRjucc/ugPkeKqk9D60D1wa+Qg7xxB43ymDxKiuqMg4cioNKvvvd75ZXvepV5cQTT6zFOVCU2t13310vG7zNwdv7d9tc19ox5rgzLVq0qDz//PPli1/8YrnkkkvK5ptvXo466qj6xzgchmLcMa54YowH32uuuaacccYZ9eCyH/7whx0x163rMNzjjlBsreu8886rB5LFtuLgqzlz5nTluOPBNg4QjYPFIpziAfTnP/95uemmm8ry5cvrOpYuXbra7YVuvG+vzZhnzZpVv2644Ybloosuqk8oDz30UD2wMm6P4ZAd90E
HHVQWLFhQ/zbjMSn+Js8555z6KjvGtTbra/dct2vcszpgvjtiz0Pstmvtimn9d4gbKm6cwX72s5/VKo87z8Ybb1x/dtlll5X999+/PnAeffTR/cted9119dVW7NoavM3Y3mBr2ua61o4xR8lGWMT611tvvfqzHXfcsR7F+73vfa+egdCN4z755JPrg+uRRx7Zvzs3XnWff/75dQ/Lv5vrl73sZWU4tGPc8Sr25ptvrnua4tVsuPjii8vb3/72cu2119ajvbtx3GeeeWYNpsMPP7w+8caT7jHHHFNmzJjRv83B89160B2O+W7HmA855JB61P7AXeFvetOb6s9+85vf1DNxOm3cr3/96+sTX4wroinmJv6m44XM+PHjV1nfQAPX1+65bl2H4R73IR0w3x2x56G1eyZ25QwU38cxCIPFbtjY7dq6o4XYtRc/i/eTBp4+Ezfk+973vhetI3Z1Rf0NnqA1bXNda8eYW+/FtcIhxG7ReE988CmA3TLu2G0bxR0RNFCcLhXzG/9irgdvL+Y9LhuuXXztGHeIB5dWOIR48HnNa17TtfMd4gH2wgsvrMer/OlPf6pvycQrtte+9rX18tXNd+v7br1v/39jDgOfSEL8bcdbc8PxNux/Mu5wwAEH1OOzfve739XQPfTQQ+sr8a233nqt1tfuuW7XuDthvjsiHiZPnlzvOAM/ayAOEIpXxPE+zmDxBxN3qoG7hGL3T5zvG1XXEiUXu3X32muvF61j6tSp9Yl24AFzDz/8cH1QXd0217V2jPn3v/99fcUy8PzjOLc43kd74xvfWIbDuh53PMjGE2IcGDhQfB+hFHewWG/coQZGVutAs/g7GKnjjlel8ZkB8XstsRs8DrTq1vkO8So8XpHH+GOcsRv/xhtvLHvvvXe9PNYb6x/4dl4cVBoHJcb1GYljjrCIz4CIvRIt8fvxWNCpcx3RFMftxGNQPPHFbvo4yDvGGJ/ZEAcER0ANXF8sG7/XWl+757pd476wA+Z7yGRPz7jgggvqeaxxitLA82TjPN44Jz3OX12yZEld9umnn67LximHsWz8O/7445t99923WbhwYf86r7vuumaHHXZoVq5cudptnnbaac0BBxzQzJw5s/9zHuLc2+Ey3GNetGhR/XyDj3zkI83f/va35r777qu3we67797885//7Npxf+Mb36inSsXY4/MO4mt8/+1vf7te3tfXV8+hj9MaY57j8xHidhh4mttIHPcTTzxRT/H79Kc/3cyaNau588476+l98TkfS5cu7dpxT58+vX5/yy231NOrTz311Gbvvfdu5s2bVy+PscUYP/nJT9bfb537H+fij9Qxx/057vdnnXVWvTyWi8ez+LuPv/9OHHecqhiPPXHKcfz9xjxNnTq1ufTSS/vXN2PGjHrKY3zuQevzDuLzFFqnOXbCXLdj3H/rkPkeCul4iBs4zmvdc8896zmwxx57bP85r/E1zoG95ppr+pePc/bjDhYTFr9z0kkn9S/fcvnll9cP41iTxYsXN1/4whfqA2z8iyfS1p1xOLRjzI8++mj9/IdYR3zITPyRx4erDKd1Pe5Y33e+8536mRYxpve85z31w3QG3onmzJlTxx3bizthfN7FcD6Btmvc8SE0xxxzTH1winPf4zaIqOjmcccD8nnnnVf/zmNMsezDDz+8yjYfeeSROu4dd9yx2WeffeqT75peRIyUMceHnx122GF1e7Ge+NChgZ9/0Injvv3225sPf/jD9YnywAMPXO3nFkQM77fffnWZI444on6QVifNdbvG3QnzPRR64n/avfcDAOge/o+xAIAU8QAApIgHACBFPAAAKeIBAEgRDwBAingAAFLEAwCQIh4AgBTxAACkiAcAoGT8H5+yNqVYiMM3AAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "## results\n", "print('mean:', np.mean(scores))\n", "print('std: ', np.std(scores))\n", "sns.boxplot(x=scores);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Bootstrap Validierung" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6321205588285577\n" ] } ], "source": [ "print(1 - 1 / np.exp(1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## experimental: share of distinct elements in one bootstrap sample\n", "## (to be compared with 1 - 1/e printed by the previous cell)\n", "pop_size = 1000000\n", "## seeded generator, so the printed share is the same on every run\n", "rng = np.random.default_rng(1234)\n", "## an integer as first argument draws from 0 .. pop_size-1 without building the population first\n", "smpl = rng.choice(pop_size, size=pop_size, replace=True)\n", "smpl_size = len(np.unique(smpl))\n", "print(smpl_size / pop_size)" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### Train - Eval - Test - Split" ] } ], "metadata": { "kernelspec": { "display_name": "teaching", "language": "python", "name": "teaching" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" }, "toc": { "base_numbering": "4.2", "nav_menu": { "height": "189px", "width": "303.333px" }, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "4.2 Validierung und mehr - Validierungstechniken", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "230.667px" }, "toc_section_display": true, "toc_window_display": true }, "toc-autonumbering": true, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, 
"r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "position": { "height": "298.85px", "left": "782px", "right": "20px", "top": "120px", "width": "350px" }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }