refactor: move things around
This commit is contained in:
@@ -0,0 +1,398 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "763373f9-1ba3-4fb6-9ae3-d0aeb6be07e2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Workshop 03 - Loesungsvorschlag**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ac5573af-46dd-43fa-886e-ca228e61edaf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data Frame"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "22a6e66a-eeaa-4ebe-aa95-ad983a2b1110",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## prepare and read data\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"datapath = '../../3_data'\n",
|
||||
"filename = 'melb_data.csv'\n",
|
||||
"from os import chdir; chdir(datapath)\n",
|
||||
"\n",
|
||||
"data = pd.read_csv(filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7c42aa1f-2ab1-43b2-a9a7-808f83dd724f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Entfernen von Beobachtungen nach Bedingung"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "71e9e7ca-c6cd-40bc-b9ff-adb11721d669",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## remove outliers on Price\n",
|
||||
"data = data[data.Price < 8000000]\n",
|
||||
"\n",
|
||||
"## remove selected observations (YearBuilt == 1196)\n",
|
||||
"data = data[data.YearBuilt != 1196]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "309c5083-cefb-4b25-91e3-1eebda35555f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Entfernen von Duplikaten"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1eb307e8-df31-4a37-a9ed-c80a1d354cf9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"hier kein Bedarf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "757d2596-04e0-445d-8ea9-9cd5c92ef702",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## remove duplicates\n",
|
||||
"data.drop_duplicates(ignore_index=True, inplace = True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "da78d619-572c-4649-8c00-fde22bcbc8bb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Entfernen fragwürdiger Variablen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "c689473c-e969-4c69-a131-4d3a680b1f83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vars_to_drop = ['Unnamed: 0', 'Suburb', 'Address', 'SellerG', 'Postcode', 'Bedroom2']\n",
|
||||
"data = data.drop(vars_to_drop, axis=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8816b6f2-1bb3-4795-9fdb-cd7a50494d09",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Einsetzen von Werten für NAs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "c637332d-24e5-48b1-bb58-ea704ee3f92e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## mode for all cat vars (if any)\n",
|
||||
"cat_feats = data.select_dtypes(include=['object']).columns\n",
|
||||
"for c in cat_feats:\n",
|
||||
" data[c].fillna(data[c].mode()[0], inplace = True)\n",
|
||||
"\n",
|
||||
"## median for all num features\n",
|
||||
"num_feats = data.select_dtypes(include=['int64', 'float64']).columns\n",
|
||||
"for c in num_feats:\n",
|
||||
" data[c].fillna(data[c].median(), inplace = True) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5e85ccf8-e99e-4630-a4e6-6d380994ab26",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Kategoriale Variablen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4b888038-88e8-46a5-988e-9f0dfd38d86d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Reduzieren der Kardinalität"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "e5a432e1-eb89-4d1e-b0cf-f799a8ac5216",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Regionname: combine rarest 3 levels to 'Victoria'\n",
|
||||
"data.Regionname = np.where(\n",
|
||||
" (data.Regionname == 'Eastern Victoria') |\n",
|
||||
" (data.Regionname == 'Northern Victoria') |\n",
|
||||
" (data.Regionname == 'Western Victoria'),\n",
|
||||
" 'Victoria', data.Regionname)\n",
|
||||
"\n",
|
||||
"## Method: combine 'SA' to 'S'\n",
|
||||
"data.Method = np.where(data.Method == 'SA', 'S', data.Method)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "dbacc9d0-d357-4020-8062-6b968aba5e9e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Nummerisieren - Faktorisieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "ec29d19a-347b-4793-b2ce-4831f7a9bfa2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.CouncilArea = data.CouncilArea.factorize()[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3937e91f-85c4-4f1c-bd50-564e8de620ea",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Nummerisieren - Ordinal Encodieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "3f250d95-862f-4016-af38-1c6ff8750db2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.Type.replace(\n",
|
||||
" ['h', 'u', 't'], \n",
|
||||
" [1, 2, 3], \n",
|
||||
" inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "154a4b0f-6e2f-498a-89f4-64e80de30fd4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Nummerisieren - Binär Encodieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0d422904-43d5-4c32-9c7b-cc16ddfbf39d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"hier kein Bedarf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "807990cc-9fac-47b7-a165-3194f142b30c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Nummerisieren - One-Hot Encodieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "91ef6cb1-157a-46e0-b9e6-0bf025b953b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## one-hot encoding\n",
|
||||
"## apply for all categorical variables except Date (will be transformed later)\n",
|
||||
"ignore = 'Date'\n",
|
||||
"sel_vars = data.select_dtypes(include=['object']).columns.drop(ignore)\n",
|
||||
"data = pd.get_dummies(data, columns=sel_vars, drop_first=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1a1dd82b-e0d9-44d2-a57b-80f4bb3b2c72",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Numerische Variablen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "607eab42-5a74-40fd-813d-eed338e93d3e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Logarithmieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "d5a6ff61-f2dc-47d0-ac10-82c811afe903",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## log-transform and rename\n",
|
||||
"data.Landsize = np.log10(data.Landsize + 1)\n",
|
||||
"data.BuildingArea = np.log10(data.BuildingArea + 1)\n",
|
||||
"data.rename(columns={\n",
|
||||
" 'Landsize' : 'logLandsize',\n",
|
||||
" 'BuildingArea' : 'logBuildingArea'\n",
|
||||
"}, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7dace176-8b5d-4533-bdd2-119e63a7bfa2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Binär umcodieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9f508469-97f4-4025-be52-45643c300bfe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"hier kein Bedarf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "177c878f-ddf4-49f9-95c8-e9cc1af76473",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Andere Tätigkeiten"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e1324181-c90f-4367-968f-1723bf2784ac",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Konstruktion"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "2e939e6a-7a96-4942-945f-4f21d0af48e5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## construct month, year and day_of_week\n",
|
||||
"Date = pd.to_datetime(data.Date, format='%d/%m/%Y')\n",
|
||||
"data['month'] = Date.dt.month\n",
|
||||
"data['year'] = Date.dt.year\n",
|
||||
"data['day_of_week'] = Date.dt.day_of_week\n",
|
||||
"data.drop('Date', axis=1, inplace=True) ## not longer used\n",
|
||||
"#print(data.info()) ## check"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3931a58a-6d47-494f-8bb2-9ea6f2914ff3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Bereinigen der Variablennamen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "70daeeee-8257-4969-b9ff-49d700251b54",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"old_names = data.columns\n",
|
||||
"new_names = old_names.str.replace('[^a-zA-Z0-9_]', '_', regex=True)\n",
|
||||
"for i in range(len(old_names)):\n",
|
||||
" data.rename(columns={old_names[i]:new_names[i]}, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bbac2f0b-1f1e-41e2-aae9-60ec4238b28c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Standardisieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "906c9083-a03d-4a21-930e-f87e21079359",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"hier kein Bedarf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "92bae929-be6a-4061-bdb1-572cc9fc8390",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Speichern unter neuem Namen"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "882cd439-5cb6-4ab3-8695-09af2670f8f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.to_csv('melb_data_prep.csv', index=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.7"
|
||||
},
|
||||
"toc-autonumbering": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,224 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"toc": true
|
||||
},
|
||||
"source": [
|
||||
"# WS 08 Regression mit Standardisieren und Logarithmieren"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## prepare env, read and prepare data\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns; sns.set()\n",
|
||||
"\n",
|
||||
"#codepath = '../2_code' ## for import of user defined module\n",
|
||||
"#datapath = '../3_data'\n",
|
||||
"codepath = '.././2_code' ## for import of user defined module\n",
|
||||
"datapath = '../../3_data'\n",
|
||||
"\n",
|
||||
"from sys import path; path.insert(1, codepath)\n",
|
||||
"from os import chdir; chdir(datapath)\n",
|
||||
"\n",
|
||||
"from bfh_cas_pml import prep_data\n",
|
||||
"X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', target='Price', seed=1234)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"-105513873.23403685\n",
|
||||
"[ 245383.60581414 -141356.39759052 -40383.66643969 161336.03949841\n",
|
||||
" 40391.14829949 83303.27089591]\n",
|
||||
"[1331246.16325189 2557493.2373921 871684.82823291 1495633.275723\n",
|
||||
" 1549557.61151302 634348.67092323]\n",
|
||||
"0.5601419746121152\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## baseline\n",
|
||||
"from sklearn.linear_model import LinearRegression\n",
|
||||
"from sklearn.metrics import r2_score\n",
|
||||
"model = LinearRegression()\n",
|
||||
"model.fit(X_train, y_train)\n",
|
||||
"y_pred = model.predict(X_test)\n",
|
||||
"\n",
|
||||
"print(model.intercept_)\n",
|
||||
"print(model.coef_[:6])\n",
|
||||
"print(y_pred[:6])\n",
|
||||
"print(r2_score(y_test, y_pred))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1055902.69523731\n",
|
||||
"[ 235020.76662584 -96493.73493151 -243470.62893089 106305.85273776\n",
|
||||
" 35544.05464669 71047.51543032]\n",
|
||||
"[1331246.16325187 2557493.23739203 871684.82823297 1495633.27572294\n",
|
||||
" 1549557.611513 634348.67092323]\n",
|
||||
"0.5601419746121148\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## scaled features\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"scaler.fit(X_train)\n",
|
||||
"X_train_sc = scaler.transform(X_train)\n",
|
||||
"X_test_sc = scaler.transform(X_test)\n",
|
||||
"\n",
|
||||
"model = LinearRegression()\n",
|
||||
"model.fit(X_train_sc, y_train)\n",
|
||||
"y_pred = model.predict(X_test_sc)\n",
|
||||
"\n",
|
||||
"print(model.intercept_)\n",
|
||||
"print(model.coef_[:6])\n",
|
||||
"print(y_pred[:6])\n",
|
||||
"print(r2_score(y_test, y_pred))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Fazit**\n",
|
||||
"* Auswirkung von Skalieren der Features\n",
|
||||
" * Koeffizienten und Intercept: Einfluss\n",
|
||||
" * Prediction: kein Einfluss\n",
|
||||
" * Score: natürlich auch kein Einfluss, wird ja aus Prediction berechnet"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.5519266421486302\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## log target\n",
|
||||
"y_train_log = np.log10(y_train)\n",
|
||||
"y_test_log = np.log10(y_test)\n",
|
||||
"\n",
|
||||
"model = LinearRegression()\n",
|
||||
"model.fit(X_train, y_train_log)\n",
|
||||
"y_pred = model.predict(X_test)\n",
|
||||
"print(r2_score(10**y_test_log, 10**y_pred))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Fazit**\n",
|
||||
"* Score wird durch Logarithmieren des Targets sogar etwas schlechter (R² 0.552 statt 0.560)\n",
|
||||
"* Kombination mit skalierten Features erübrigt sich hier, da Skalieren ja offenbar keinen Einfluss auf den Score hat"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.7"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": "1",
|
||||
"nav_menu": {},
|
||||
"number_sections": false,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": true,
|
||||
"title_cell": "WS 11 Regression - mit FE - solution",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": true,
|
||||
"toc_position": {
|
||||
"height": "calc(100% - 180px)",
|
||||
"left": "10px",
|
||||
"top": "150px",
|
||||
"width": "195.933px"
|
||||
},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": true
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
"lenName": 16,
|
||||
"lenType": 16,
|
||||
"lenVar": 40
|
||||
},
|
||||
"kernels_config": {
|
||||
"python": {
|
||||
"delete_cmd_postfix": "",
|
||||
"delete_cmd_prefix": "del ",
|
||||
"library": "var_list.py",
|
||||
"varRefreshCmd": "print(var_dic_list())"
|
||||
},
|
||||
"r": {
|
||||
"delete_cmd_postfix": ") ",
|
||||
"delete_cmd_prefix": "rm(",
|
||||
"library": "var_list.r",
|
||||
"varRefreshCmd": "cat(var_dic_list()) "
|
||||
}
|
||||
},
|
||||
"position": {
|
||||
"height": "321.85px",
|
||||
"left": "785px",
|
||||
"right": "20px",
|
||||
"top": "118px",
|
||||
"width": "350px"
|
||||
},
|
||||
"types_to_exclude": [
|
||||
"module",
|
||||
"function",
|
||||
"builtin_function_or_method",
|
||||
"instance",
|
||||
"_Feature"
|
||||
],
|
||||
"window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,212 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"toc": true
|
||||
},
|
||||
"source": [
|
||||
"# WS 14 Random Search CV"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"* untersuchen Sie Kombinationen von Parameterwerten bei RandomForestClassifier\n",
|
||||
"* Vorschlag:\n",
|
||||
" * n_estimators in [50, 100, 150, 200]\n",
|
||||
" * max_features in [3, 5, 7, 9]\n",
|
||||
" * criterion in ['gini', 'entropy']\n",
|
||||
" * min_samples_leaf in [1, 2, 3, 4]\n",
|
||||
"* wenden Sie 5-fach Kreuzvalidierung an\n",
|
||||
"* setzen Sie die Anzahl der zu untersuchenden Kombinationen auf 12\n",
|
||||
"* arbeiten Sie ohne setzen von random_state, damit anschliessend die Ergebnisse verglichen werden können"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## import libraries\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"## load data\n",
|
||||
"datapath = '../../3_data'\n",
|
||||
"from os import chdir; chdir(datapath)\n",
|
||||
"bank_df = pd.read_csv('bank_data_prep.csv')\n",
|
||||
"\n",
|
||||
"## features - target - split\n",
|
||||
"X = bank_df.drop('y', axis=1)\n",
|
||||
"y = bank_df['y']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"best_params_ : {'n_estimators': 50, 'min_samples_leaf': 4, 'max_features': 9, 'criterion': 'entropy'}\n",
|
||||
"best_score_ : 0.8884381338742393\n",
|
||||
"CPU times: total: 3.09 s\n",
|
||||
"Wall time: 43.6 s\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"## import classes from sklearn\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.model_selection import RandomizedSearchCV\n",
|
||||
"\n",
|
||||
"## define parameter grid\n",
|
||||
"parameter_grid = {'n_estimators': [50, 100, 150, 200],\n",
|
||||
" 'max_features': [3, 5, 7, 9],\n",
|
||||
" 'criterion': ['gini', 'entropy'],\n",
|
||||
" 'min_samples_leaf': [1, 2, 3, 4]}\n",
|
||||
"\n",
|
||||
"## define RandomizedSearchCV\n",
|
||||
"rscv = RandomizedSearchCV(\n",
|
||||
" estimator=RandomForestClassifier(random_state=1234), \n",
|
||||
" param_distributions=parameter_grid, \n",
|
||||
" cv=5,\n",
|
||||
" n_iter=12,\n",
|
||||
" random_state=1234,\n",
|
||||
" n_jobs=-1)\n",
|
||||
"\n",
|
||||
"## run RandomizedSearchCV\n",
|
||||
"rscv.fit(X, y)\n",
|
||||
"\n",
|
||||
"## evaluate RandomizedSearchCV\n",
|
||||
"print('best_params_ :', rscv.best_params_)\n",
|
||||
"print('best_score_ :', rscv.best_score_)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#rscv.best_estimator_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(9860, 29)"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Fazit:**\n",
|
||||
"* n_estimators: [50, 100, 150, 200] -> 50\n",
|
||||
" * hier müsste der Suchbereich nach unten erweitert werden\n",
|
||||
"* max_features: [3, 5, 7, 9] -> 9\n",
|
||||
" * hier müsste der Suchbereich nach oben erweitert werden\n",
|
||||
"* criterion: ['gini', 'entropy']\n",
|
||||
" * Suchbereich ok\n",
|
||||
"* min_samples_leaf: [1, 2, 3, 4] -> 4\n",
|
||||
" * hier müsste der Suchbereich nach oben erweitert werden"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.7"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": "",
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "WS 17 Validierung - Random Search CV",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {
|
||||
"height": "calc(100% - 180px)",
|
||||
"left": "10px",
|
||||
"top": "150px",
|
||||
"width": "195.867px"
|
||||
},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": true
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
"lenName": 16,
|
||||
"lenType": 16,
|
||||
"lenVar": 40
|
||||
},
|
||||
"kernels_config": {
|
||||
"python": {
|
||||
"delete_cmd_postfix": "",
|
||||
"delete_cmd_prefix": "del ",
|
||||
"library": "var_list.py",
|
||||
"varRefreshCmd": "print(var_dic_list())"
|
||||
},
|
||||
"r": {
|
||||
"delete_cmd_postfix": ") ",
|
||||
"delete_cmd_prefix": "rm(",
|
||||
"library": "var_list.r",
|
||||
"varRefreshCmd": "cat(var_dic_list()) "
|
||||
}
|
||||
},
|
||||
"position": {
|
||||
"height": "306.85px",
|
||||
"left": "862px",
|
||||
"right": "20px",
|
||||
"top": "137px",
|
||||
"width": "350px"
|
||||
},
|
||||
"types_to_exclude": [
|
||||
"module",
|
||||
"function",
|
||||
"builtin_function_or_method",
|
||||
"instance",
|
||||
"_Feature"
|
||||
],
|
||||
"window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user