Files
cas-pml/SL/aufgaben/template/4_WS/WS 02 Feature Exploration.ipynb
T
2026-05-21 14:16:30 +02:00

235 lines
7.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# WS 02 Feature Engineering Exploration Overview.ipynb"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* Compiles the key summary statistics (data type, number of missing values, number of unique values, mode, mean and median) for each column of the loaded data frame and saves them to an Excel spreadsheet."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" index var_names dtypes nas uniques modes \\\n",
"0 0 Unnamed: 0 int64 0 18396 None \n",
"1 1 Suburb object 0 330 Reservoir \n",
"2 2 Address object 0 18134 1/1 Clarendon St \n",
"3 3 Rooms int64 0 11 None \n",
"4 4 Type object 0 3 h \n",
"5 5 Price float64 0 2470 None \n",
"6 6 Method object 0 5 S \n",
"7 7 SellerG object 0 305 Nelson \n",
"8 8 Date object 0 58 27/05/2017 \n",
"9 9 Distance float64 1 210 None \n",
"10 10 Postcode float64 1 205 None \n",
"11 11 Bedroom2 float64 3469 12 None \n",
"12 12 Bathroom float64 3471 9 None \n",
"13 13 Car float64 3576 11 None \n",
"14 14 Landsize float64 4793 1449 None \n",
"15 15 BuildingArea float64 10634 613 None \n",
"16 16 YearBuilt float64 9438 144 None \n",
"17 17 CouncilArea object 6163 33 Moreland \n",
"18 18 Lattitude float64 3332 7518 None \n",
"19 19 Longtitude float64 3332 8168 None \n",
"20 20 Regionname object 1 8 Southern Metropolitan \n",
"21 21 Propertycount float64 1 324 None \n",
"\n",
" means medians \n",
"0 1.182679e+04 11820.500000 \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 2.935040e+00 3.000000 \n",
"4 NaN NaN \n",
"5 1.056697e+06 880000.000000 \n",
"6 NaN NaN \n",
"7 NaN NaN \n",
"8 NaN NaN \n",
"9 1.038999e+01 9.700000 \n",
"10 3.107140e+03 3085.000000 \n",
"11 2.913043e+00 3.000000 \n",
"12 1.538492e+00 1.000000 \n",
"13 1.615520e+00 2.000000 \n",
"14 5.581164e+02 440.000000 \n",
"15 1.512202e+02 126.000000 \n",
"16 1.965880e+03 1970.000000 \n",
"17 NaN NaN \n",
"18 -3.780985e+01 -37.803625 \n",
"19 1.449963e+02 145.000920 \n",
"20 NaN NaN \n",
"21 7.517975e+03 6567.000000 \n"
]
}
],
"source": [
"## import libraries\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"## define data path\n",
"datapath = '../3_data'\n",
"# Build file paths from data_dir instead of calling os.chdir(datapath):\n",
"# chdir changes the working directory of the whole kernel, so running this\n",
"# cell a second time would resolve '../3_data' against the wrong directory.\n",
"data_dir = Path(datapath)\n",
"\n",
"## load data\n",
"data = pd.read_csv(data_dir / 'melb_data.csv')\n",
"\n",
"## var names\n",
"var_names = pd.Series(data.columns)\n",
"\n",
"## dtypes\n",
"dtypes = pd.Series(data.dtypes.values)\n",
"\n",
"## nas (number of missing values per column)\n",
"nas = pd.Series(data.isna().sum().values)\n",
"\n",
"## uniques (number of distinct non-missing values per column)\n",
"uniques = pd.Series(data.nunique().values)\n",
"\n",
"## numeric flag per column\n",
"# is_numeric_dtype is used instead of comparing the dtype with 'object', so\n",
"# non-numeric dtypes other than object (e.g. category) get a mode and are\n",
"# not passed to mean()/median().\n",
"is_numeric = [pd.api.types.is_numeric_dtype(dtype) for dtype in data.dtypes]\n",
"\n",
"\n",
"def first_mode(column):\n",
"    \"\"\"Return the most frequent value of column, or None if it has no non-missing values.\"\"\"\n",
"    column_modes = column.mode()\n",
"    return None if column_modes.empty else column_modes.iloc[0]\n",
"\n",
"\n",
"## modes (non-numeric columns only, None otherwise)\n",
"modes = pd.Series(\n",
"    [None if numeric else first_mode(data[name])\n",
"     for name, numeric in zip(data.columns, is_numeric)],\n",
"    dtype=object,\n",
")\n",
"\n",
"## means (numeric columns only, missing otherwise)\n",
"means = pd.Series(\n",
"    [data[name].mean() if numeric else None\n",
"     for name, numeric in zip(data.columns, is_numeric)]\n",
")\n",
"\n",
"## medians (numeric columns only, missing otherwise)\n",
"medians = pd.Series(\n",
"    [data[name].median() if numeric else None\n",
"     for name, numeric in zip(data.columns, is_numeric)]\n",
")\n",
"\n",
"## collect results\n",
"# reset_index() adds an 'index' column holding each variable's position;\n",
"# it is part of the exported sheet.\n",
"overview = pd.DataFrame(dict(\n",
"    var_names=var_names,\n",
"    dtypes=dtypes,\n",
"    nas=nas,\n",
"    uniques=uniques,\n",
"    modes=modes,\n",
"    means=means,\n",
"    medians=medians,\n",
")).reset_index()\n",
"print(overview)\n",
"\n",
"## save the overview in the data directory\n",
"overview.to_excel(data_dir / 'ws_02_overview.xlsx', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
},
"toc": {
"base_numbering": "1",
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "WS 02 Feature Engineering - Exploration kategoriale Variablen",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "180.6px"
},
"toc_section_display": true,
"toc_window_display": true
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"oldHeight": 217.64999999999998,
"position": {
"height": "238.85px",
"left": "802.2px",
"right": "20px",
"top": "116px",
"width": "326.8px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"varInspector_section_display": "block",
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}