Files
cas-pml/SL/aufgaben/template/4_WS/Loesungen/WS 03 Loesung.ipynb
T
2026-05-21 14:16:30 +02:00

399 lines
9.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "763373f9-1ba3-4fb6-9ae3-d0aeb6be07e2",
"metadata": {},
"source": [
"**Workshop 03 - Loesungsvorschlag**"
]
},
{
"cell_type": "markdown",
"id": "ac5573af-46dd-43fa-886e-ca228e61edaf",
"metadata": {},
"source": [
"# Data Frame"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "22a6e66a-eeaa-4ebe-aa95-ad983a2b1110",
"metadata": {},
"outputs": [],
"source": [
"## prepare and read data\n",
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"datapath = '../../3_data'\n",
"filename = 'melb_data.csv'\n",
"\n",
"## Work from inside the data directory; file names used without a path are\n",
"## then resolved there.\n",
"## datapath is relative, so the directory is only changed while the raw file\n",
"## is not yet reachable from the current one. Re-running this cell therefore\n",
"## no longer fails on a second chdir.\n",
"if not os.path.isfile(filename):\n",
"    os.chdir(datapath)\n",
"\n",
"data = pd.read_csv(filename)"
]
},
{
"cell_type": "markdown",
"id": "7c42aa1f-2ab1-43b2-a9a7-808f83dd724f",
"metadata": {},
"source": [
"## Entfernen von Beobachtungen nach Bedingung"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "71e9e7ca-c6cd-40bc-b9ff-adb11721d669",
"metadata": {},
"outputs": [],
"source": [
"## drop price outliers: keep only rows with Price below 8000000\n",
"price_ok = data['Price'].lt(8000000)\n",
"data = data.loc[price_ok]\n",
"\n",
"## drop the selected observation(s): rows whose YearBuilt equals 1196\n",
"year_ok = data['YearBuilt'].ne(1196)\n",
"data = data.loc[year_ok]"
]
},
{
"cell_type": "markdown",
"id": "309c5083-cefb-4b25-91e3-1eebda35555f",
"metadata": {},
"source": [
"## Entfernen von Duplikaten"
]
},
{
"cell_type": "markdown",
"id": "1eb307e8-df31-4a37-a9ed-c80a1d354cf9",
"metadata": {},
"source": [
"hier kein Bedarf"
]
},
{
"cell_type": "raw",
"id": "757d2596-04e0-445d-8ea9-9cd5c92ef702",
"metadata": {},
"source": [
"## remove duplicated rows and renumber the index\n",
"data = data.drop_duplicates(ignore_index=True)"
]
},
{
"cell_type": "markdown",
"id": "da78d619-572c-4649-8c00-fde22bcbc8bb",
"metadata": {},
"source": [
"## Entfernen fragwürdiger Variablen"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c689473c-e969-4c69-a131-4d3a680b1f83",
"metadata": {},
"outputs": [],
"source": [
"## columns removed from the data set\n",
"vars_to_drop = [\n",
"    'Unnamed: 0',\n",
"    'Suburb',\n",
"    'Address',\n",
"    'SellerG',\n",
"    'Postcode',\n",
"    'Bedroom2',\n",
"]\n",
"data = data.drop(columns=vars_to_drop)"
]
},
{
"cell_type": "markdown",
"id": "8816b6f2-1bb3-4795-9fdb-cd7a50494d09",
"metadata": {},
"source": [
"## Einsetzen von Werten für NAs"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c637332d-24e5-48b1-bb58-ea704ee3f92e",
"metadata": {},
"outputs": [],
"source": [
"## Fill missing values column by column.\n",
"## The filled column is assigned back to data: calling fillna(inplace=True)\n",
"## on data[c] is chained assignment, which pandas warns about and which\n",
"## leaves data unchanged once Copy-on-Write is active.\n",
"\n",
"## mode for all cat vars (if any)\n",
"cat_feats = data.select_dtypes(include=['object']).columns\n",
"for c in cat_feats:\n",
"    data[c] = data[c].fillna(data[c].mode()[0])\n",
"\n",
"## median for all num features\n",
"num_feats = data.select_dtypes(include=['int64', 'float64']).columns\n",
"for c in num_feats:\n",
"    data[c] = data[c].fillna(data[c].median())"
]
},
{
"cell_type": "markdown",
"id": "5e85ccf8-e99e-4630-a4e6-6d380994ab26",
"metadata": {},
"source": [
"# Kategoriale Variablen"
]
},
{
"cell_type": "markdown",
"id": "4b888038-88e8-46a5-988e-9f0dfd38d86d",
"metadata": {},
"source": [
"## Reduzieren der Kardinalität"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e5a432e1-eb89-4d1e-b0cf-f799a8ac5216",
"metadata": {},
"outputs": [],
"source": [
"## Regionname: merge the three listed regions into the single level 'Victoria'\n",
"victoria_regions = ['Eastern Victoria', 'Northern Victoria', 'Western Victoria']\n",
"in_victoria = data['Regionname'].isin(victoria_regions)\n",
"data['Regionname'] = np.where(in_victoria, 'Victoria', data['Regionname'])\n",
"\n",
"## Method: merge level 'SA' into 'S'\n",
"is_sa = data['Method'].eq('SA')\n",
"data['Method'] = np.where(is_sa, 'S', data['Method'])"
]
},
{
"cell_type": "markdown",
"id": "dbacc9d0-d357-4020-8062-6b968aba5e9e",
"metadata": {},
"source": [
"## Nummerisieren - Faktorisieren"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ec29d19a-347b-4793-b2ce-4831f7a9bfa2",
"metadata": {},
"outputs": [],
"source": [
"## replace each distinct CouncilArea label by its integer code\n",
"council_codes = data['CouncilArea'].factorize()[0]\n",
"data['CouncilArea'] = council_codes"
]
},
{
"cell_type": "markdown",
"id": "3937e91f-85c4-4f1c-bd50-564e8de620ea",
"metadata": {},
"source": [
"## Nummerisieren - Ordinal Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "3f250d95-862f-4016-af38-1c6ff8750db2",
"metadata": {},
"outputs": [],
"source": [
"## Encode Type as h -> 1, u -> 2, t -> 3.\n",
"## The result is assigned back to the column: replace(inplace=True) on\n",
"## data.Type is chained assignment and does not reliably update data.\n",
"## Values without a code pass through unchanged, so re-running the cell on\n",
"## already encoded data is harmless. astype('int64') makes the column\n",
"## numeric and raises on any level that cannot be read as an integer.\n",
"type_codes = {'h': 1, 'u': 2, 't': 3}\n",
"data['Type'] = (\n",
"    data['Type']\n",
"    .map(lambda level: type_codes.get(level, level))\n",
"    .astype('int64')\n",
")"
]
},
{
"cell_type": "markdown",
"id": "154a4b0f-6e2f-498a-89f4-64e80de30fd4",
"metadata": {},
"source": [
"## Nummerisieren - Binär Encodieren"
]
},
{
"cell_type": "markdown",
"id": "0d422904-43d5-4c32-9c7b-cc16ddfbf39d",
"metadata": {},
"source": [
"hier kein Bedarf"
]
},
{
"cell_type": "markdown",
"id": "807990cc-9fac-47b7-a165-3194f142b30c",
"metadata": {},
"source": [
"## Nummerisieren - One-Hot Encodieren"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "91ef6cb1-157a-46e0-b9e6-0bf025b953b3",
"metadata": {},
"outputs": [],
"source": [
"## one-hot encoding\n",
"## covers every object column except Date, which is transformed later;\n",
"## the first level of each variable is dropped\n",
"ignore = 'Date'\n",
"object_cols = data.select_dtypes(include=['object']).columns\n",
"sel_vars = object_cols.drop(ignore)\n",
"data = pd.get_dummies(data, columns=sel_vars, drop_first=True)"
]
},
{
"cell_type": "markdown",
"id": "1a1dd82b-e0d9-44d2-a57b-80f4bb3b2c72",
"metadata": {},
"source": [
"# Numerische Variablen"
]
},
{
"cell_type": "markdown",
"id": "607eab42-5a74-40fd-813d-eed338e93d3e",
"metadata": {},
"source": [
"## Logarithmieren"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d5a6ff61-f2dc-47d0-ac10-82c811afe903",
"metadata": {},
"outputs": [],
"source": [
"## apply log10(x + 1) to both area variables, then rename them\n",
"for area_col in ['Landsize', 'BuildingArea']:\n",
"    data[area_col] = np.log10(data[area_col] + 1)\n",
"\n",
"data = data.rename(columns={\n",
"    'Landsize': 'logLandsize',\n",
"    'BuildingArea': 'logBuildingArea',\n",
"})"
]
},
{
"cell_type": "markdown",
"id": "7dace176-8b5d-4533-bdd2-119e63a7bfa2",
"metadata": {},
"source": [
"## Binär umcodieren"
]
},
{
"cell_type": "markdown",
"id": "9f508469-97f4-4025-be52-45643c300bfe",
"metadata": {},
"source": [
"hier kein Bedarf"
]
},
{
"cell_type": "markdown",
"id": "177c878f-ddf4-49f9-95c8-e9cc1af76473",
"metadata": {},
"source": [
"# Andere Tätigkeiten"
]
},
{
"cell_type": "markdown",
"id": "e1324181-c90f-4367-968f-1723bf2784ac",
"metadata": {},
"source": [
"## Konstruktion"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "2e939e6a-7a96-4942-945f-4f21d0af48e5",
"metadata": {},
"outputs": [],
"source": [
"## derive month, year and day_of_week from Date (parsed as day/month/year),\n",
"## then drop Date itself because it is no longer needed\n",
"Date = pd.to_datetime(data['Date'], format='%d/%m/%Y')\n",
"data = data.assign(\n",
"    month=Date.dt.month,\n",
"    year=Date.dt.year,\n",
"    day_of_week=Date.dt.day_of_week,\n",
").drop(columns='Date')"
]
},
{
"cell_type": "markdown",
"id": "3931a58a-6d47-494f-8bb2-9ea6f2914ff3",
"metadata": {},
"source": [
"## Bereinigen der Variablennamen"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "70daeeee-8257-4969-b9ff-49d700251b54",
"metadata": {},
"outputs": [],
"source": [
"## replace every character outside a-z, A-Z, 0-9 and '_' in the column\n",
"## names by '_', applying all renames in one pass\n",
"old_names = data.columns\n",
"new_names = old_names.str.replace('[^a-zA-Z0-9_]', '_', regex=True)\n",
"data = data.rename(columns=dict(zip(old_names, new_names)))"
]
},
{
"cell_type": "markdown",
"id": "bbac2f0b-1f1e-41e2-aae9-60ec4238b28c",
"metadata": {},
"source": [
"## Standardisieren"
]
},
{
"cell_type": "markdown",
"id": "906c9083-a03d-4a21-930e-f87e21079359",
"metadata": {},
"source": [
"hier kein Bedarf"
]
},
{
"cell_type": "markdown",
"id": "92bae929-be6a-4061-bdb1-572cc9fc8390",
"metadata": {},
"source": [
"## Speichern unter neuem Namen"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "882cd439-5cb6-4ab3-8695-09af2670f8f3",
"metadata": {},
"outputs": [],
"source": [
"## write the prepared data set to CSV without the row index\n",
"data.to_csv(path_or_buf='melb_data_prep.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
},
"toc-autonumbering": true
},
"nbformat": 4,
"nbformat_minor": 5
}