feature: add workshop 3 solution

This commit is contained in:
2026-05-28 12:15:41 +02:00
parent f5d32750b1
commit 3d4a91052a
3 changed files with 18525 additions and 2 deletions
+1 -1
View File
@@ -86,7 +86,7 @@ workshop3
## Ausführen
```sh
cd src && python prepare.py
python src/prepare.py
```
## Offene Punkte / Selbstcheck
File diff suppressed because it is too large Load Diff
+130 -1
View File
@@ -25,6 +25,135 @@ def inspect(df: pd.DataFrame) -> None:
print(df.YearBuilt.sort_values().head(5)) # das ominöse 1196
def e1_remove_observations(
    df: pd.DataFrame,
    *,
    max_price: float = 8_000_000,
    bad_year: int = 1196,
) -> pd.DataFrame:
    """E1: Remove outliers and faulty observations.

    Keeps only rows whose ``Price`` is strictly below ``max_price`` and whose
    ``YearBuilt`` differs from ``bad_year``.

    Args:
        df: Frame with at least the columns ``Price`` and ``YearBuilt``.
        max_price: Exclusive upper bound for ``Price``.
        bad_year: ``YearBuilt`` value treated as a data-entry error.

    Returns:
        A filtered frame; ``df`` itself is left untouched.
    """
    before = len(df)
    # Comparisons against NaN evaluate to False for `<` and True for `!=`:
    # rows with a missing Price are dropped, rows with a missing YearBuilt
    # are kept (they get imputed later).
    keep = (df["Price"] < max_price) & (df["YearBuilt"] != bad_year)
    df = df[keep]
    print(f"E1: {before} -> {len(df)} Zeilen ({before - len(df)} entfernt)")
    return df
def e2_remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """E2: Report the number of fully duplicated rows.

    Verification only: the frame is returned unchanged and no row is
    dropped, because the workshop script states there is nothing to remove.
    """
    print(f"E2: {df.duplicated().sum()} Duplikate gefunden")
    # Deliberately no drop_duplicates() here.
    return df
def e3_remove_unwanted(df: pd.DataFrame) -> pd.DataFrame:
    """E3: Drop the columns that are not wanted as features.

    Raises:
        KeyError: If one of the listed columns is not present in ``df``.
    """
    unwanted = ["Unnamed: 0", "Suburb", "Address", "SellerG", "Postcode", "Bedroom2"]
    reduced = df.drop(columns=unwanted)
    print(f"E3: {len(unwanted)} Spalten verworfen ({', '.join(unwanted)})")
    return reduced
def e4_impute_nas(df: pd.DataFrame) -> pd.DataFrame:
    """E4: Fill missing values (categorical: mode, numeric: median).

    Object-dtype columns are filled with their most frequent value, int64 and
    float64 columns with their median. A categorical column without a single
    observed value has no mode and is left as is.

    Args:
        df: Frame to impute.

    Returns:
        An imputed copy; ``df`` itself is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    cat_feats = df.select_dtypes(include=["object"]).columns
    num_feats = df.select_dtypes(include=["int64", "float64"]).columns
    for c in cat_feats:
        modes = df[c].mode()
        if modes.empty:
            # All values are missing: there is nothing to impute with.
            continue
        df[c] = df[c].fillna(modes.iloc[0])
    for c in num_feats:
        df[c] = df[c].fillna(df[c].median())
    print(f"E4: NAs verbleibend: {df.isna().sum().sum()}")
    return df
def e5_reduce_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """E5: Reduce cardinality by merging rare levels.

    ``Method`` level ``"SA"`` is merged into ``"S"``; the three ``Regionname``
    levels ending in "Victoria" are merged into a single ``"Victoria"`` level.
    Missing values are left as they are.

    Args:
        df: Frame with the columns ``Method`` and ``Regionname``.

    Returns:
        A recoded copy; ``df`` itself is left untouched.
    """
    victoria = ["Eastern Victoria", "Northern Victoria", "Western Victoria"]
    # assign() builds a new frame instead of writing into the caller's one.
    df = df.assign(
        Method=df["Method"].mask(df["Method"] == "SA", "S"),
        Regionname=df["Regionname"].mask(df["Regionname"].isin(victoria), "Victoria"),
    )
    print(
        f"E5: Method-Levels: {df.Method.nunique()}, "
        f"Regionname-Levels: {df.Regionname.nunique()}"
    )
    return df
def e22_factorize(df: pd.DataFrame) -> pd.DataFrame:
    """E2.2: Factorize ``CouncilArea`` (nominal, high cardinality).

    Each distinct level is replaced by an integer code in order of first
    appearance. Missing values, if any are left, receive the code -1.

    Args:
        df: Frame with the column ``CouncilArea``.

    Returns:
        A recoded copy; ``df`` itself is left untouched.
    """
    before = df.CouncilArea.nunique()
    codes = df.CouncilArea.factorize()[0]
    # assign() builds a new frame instead of writing into the caller's one.
    df = df.assign(CouncilArea=codes)
    print(f"E2.2: CouncilArea faktorisiert ({before} Levels -> 0..{before - 1})")
    return df
def e23_ordinal(df: pd.DataFrame) -> pd.DataFrame:
    """E2.3: Ordinal-encode ``Type`` (h, u, t -> 1, 2, 3).

    Values that are not one of the three mapped levels become NaN.

    Args:
        df: Frame with the column ``Type``.

    Returns:
        A recoded copy; ``df`` itself is left untouched.
    """
    type_codes = {"h": 1, "u": 2, "t": 3}
    # assign() builds a new frame instead of writing into the caller's one.
    df = df.assign(Type=df["Type"].map(type_codes))
    print(f"E2.3: Type ordinal codiert -> {sorted(df.Type.unique())}")
    return df
def e25_onehot(df: pd.DataFrame) -> pd.DataFrame:
    """E2.5: One-hot encode the remaining categorical variables except ``Date``.

    All object-dtype columns other than ``Date`` are replaced by dummy
    columns; the first level of each variable is dropped.

    Args:
        df: Frame to encode.

    Returns:
        A new frame with the dummy columns appended.
    """
    before = df.shape[1]
    ignore = "Date"
    # errors="ignore": Date may be absent or already parsed to a non-object
    # dtype; in that case there is simply nothing to exclude.
    sel_vars = df.select_dtypes(include=["object"]).columns.drop(
        ignore, errors="ignore"
    )
    df = pd.get_dummies(df, columns=sel_vars, drop_first=True)
    print(f"E2.5: One-Hot auf {list(sel_vars)}: {before} -> {df.shape[1]} Spalten")
    return df
def e31_logarithm(df: pd.DataFrame) -> pd.DataFrame:
    """E3.1: Log-transform the right-skewed area columns and rename them.

    ``Landsize`` and ``BuildingArea`` are replaced by ``log10(x + 1)``; the
    ``+ 1`` keeps zero values finite. The columns are then renamed to
    ``logLandsize`` and ``logBuildingArea``.

    Args:
        df: Frame with the columns ``Landsize`` and ``BuildingArea``.

    Returns:
        A transformed copy; ``df`` itself is left untouched.
    """
    # assign() builds a new frame instead of writing into the caller's one.
    df = df.assign(
        Landsize=np.log10(df["Landsize"] + 1),
        BuildingArea=np.log10(df["BuildingArea"] + 1),
    )
    # log10(0 + 1) == 0, so this counts the rows whose original Landsize was 0.
    print("E3.1: Nullwerte Landsize:", (df.Landsize == 0).sum())
    return df.rename(
        columns={
            "Landsize": "logLandsize",
            "BuildingArea": "logBuildingArea",
        }
    )
def e41_construct(df: pd.DataFrame) -> pd.DataFrame:
    """E4.1: Split ``Date`` into month/year/day_of_week and drop ``Date``.

    ``Date`` is parsed with the format ``%d/%m/%Y``. ``day_of_week`` follows
    the pandas convention (Monday = 0 ... Sunday = 6).

    Args:
        df: Frame with a ``Date`` column of day/month/year strings.

    Returns:
        A new frame without ``Date`` and with the three derived columns
        appended; ``df`` itself is left untouched.
    """
    date = pd.to_datetime(df["Date"], format="%d/%m/%Y")
    # Drop first, then assign: both return new frames, so the derived columns
    # never leak into the caller's frame.
    return df.drop(columns="Date").assign(
        month=date.dt.month,
        year=date.dt.year,
        day_of_week=date.dt.day_of_week,
    )
def e42_clean_names(df: pd.DataFrame) -> pd.DataFrame:
    """E4.2: Replace disallowed characters in column names with ``_``.

    Every character other than ASCII letters, digits and the underscore is
    replaced. Names that differ only in such characters end up identical;
    this is not checked here.

    Args:
        df: Frame whose column names are strings.

    Returns:
        A copy with cleaned column names; ``df`` itself is left untouched.
    """
    # Rename on a copy so the caller's column index is not overwritten.
    df = df.copy()
    df.columns = df.columns.str.replace(r"[^a-zA-Z0-9_]", "_", regex=True)
    return df
if __name__ == "__main__":
    data = load()
    inspect(data)

    # Preparation steps, applied strictly in this order.
    pipeline = (
        # Data frame
        e1_remove_observations,
        e2_remove_duplicates,
        e3_remove_unwanted,
        e4_impute_nas,
        # Categorical variables
        e5_reduce_cardinality,
        e22_factorize,
        e23_ordinal,
        e25_onehot,
        # Numeric variables
        e31_logarithm,
        # E3.2 binary recoding: not needed (no pdays/previous equivalent)
        # Other activities
        e41_construct,
        e42_clean_names,
        # E4.3 standardizing: not needed (model dependent, belongs to training)
    )
    for step in pipeline:
        data = step(data)

    data.to_csv(OUT, index=False)
    print(f"Fertig: {OUT} geschrieben ({data.shape[0]} Zeilen, {data.shape[1]} Spalten)")