feature: add workshop 3 solution
This commit is contained in:
@@ -86,7 +86,7 @@ workshop3
|
||||
## Ausführen
|
||||
|
||||
```sh
|
||||
cd src && python prepare.py
|
||||
python src/prepare.py
|
||||
```
|
||||
|
||||
## Offene Punkte / Selbstcheck
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -25,6 +25,135 @@ def inspect(df: pd.DataFrame) -> None:
|
||||
print(df.YearBuilt.sort_values().head(5)) # das ominöse 1196
|
||||
|
||||
|
||||
def e1_remove_observations(df: pd.DataFrame) -> pd.DataFrame:
    """E1: Drop outliers and erroneous observations.

    A row is kept when its ``Price`` is below 8,000,000 and its
    ``YearBuilt`` is not the implausible value 1196. Prints how many rows
    were removed and returns the filtered frame.
    """
    n_before = len(df)
    # NaN != 1196 is True, so rows with a missing YearBuilt are kept;
    # NaN < 8000000 is False, so rows with a missing Price are dropped.
    keep = (df.Price < 8000000) & (df.YearBuilt != 1196)
    kept = df[keep]
    n_after = len(kept)
    print(f"E1: {n_before} -> {n_after} Zeilen ({n_before - n_after} entfernt)")
    return kept
|
||||
|
||||
|
||||
def e2_remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """E2: Report the number of duplicated rows; nothing is removed.

    The frame is returned unchanged on purpose: the workshop script states
    that de-duplication is not needed, so this step only verifies that.
    """
    duplicate_mask = df.duplicated()
    n_duplicates = duplicate_mask.sum()
    print(f"E2: {n_duplicates} Duplikate gefunden")
    # Deliberately no drop_duplicates() here.
    return df
|
||||
|
||||
|
||||
def e3_remove_unwanted(df: pd.DataFrame) -> pd.DataFrame:
    """E3: Drop the columns that are not wanted for further processing.

    Returns a new frame without the listed columns and prints which columns
    were discarded. Raises ``KeyError`` if any listed column is missing.
    """
    discarded = ["Unnamed: 0", "Suburb", "Address", "SellerG", "Postcode", "Bedroom2"]
    remaining = df.drop(columns=discarded)
    names = ", ".join(discarded)
    print(f"E3: {len(discarded)} Spalten verworfen ({names})")
    return remaining
|
||||
|
||||
|
||||
def e4_impute_nas(df: pd.DataFrame) -> pd.DataFrame:
    """E4: Fill missing values: mode for categorical, median for numeric.

    Object-dtype columns are filled with their most frequent value (the
    first one if several tie); int64/float64 columns are filled with their
    median. A categorical column without any non-missing value has no mode
    and is left as it is. The input frame is not modified; an imputed copy
    is returned. Prints the number of missing values that remain.
    """
    # Work on a copy so the caller's frame is never filled in place.
    df = df.copy()
    cat_feats = df.select_dtypes(include=["object"]).columns
    num_feats = df.select_dtypes(include=["int64", "float64"]).columns
    for c in cat_feats:
        modes = df[c].mode()
        if modes.empty:
            # All values missing: there is nothing to impute with.
            continue
        df[c] = df[c].fillna(modes.iloc[0])
    for c in num_feats:
        # An all-NaN column has a NaN median, so it simply stays NaN.
        df[c] = df[c].fillna(df[c].median())
    print(f"E4: NAs verbleibend: {df.isna().sum().sum()}")
    return df
|
||||
|
||||
|
||||
def e5_reduce_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """E5: Reduce cardinality by merging rare levels.

    ``Method`` value "SA" is recoded as "S", and the three regions
    "Eastern Victoria", "Northern Victoria" and "Western Victoria" are
    merged into "Victoria". Both columns are overwritten on the passed
    frame, which is also returned. Prints the resulting level counts.
    """
    is_sa = df["Method"] == "SA"
    df["Method"] = np.where(is_sa, "S", df["Method"])

    victoria_regions = ["Eastern Victoria", "Northern Victoria", "Western Victoria"]
    in_victoria = df["Regionname"].isin(victoria_regions)
    df["Regionname"] = np.where(in_victoria, "Victoria", df["Regionname"])

    n_method = df["Method"].nunique()
    n_region = df["Regionname"].nunique()
    print(f"E5: Method-Levels: {n_method}, " f"Regionname-Levels: {n_region}")
    return df
|
||||
|
||||
|
||||
def e22_factorize(df: pd.DataFrame) -> pd.DataFrame:
    """E2.2: Factorize CouncilArea (nominal, high cardinality).

    Replaces each distinct ``CouncilArea`` value with an integer code
    assigned in order of first appearance. The column is overwritten on the
    passed frame, which is also returned.
    """
    n_levels = df["CouncilArea"].nunique()
    codes, _ = df["CouncilArea"].factorize()
    df["CouncilArea"] = codes
    print(f"E2.2: CouncilArea faktorisiert ({n_levels} Levels -> 0..{n_levels - 1})")
    return df
|
||||
|
||||
|
||||
def e23_ordinal(df: pd.DataFrame) -> pd.DataFrame:
    """E2.3: Encode Type as integers (h, u, t -> 1, 2, 3).

    The ``Type`` column is overwritten on the passed frame, which is also
    returned. Values outside the mapping become NaN, as ``Series.map``
    does for keys that are not found.
    """
    type_codes = {"h": 1, "u": 2, "t": 3}
    df["Type"] = df["Type"].map(type_codes)
    present = sorted(df["Type"].unique())
    print(f"E2.3: Type ordinal codiert -> {present}")
    return df
|
||||
|
||||
|
||||
def e25_onehot(df: pd.DataFrame) -> pd.DataFrame:
    """E2.5: One-hot encode the remaining categorical columns except Date.

    Every object-dtype column other than ``Date`` is replaced by dummy
    columns; the first level of each is dropped (``drop_first=True``).
    ``Date`` is left untouched so it can be processed separately. Prints the
    encoded columns and the column count before and after, and returns the
    encoded frame.
    """
    before = df.shape[1]
    ignore = "Date"
    # errors="ignore": Date may be absent or no longer object-typed (e.g.
    # already parsed); that must not abort the encoding step.
    sel_vars = df.select_dtypes(include=["object"]).columns.drop(
        ignore, errors="ignore"
    )
    df = pd.get_dummies(df, columns=sel_vars, drop_first=True)
    print(f"E2.5: One-Hot auf {list(sel_vars)}: {before} -> {df.shape[1]} Spalten")
    return df
|
||||
|
||||
|
||||
def e31_logarithm(df: pd.DataFrame) -> pd.DataFrame:
    """E3.1: Log-transform the area columns and rename them.

    ``Landsize`` and ``BuildingArea`` are replaced by ``log10(value + 1)``
    (the +1 keeps zero values finite) and renamed to ``logLandsize`` and
    ``logBuildingArea``. The input frame is not modified; a transformed
    copy is returned. Prints how many transformed Landsize values are 0,
    i.e. how many were 0 before the transformation.
    """
    # Copy first: otherwise the caller's frame would keep the old column
    # names but already hold log values, and a second call would log twice.
    out = df.copy()
    out["Landsize"] = np.log10(out["Landsize"] + 1)
    out["BuildingArea"] = np.log10(out["BuildingArea"] + 1)
    print("E3.1: Nullwerte Landsize:", (out["Landsize"] == 0).sum())
    return out.rename(
        columns={
            "Landsize": "logLandsize",
            "BuildingArea": "logBuildingArea",
        }
    )
|
||||
|
||||
|
||||
def e41_construct(df: pd.DataFrame) -> pd.DataFrame:
    """E4.1: Split Date into month / year / day_of_week and drop Date.

    ``Date`` is parsed with the format ``%d/%m/%Y``. The returned frame has
    the three derived columns appended and no ``Date`` column. The input
    frame is not modified.
    """
    date = pd.to_datetime(df["Date"], format="%d/%m/%Y")
    # drop() + assign() both return new frames, so the caller's frame is not
    # left with extra columns next to a still-present Date column.
    return df.drop(columns="Date").assign(
        month=date.dt.month,
        year=date.dt.year,
        day_of_week=date.dt.day_of_week,
    )
|
||||
|
||||
|
||||
def e42_clean_names(df: pd.DataFrame) -> pd.DataFrame:
    """E4.2: Replace disallowed characters in column names with "_".

    Every character that is not an ASCII letter, digit or underscore is
    replaced. The column index is reassigned on the passed frame, which is
    also returned.
    """
    original_names = df.columns
    cleaned_names = original_names.str.replace(r"[^a-zA-Z0-9_]", "_", regex=True)
    df.columns = cleaned_names
    return df
|
||||
|
||||
if __name__ == "__main__":
    data = load()
    inspect(data)

    # Preparation steps, applied strictly in this order; each step takes the
    # frame produced by the previous one.
    pipeline = (
        # Data frame
        e1_remove_observations,
        e2_remove_duplicates,
        e3_remove_unwanted,
        e4_impute_nas,
        # Categorical variables
        e5_reduce_cardinality,
        e22_factorize,
        e23_ordinal,
        e25_onehot,
        # Numeric variables
        e31_logarithm,
        # E3.2 binary recoding: not needed (no pdays/previous equivalent)
        # Other activities
        e41_construct,
        e42_clean_names,
        # E4.3 standardisation: not needed (model-dependent, belongs to training)
    )
    for step in pipeline:
        data = step(data)

    data.to_csv(OUT, index=False)
    n_rows, n_cols = data.shape[0], data.shape[1]
    print(f"Fertig: {OUT} geschrieben ({n_rows} Zeilen, {n_cols} Spalten)")
|
||||
|
||||
Reference in New Issue
Block a user