From ae6beafa3d56e68d6e1372568d6924fc12fbfbfb Mon Sep 17 00:00:00 2001 From: Felix Blanke Date: Mon, 28 Aug 2023 12:10:05 +0200 Subject: [PATCH] Simplify download script --- download_digital.py | 44 +++++++++++++++++++++++++------------------- wsgi.py | 11 ++--------- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/download_digital.py b/download_digital.py index a59ceb8..ca3035c 100644 --- a/download_digital.py +++ b/download_digital.py @@ -71,6 +71,20 @@ bundesland_dict = { } +landesbezirk_dict = { + "100": "Nord", + "200": "Niedersachsen-Bremen", + "300": "Berlin-Brandenburg", + "400": "Nordrhein-Westfalen", + "500": "Rheinland-Pfalz-Saarland", + "600": "Hessen", + "700": "Sachsen, Sachsen-Anhalt, Thüringen", + "800": "Bayern", + "900": "Baden-Württemberg", + "1000": "Hamburg", +} + + def get_bez_data( tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/" ) -> list[dict]: @@ -90,22 +104,22 @@ def get_bez_data( def construct_dataframe( bez_data: dict[str, dict], - grouped: bool = False, special_tag: str | None = None, - no_processing: bool = False, ): data = {} - if not no_processing: + + first_key = next(iter(bez_data.keys())) + if first_key in landesbezirk_dict: + data["Landesbezirk"] = pd.Series( + [v["name"] for v in bez_data.values()], index=list(bez_data.keys()) + ) + else: data["Bundesland"] = pd.Series( [bundesland_dict[k] for k in bez_data], index=list(bez_data.keys()) ) data["Bezirk"] = pd.Series( [v["name"] for v in bez_data.values()], index=list(bez_data.keys()) ) - else: - data["Landesbezirk"] = pd.Series( - [v["name"] for v in bez_data.values()], index=list(bez_data.keys()) - ) tot_col_data = [] tot_col_index = [] @@ -121,15 +135,7 @@ def construct_dataframe( tot_col_index.append(k) data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index) - df = pd.DataFrame(data=data) - df = df.astype({"Digitale Befragung": "Int32"}) - - if grouped and no_processing: - raise ValueError - elif grouped: - df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum() - - return df + return pd.DataFrame(data=data).astype({"Digitale Befragung": "Int32"}) def main( @@ -138,7 +144,6 @@ def main( dry_run: bool = False, grouped: bool = False, special_tag: str | None = None, - no_processing: bool = False, folder: str = "data", name: str = "data", sheet_name: str = "digital", @@ -146,11 +151,12 @@ def main( bez_data = get_bez_data([tag], url)[0] df = construct_dataframe( bez_data=bez_data, - grouped=grouped, special_tag=special_tag, - no_processing=no_processing, ) + if grouped: + df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum() + if dry_run: print(df) else: diff --git a/wsgi.py b/wsgi.py index 4cec264..25b0596 100644 --- a/wsgi.py +++ b/wsgi.py @@ -30,15 +30,8 @@ cache = Cache(app) def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]: bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url) - df = construct_dataframe( - bez_data=bez_data[0], - grouped=False, - special_tag="stud", - ) - - df_state = construct_dataframe( - bez_data=bez_data[1], grouped=False, no_processing=True - ) + df = construct_dataframe(bez_data=bez_data[0], special_tag="stud") + df_state = construct_dataframe(bez_data=bez_data[1]) return df, df_state