"""Download district data embedded in a survey web page and export it.

The page (by default https://beschaeftigtenbefragung.verdi.de/) is fetched,
the JSON object assigned to a named variable inside one of its ``<script>``
elements is parsed, and the per-district counts are turned into a pandas
DataFrame that is either printed or written to an ``.ods`` spreadsheet.
"""

import json
from datetime import datetime
from pathlib import Path

import fire
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Seconds to wait for the survey page before giving up instead of hanging.
_REQUEST_TIMEOUT_SECONDS = 30

# Number of characters skipped between the tag name and the JSON payload.
# NOTE(review): presumably the " = " of a JavaScript assignment - confirm
# against the page source.
_SEPARATOR_LENGTH = 3

# Maps a Bezirk id (the keys of the scraped Bezirk-level data) to the
# Bundesland under which that Bezirk is reported.
bundesland_dict = {
    "110": "Schleswig-Holstein",
    "112": "Schleswig-Holstein",
    "103": "Schleswig-Holstein",
    "111": "Schleswig-Holstein",
    "109": "Mecklenburg-Vorpommern",
    "105": "Mecklenburg-Vorpommern",
    "108": "Mecklenburg-Vorpommern",
    "213": "Niedersachsen",
    "214": "Bremen",
    "217": "Niedersachsen",
    "215": "Niedersachsen",
    "305": "Berlin",
    "306": "Brandenburg",
    "307": "Brandenburg",
    "308": "Brandenburg",
    "432": "NRW",
    "433": "NRW",
    "435": "NRW",
    "437": "NRW",
    "442": "NRW",
    "443": "NRW",
    "444": "NRW",
    "445": "NRW",
    "446": "NRW",
    "447": "NRW",
    "448": "NRW",
    "506": "Rheinland-Pfalz",
    "507": "Rheinland-Pfalz",
    "508": "Rheinland-Pfalz",
    "601": "Hessen",
    "603": "Hessen",
    "604": "Hessen",
    "608": "Hessen",
    "609": "Hessen",
    "610": "Hessen",
    "701": "Sachsen",
    "706": "Sachsen-Anhalt",
    "707": "Sachsen-Anhalt",
    "712": "Thüringen",
    "713": "Sachsen",
    "801": "Bayern",
    "802": "Bayern",
    "803": "Bayern",
    "804": "Bayern",
    "806": "Bayern",
    "808": "Bayern",
    "809": "Bayern",
    "810": "Bayern",
    "811": "Bayern",
    "812": "Bayern",
    "813": "Bayern",
    "814": "Bayern",
    "904": "Baden-Württemberg",
    "905": "Baden-Württemberg",
    "912": "Baden-Württemberg",
    "914": "Baden-Württemberg",
    "915": "Baden-Württemberg",
    "916": "Baden-Württemberg",
    "917": "Baden-Württemberg",
    "1001": "Hamburg",
}

# Maps a Landesbezirk id (a multiple of 100) to the Landesbezirk's name.
landesbezirk_dict = {
    "100": "Nord",
    "200": "Niedersachsen-Bremen",
    "300": "Berlin-Brandenburg",
    "400": "Nordrhein-Westfalen",
    "500": "Rheinland-Pfalz-Saarland",
    "600": "Hessen",
    "700": "Sachsen, Sachsen-Anhalt, Thüringen",
    "800": "Bayern",
    "900": "Baden-Württemberg",
    "1000": "Hamburg",
}


def get_landesbezirk(id: str) -> str:
    """Return the Landesbezirk name for a Bezirk or Landesbezirk id.

    The id is rounded down to the nearest multiple of 100 and looked up in
    ``landesbezirk_dict`` (e.g. ``"432"`` -> ``"400"``).

    Args:
        id: Numeric id as a string. The name shadows the builtin but is kept
            so that keyword callers keep working.

    Raises:
        ValueError: If ``id`` is not an integer literal.
        KeyError: If the rounded id is not in ``landesbezirk_dict``.
    """
    landesbezirk_id = (int(id) // 100) * 100
    return landesbezirk_dict[str(landesbezirk_id)]


def _extract_tag_payload(scripts: list[str], tag: str) -> dict:
    """Parse the JSON that follows ``tag`` in the first script containing it."""
    for contents in scripts:
        tag_start = contents.find(tag)
        if tag_start < 0:
            continue
        payload = contents[tag_start + len(tag) + _SEPARATOR_LENGTH:]
        # The JSON is expected on a single line; the final character of that
        # line (the statement terminator) is dropped before parsing. Using
        # partition keeps this correct when the script has no trailing
        # newline.
        line = payload.partition("\n")[0]
        return json.loads(line[:-1])
    raise ValueError(f"tag {tag!r} was not found in any <script> element")


def get_bez_data(
    tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/"
) -> list[dict]:
    """Fetch ``url`` and return the JSON object embedded for each tag.

    For every tag the first ``<script>`` element whose contents include the
    tag is used; the text following the tag on that line is parsed as JSON.

    Args:
        tags: Variable names to look for inside the page's scripts.
        url: Page to download.

    Returns:
        One parsed JSON value per entry of ``tags``, in the same order.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
        ValueError: If a tag does not occur in any script, or (as
            ``json.JSONDecodeError``) if the text after it is not valid JSON.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Decode every script once instead of once per tag.
    scripts = [script.decode_contents() for script in soup.find_all("script")]
    return [_extract_tag_payload(scripts, tag) for tag in tags]


def construct_dataframe(
    bez_data: dict[str, dict],
    special_tag: str | None = None,
) -> pd.DataFrame:
    """Build a DataFrame of survey counts indexed by district id.

    If the first key of ``bez_data`` is a Landesbezirk id, the frame has the
    columns ``Landesbezirk`` and ``Digitale Befragung``. Otherwise the keys
    are treated as Bezirk ids and the columns are ``Bundesland``, ``Bezirk``
    and ``Digitale Befragung``.

    Args:
        bez_data: Maps an id to a record with a ``"name"`` entry, optionally
            a ``"tot"`` count and optionally an ``"sp"`` mapping of special
            tags to counts.
        special_tag: When given (and non-empty), counts are read from
            ``record["sp"][special_tag]`` instead of ``record["tot"]``.

    Returns:
        The DataFrame; ids without a count get a missing value in the
        nullable ``Int32`` column ``Digitale Befragung``.

    Raises:
        ValueError: If ``bez_data`` is empty.
        KeyError: If a Bezirk id is missing from ``bundesland_dict`` or a
            record has no ``"name"``.
    """
    if not bez_data:
        raise ValueError("bez_data must contain at least one entry")

    ids = list(bez_data)
    names = pd.Series([record["name"] for record in bez_data.values()], index=ids)

    data = {}
    if ids[0] in landesbezirk_dict:
        data["Landesbezirk"] = names
    else:
        data["Bundesland"] = pd.Series(
            [bundesland_dict[key] for key in ids], index=ids
        )
        data["Bezirk"] = names

    if special_tag:
        counts = {
            key: record["sp"][special_tag]
            for key, record in bez_data.items()
            if "sp" in record and special_tag in record["sp"]
        }
    else:
        counts = {
            key: record["tot"]
            for key, record in bez_data.items()
            if "tot" in record
        }
    data["Digitale Befragung"] = pd.Series(
        list(counts.values()), index=list(counts)
    )

    return pd.DataFrame(data=data).astype({"Digitale Befragung": "Int32"})


def main(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
    tag: str = "bez_data_2",
    dry_run: bool = False,
    grouped: bool = False,
    special_tag: str | None = None,
    folder: str = "data",
    name: str = "data",
    sheet_name: str = "digital",
) -> None:
    """Scrape the survey counts and print them or save them as a spreadsheet.

    Args:
        url: Page to download.
        tag: Name of the script variable holding the district data.
        dry_run: Print the table instead of writing a file.
        grouped: Sum the counts per Bundesland. This needs the
            ``Bundesland`` column, which only Bezirk-level data provides.
        special_tag: Read the counts of this special tag instead of the
            totals.
        folder: Output directory; created if it does not exist.
        name: Suffix of the output file name.
        sheet_name: Name of the sheet inside the spreadsheet.
    """
    bez_data = get_bez_data([tag], url)[0]
    df = construct_dataframe(bez_data=bez_data, special_tag=special_tag)

    if grouped:
        df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum()

    if dry_run:
        print(df)
        return

    filename = f"{folder}/{datetime.today().strftime('%Y-%m-%d')}_{name}.ods"
    target = Path(filename)
    if target.exists():
        # Never overwrite an export that was already written today.
        print("File already exists!")
        return

    # Without this the very first run fails because the folder is missing.
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(filename, sheet_name=sheet_name)


if __name__ == "__main__":
    fire.Fire(main)