"""Scrape the per-district data embedded in the ver.di survey page into a table.

The page at the default URL carries a JavaScript assignment named by ``tag``
whose value is a JSON object keyed by district code.  This script extracts
that object, turns it into a pandas DataFrame and either prints it or writes
it to a dated ``.ods`` file under ``data/``.
"""

import json
from datetime import datetime
from pathlib import Path

import fire
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT_SECONDS = 30

# District code (the keys of the embedded JSON object) -> federal state name.
bundesland_dict = {
    "110": "Schleswig-Holstein",
    "112": "Schleswig-Holstein",
    "103": "Schleswig-Holstein",
    "111": "Schleswig-Holstein",
    "109": "Mecklenburg-Vorpommern",
    "105": "Mecklenburg-Vorpommern",
    "108": "Mecklenburg-Vorpommern",
    "213": "Niedersachsen",
    "214": "Bremen",
    "217": "Niedersachsen",
    "215": "Niedersachsen",
    "305": "Berlin",
    "306": "Brandenburg",
    "307": "Brandenburg",
    "308": "Brandenburg",
    "432": "NRW",
    "433": "NRW",
    "435": "NRW",
    "437": "NRW",
    "442": "NRW",
    "443": "NRW",
    "444": "NRW",
    "445": "NRW",
    "446": "NRW",
    "447": "NRW",
    "448": "NRW",
    "506": "Rheinland-Pfalz",
    "507": "Rheinland-Pfalz",
    "508": "Rheinland-Pfalz",
    "601": "Hessen",
    "603": "Hessen",
    "604": "Hessen",
    "608": "Hessen",
    "609": "Hessen",
    "610": "Hessen",
    "701": "Sachsen",
    "706": "Sachsen-Anhalt",
    "707": "Sachsen-Anhalt",
    "712": "Thüringen",
    "713": "Sachsen",
    "801": "Bayern",
    "802": "Bayern",
    "803": "Bayern",
    "804": "Bayern",
    "806": "Bayern",
    "808": "Bayern",
    "809": "Bayern",
    "810": "Bayern",
    "811": "Bayern",
    "812": "Bayern",
    "813": "Bayern",
    "814": "Bayern",
    "904": "Baden-Württemberg",
    "905": "Baden-Württemberg",
    "912": "Baden-Württemberg",
    "914": "Baden-Württemberg",
    "915": "Baden-Württemberg",
    "916": "Baden-Württemberg",
    "917": "Baden-Württemberg",
    "1001": "Hamburg",
}


def _extract_bez_data(html: str, tag: str) -> dict:
    """Return the JSON object that follows ``tag`` in the first matching script.

    Args:
        html: Full HTML text of the page.
        tag: Name that precedes the JSON object inside a ``<script>`` element.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no ``<script>`` element contains ``tag``, if no ``{``
            follows it, or if the text there is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    soup = BeautifulSoup(html, "html.parser")
    for script in soup.find_all("script"):
        script_contents = script.decode_contents()
        tag_pos = script_contents.find(tag)
        if tag_pos >= 0:
            break
    else:
        # Also reached when the page has no <script> elements at all.
        raise ValueError(f"No <script> element containing {tag!r} was found")

    json_start = script_contents.find("{", tag_pos + len(tag))
    if json_start < 0:
        raise ValueError(f"No JSON object follows {tag!r} in the matching script")

    # raw_decode stops at the end of the object, so whatever trails it
    # (";", "\r\n", further statements, or nothing at all) does not matter.
    bez_data, _ = json.JSONDecoder().raw_decode(script_contents, json_start)
    return bez_data


def construct_dataframe(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
    tag: str = "bez_data_2",
    grouped: bool = False,
) -> pd.DataFrame:
    """Download the page and build a DataFrame from its embedded district data.

    Args:
        url: Page to download.
        tag: Name preceding the embedded JSON object in one of the page's
            ``<script>`` elements.
        grouped: If true, return the sum of "Digitale Befragung" per
            "Bundesland" instead of one row per district.

    Returns:
        A DataFrame indexed by district code with the columns "Bundesland",
        "Bezirk" (each entry's ``name``) and "Digitale Befragung" (each
        entry's ``tot``; missing where an entry has no ``tot``).  With
        ``grouped`` it has the columns "Bundesland" and "Digitale Befragung".

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the embedded data cannot be located or decoded.
        KeyError: If a district code has no entry in ``bundesland_dict``.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Fail here rather than trying to parse an error page further down.
    response.raise_for_status()
    bez_data = _extract_bez_data(response.text, tag)

    unknown_codes = [code for code in bez_data if code not in bundesland_dict]
    if unknown_codes:
        raise KeyError(
            "No Bundesland mapping for district code(s): "
            + ", ".join(map(str, unknown_codes))
        )

    codes = list(bez_data)
    # Not every entry carries "tot"; the DataFrame constructor aligns the
    # shorter series on the index and leaves the gaps empty.
    codes_with_tot = [code for code, entry in bez_data.items() if "tot" in entry]

    data = {
        "Bundesland": pd.Series(
            [bundesland_dict[code] for code in codes], index=codes
        ),
        "Bezirk": pd.Series(
            [bez_data[code]["name"] for code in codes], index=codes
        ),
        "Digitale Befragung": pd.Series(
            [bez_data[code]["tot"] for code in codes_with_tot],
            index=codes_with_tot,
        ),
    }
    df = pd.DataFrame(data=data)

    if grouped:
        df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum()

    return df


def main(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
    tag: str = "bez_data_2",
    dry_run: bool = False,
    grouped: bool = False,
) -> None:
    """Build the table and print it or save it to today's ``.ods`` file.

    Args:
        url: Page to download.
        tag: Name preceding the embedded JSON object in the page.
        dry_run: If true, print the table instead of writing a file.
        grouped: If true, aggregate per "Bundesland" (see
            ``construct_dataframe``).

    The output file is ``data/<YYYY-MM-DD>_data.ods`` with sheet name
    "digital".  An existing file for the same day is never overwritten.
    """
    df = construct_dataframe(url=url, tag=tag, grouped=grouped)

    if dry_run:
        print(df)
        return

    filename = f"data/{datetime.today().strftime('%Y-%m-%d')}_data.ods"
    target = Path(filename)
    if target.exists():
        print("File already exists!")
        return

    # A fresh checkout may not have the output directory yet.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Pass the plain string so pandas picks the writer from the ".ods" suffix.
    df.to_excel(filename, sheet_name="digital")


if __name__ == "__main__":
    fire.Fire(main)