From 2e91f30d7f37ccf0d43fc0cc88dace1e404f45b6 Mon Sep 17 00:00:00 2001 From: Felix Blanke Date: Thu, 17 Aug 2023 15:52:52 +0200 Subject: [PATCH] Add download_digital --- download_digital.py | 116 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 download_digital.py diff --git a/download_digital.py b/download_digital.py new file mode 100644 index 0000000..a4b15e0 --- /dev/null +++ b/download_digital.py @@ -0,0 +1,116 @@ +from pathlib import Path + +import pandas as pd +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.by import By +import json + +from datetime import datetime + +bundesland_dict = { + "110": "Schleswig-Holstein", + "112": "Schleswig-Holstein", + "103": "Schleswig-Holstein", + "111": "Schleswig-Holstein", + "109": "Mecklenburg-Vorpommern", + "105": "Mecklenburg-Vorpommern", + "108": "Mecklenburg-Vorpommern", + "213": "Niedersachsen", + "214": "Bremen", + "217": "Niedersachsen", + "215": "Niedersachsen", + "305": "Berlin", + "306": "Brandenburg", + "307": "Brandenburg", + "308": "Brandenburg", + "432": "NRW", + "433": "NRW", + "435": "NRW", + "437": "NRW", + "442": "NRW", + "443": "NRW", + "444": "NRW", + "445": "NRW", + "446": "NRW", + "447": "NRW", + "448": "NRW", + "506": "Rheinland-Pfalz", + "507": "Rheinland-Pfalz", + "508": "Rheinland-Pfalz", + "601": "Hessen", + "603": "Hessen", + "604": "Hessen", + "608": "Hessen", + "609": "Hessen", + "610": "Hessen", + "701": "Sachsen", + "706": "Sachsen-Anhalt", + "707": "Sachsen-Anhalt", + "712": "Thüringen", + "713": "Sachsen", + "801": "Bayern", + "802": "Bayern", + "803": "Bayern", + "804": "Bayern", + "806": "Bayern", + "808": "Bayern", + "809": "Bayern", + "810": "Bayern", + "811": "Bayern", + "812": "Bayern", + "813": "Bayern", + "814": "Bayern", + "904": "Baden-Württemberg", + "905": "Baden-Württemberg", + "912": "Baden-Württemberg", + "914": "Baden-Württemberg", + "915": "Baden-Württemberg", + "916": "Baden-Württemberg", + "917": "Baden-Württemberg", + "1001": "Hamburg", +} + + +def main( + url: str = "https://beschaeftigtenbefragung.verdi.de/", tag: str = "bez_data_2" +) -> pd.DataFrame: + options = Options() + options.add_argument("--headless") + driver = webdriver.Firefox(options=options) + driver.get(url) + + for a in driver.find_elements(By.XPATH, "//script"): + script_contents = a.get_attribute("innerHTML") + if script_contents.find(tag) >= 0: + break + + substring = script_contents[script_contents.find(tag) + len(tag) + 3 :] + bez_data = json.loads(substring[: substring.find("\n") - 1]) + + data = {} + data["Bundesland"] = pd.Series( + [bundesland_dict[k] for k in bez_data], index=list(bez_data.keys()) + ) + data["Bezirk"] = pd.Series( + [v["name"] for v in bez_data.values()], index=list(bez_data.keys()) + ) + + tot_col_data = [] + tot_col_index = [] + for k, v in bez_data.items(): + if "tot" in v: + tot_col_data.append(v["tot"]) + tot_col_index.append(k) + + data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index) + df = pd.DataFrame(data=data) + filename = f"data/{datetime.today().strftime('%Y-%m-%d')}_data.ods" + if Path(filename).exists(): + print("File already exists!") + else: + df.to_excel(filename, sheet_name="digital") + + +if __name__ == "__main__": + main()