# tvstud-download-digital/download_digital.py
# 2023-08-17 15:52:52 +02:00
#
# 117 lines
# 3.0 KiB
# Python

from pathlib import Path
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import json
from datetime import datetime
# Lookup table used by main(): every key of the scraped ``bez_data`` object
# (a numeric identifier kept as a string) is mapped to the Bundesland name
# that is written to the "Bundesland" column.
#
# The table is spelled as (Bundesland, identifiers) groups and flattened by
# the comprehension below.  A Bundesland may show up in more than one group;
# the groups are kept in this sequence so that the key insertion order of
# the resulting dict is 110, 112, 103, ... , 1001.
bundesland_dict = {
    bezirk_id: bundesland
    for bundesland, bezirk_ids in (
        ("Schleswig-Holstein", ("110", "112", "103", "111")),
        ("Mecklenburg-Vorpommern", ("109", "105", "108")),
        ("Niedersachsen", ("213",)),
        ("Bremen", ("214",)),
        ("Niedersachsen", ("217", "215")),
        ("Berlin", ("305",)),
        ("Brandenburg", ("306", "307", "308")),
        (
            "NRW",
            (
                "432", "433", "435", "437", "442", "443",
                "444", "445", "446", "447", "448",
            ),
        ),
        ("Rheinland-Pfalz", ("506", "507", "508")),
        ("Hessen", ("601", "603", "604", "608", "609", "610")),
        ("Sachsen", ("701",)),
        ("Sachsen-Anhalt", ("706", "707")),
        ("Thüringen", ("712",)),
        ("Sachsen", ("713",)),
        (
            "Bayern",
            (
                "801", "802", "803", "804", "806", "808",
                "809", "810", "811", "812", "813", "814",
            ),
        ),
        (
            "Baden-Württemberg",
            ("904", "905", "912", "914", "915", "916", "917"),
        ),
        ("Hamburg", ("1001",)),
    )
    for bezirk_id in bezirk_ids
}
def _find_tagged_script(driver, tag: str) -> str:
    """Return the inner HTML of the first ``<script>`` element containing *tag*.

    Args:
        driver: Selenium WebDriver instance that has already loaded the page.
        tag: Substring that identifies the script of interest.

    Raises:
        ValueError: If no ``<script>`` element on the page contains *tag*.
    """
    for element in driver.find_elements(By.XPATH, "//script"):
        # get_attribute() may yield None; treat that as an empty script
        # instead of failing on the substring test.
        contents = element.get_attribute("innerHTML") or ""
        if tag in contents:
            return contents
    raise ValueError(f"no <script> element containing {tag!r} was found")


def _parse_tagged_json(script_contents: str, tag: str) -> dict:
    """Decode the JSON value that follows the first *tag* in *script_contents*.

    Raises:
        json.JSONDecodeError: If the text after the tag is not valid JSON.
    """
    # Skip the tag itself plus the three characters separating it from the
    # value (presumably " = " -- confirm against the page source).
    start = script_contents.find(tag) + len(tag) + 3
    payload = script_contents[start:].lstrip()
    # raw_decode() stops at the end of the first complete JSON value, so the
    # result does not depend on what trails the value (";", "\r\n", nothing).
    bez_data, _ = json.JSONDecoder().raw_decode(payload)
    return bez_data


def main(
    url: str = "https://beschaeftigtenbefragung.verdi.de/", tag: str = "bez_data_2"
) -> pd.DataFrame:
    """Scrape the per-Bezirk data embedded in *url* and store it as a spreadsheet.

    The page is loaded in headless Firefox, the ``<script>`` element that
    contains *tag* is located, and the JSON value following the tag is
    decoded.  The resulting table is indexed by the JSON keys and has the
    columns ``Bundesland`` (looked up in ``bundesland_dict``), ``Bezirk``
    (each entry's ``"name"``) and ``Digitale Befragung`` (each entry's
    ``"tot"``; entries without that key get a missing value).

    The table is written to ``data/<YYYY-MM-DD>_data.ods`` (sheet
    ``digital``) unless that file already exists, in which case a notice is
    printed and nothing is written.

    Args:
        url: Address of the page to load.
        tag: Name that precedes the JSON payload inside the page's script.

    Returns:
        The assembled DataFrame, whether or not it was written to disk.

    Raises:
        ValueError: If no script contains *tag* or the payload is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: If a key of the payload is missing from
            ``bundesland_dict`` or an entry has no ``"name"``.
    """
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        script_contents = _find_tagged_script(driver, tag)
    finally:
        # Always shut the browser down, otherwise every run leaves a
        # headless Firefox process behind.
        driver.quit()

    bez_data = _parse_tagged_json(script_contents, tag)
    keys = list(bez_data)
    # Only some entries carry a "tot" value; keep key and value paired so the
    # Series index lines up with its data.
    with_total = [(k, v["tot"]) for k, v in bez_data.items() if "tot" in v]
    data = {
        "Bundesland": pd.Series([bundesland_dict[k] for k in keys], index=keys),
        "Bezirk": pd.Series([v["name"] for v in bez_data.values()], index=keys),
        "Digitale Befragung": pd.Series(
            [total for _, total in with_total],
            index=[k for k, _ in with_total],
        ),
    }
    df = pd.DataFrame(data=data)

    filename = f"data/{datetime.today().strftime('%Y-%m-%d')}_data.ods"
    target = Path(filename)
    if target.exists():
        print("File already exists!")
    else:
        # Create the output directory on first use instead of failing.
        target.parent.mkdir(parents=True, exist_ok=True)
        # Pass the path as str: pandas picks the writer engine from the
        # file extension only for string paths.
        df.to_excel(filename, sheet_name="digital")
    return df
# Script entry point: when the file is executed directly (not imported),
# run main() with its default URL and tag.
if __name__ == "__main__":
    main()