tvstud-download-digital/download_digital.py
2023-08-17 22:22:08 +02:00

126 lines
3.3 KiB
Python

import json
from datetime import datetime
from pathlib import Path
import fire
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
# Lookup table: district code (string) -> name of the German federal state
# ("Bundesland") the district is counted under.  main() indexes this table
# with every key of the scraped district data, so a code that is missing
# here makes the script fail with a KeyError.
# "NRW" is the abbreviation of Nordrhein-Westfalen.
# NOTE(review): the codes look like ver.di district numbers -- confirm.
bundesland_dict = {
    "110": "Schleswig-Holstein",
    "112": "Schleswig-Holstein",
    "103": "Schleswig-Holstein",
    "111": "Schleswig-Holstein",
    "109": "Mecklenburg-Vorpommern",
    "105": "Mecklenburg-Vorpommern",
    "108": "Mecklenburg-Vorpommern",
    "213": "Niedersachsen",
    "214": "Bremen",
    "217": "Niedersachsen",
    "215": "Niedersachsen",
    "305": "Berlin",
    "306": "Brandenburg",
    "307": "Brandenburg",
    "308": "Brandenburg",
    "432": "NRW",
    "433": "NRW",
    "435": "NRW",
    "437": "NRW",
    "442": "NRW",
    "443": "NRW",
    "444": "NRW",
    "445": "NRW",
    "446": "NRW",
    "447": "NRW",
    "448": "NRW",
    "506": "Rheinland-Pfalz",
    "507": "Rheinland-Pfalz",
    "508": "Rheinland-Pfalz",
    "601": "Hessen",
    "603": "Hessen",
    "604": "Hessen",
    "608": "Hessen",
    "609": "Hessen",
    "610": "Hessen",
    "701": "Sachsen",
    "706": "Sachsen-Anhalt",
    "707": "Sachsen-Anhalt",
    "712": "Thüringen",
    "713": "Sachsen",
    "801": "Bayern",
    "802": "Bayern",
    "803": "Bayern",
    "804": "Bayern",
    "806": "Bayern",
    "808": "Bayern",
    "809": "Bayern",
    "810": "Bayern",
    "811": "Bayern",
    "812": "Bayern",
    "813": "Bayern",
    "814": "Bayern",
    "904": "Baden-Württemberg",
    "905": "Baden-Württemberg",
    "912": "Baden-Württemberg",
    "914": "Baden-Württemberg",
    "915": "Baden-Württemberg",
    "916": "Baden-Württemberg",
    "917": "Baden-Württemberg",
    "1001": "Hamburg",
}
def _fetch_tagged_script(url: str, tag: str) -> str:
    """Return the contents of the first ``<script>`` element mentioning *tag*.

    Opens *url* in a headless Firefox instance and scans all script elements
    in document order.

    Args:
        url: Page to load.
        tag: Text that must occur inside the script's source.

    Returns:
        The ``innerHTML`` of the first matching script element.

    Raises:
        ValueError: If no script element on the page contains *tag*.
    """
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        for element in driver.find_elements(By.XPATH, "//script"):
            # get_attribute() is documented to return None when the attribute
            # is unavailable; treat that like an empty script.
            contents = element.get_attribute("innerHTML") or ""
            if tag in contents:
                return contents
    finally:
        # Always shut the browser down; otherwise every run (successful or
        # not) leaves a headless Firefox process behind.
        driver.quit()
    raise ValueError(f"No <script> element containing {tag!r} found at {url}")


def _extract_tagged_json(script: str, tag: str) -> dict:
    """Decode the JSON object assigned right after the first *tag* in *script*.

    Handles both ``tag = {...};`` and ``"tag": {...},`` forms.  The object may
    span several lines and need not be followed by a newline.

    Args:
        script: JavaScript source text.
        tag: Name that precedes the JSON object.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If *tag* is missing, is not directly followed by a JSON
            object, or the object is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    tag_pos = script.find(tag)
    if tag_pos < 0:
        raise ValueError(f"{tag!r} not found in script")
    tag_end = tag_pos + len(tag)
    start = script.find("{", tag_end)
    if start < 0:
        raise ValueError(f"No JSON object found after {tag!r}")
    # Only whitespace, quotes and an assignment/colon may separate the tag
    # from the object; anything else means the brace belongs to other code.
    if script[tag_end:start].strip(" \t\r\n=:\"'"):
        raise ValueError(f"{tag!r} is not directly followed by a JSON object")
    # raw_decode parses exactly one JSON value and ignores whatever follows,
    # so no guessing about trailing ';' or line breaks is needed.
    payload, _ = json.JSONDecoder().raw_decode(script, start)
    return payload


def _build_frame(bez_data: dict) -> pd.DataFrame:
    """Build the per-district table from the scraped district mapping.

    Args:
        bez_data: Mapping of district code to a dict that has a ``"name"``
            entry and optionally a ``"tot"`` entry.

    Returns:
        A frame indexed by district code with the columns ``Bundesland``,
        ``Bezirk`` and ``Digitale Befragung``.  Districts without ``"tot"``
        get a missing value in the last column.

    Raises:
        KeyError: If a district code has no entry in ``bundesland_dict``.
    """
    districts = list(bez_data)
    unknown = [code for code in districts if code not in bundesland_dict]
    if unknown:
        raise KeyError(f"District codes missing from bundesland_dict: {unknown}")
    totals = {code: entry["tot"] for code, entry in bez_data.items() if "tot" in entry}
    return pd.DataFrame(
        data={
            "Bundesland": pd.Series(
                [bundesland_dict[code] for code in districts], index=districts
            ),
            "Bezirk": pd.Series(
                [bez_data[code]["name"] for code in districts], index=districts
            ),
            "Digitale Befragung": pd.Series(
                list(totals.values()), index=list(totals)
            ),
        }
    )


def main(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
    tag: str = "bez_data_2",
    dry_run: bool = False,
    grouped: bool = False,
) -> None:
    """Scrape the district survey counts and print or store them.

    The page at *url* is loaded in headless Firefox, the JSON object named
    *tag* is pulled out of its inline scripts and turned into a table with
    one row per district.

    Args:
        url: Page that embeds the survey data.
        tag: Name of the JavaScript variable/key holding the district data.
        dry_run: If true, append a ``Total`` row and print the table instead
            of writing a file.
        grouped: If true, sum ``Digitale Befragung`` per ``Bundesland``
            before printing/writing.

    Raises:
        ValueError: If the tagged data cannot be located or decoded.
        KeyError: If a scraped district code is not in ``bundesland_dict``.
    """
    bez_data = _extract_tagged_json(_fetch_tagged_script(url, tag), tag)
    df = _build_frame(bez_data)
    if grouped:
        df = df.groupby("Bundesland")[["Digitale Befragung"]].sum()

    if dry_run:
        df.loc["Total"] = df.sum(numeric_only=True)
        print(df)
        return

    # Keep the target as a plain string: pandas derives the writer engine
    # from the extension only when it is given a str path.
    filename = f"data/{datetime.today().strftime('%Y-%m-%d')}_data.ods"
    target = Path(filename)
    if target.exists():
        print("File already exists!")
        return
    # A fresh checkout may not have the output directory yet.
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(filename, sheet_name="digital")
if __name__ == "__main__":
    # Script entry point: hand main() to python-fire, which turns its
    # parameters (url, tag, dry_run, grouped) into command-line flags.
    fire.Fire(main)