tvstud-download-digital/download_digital.py

import json
from datetime import datetime
from pathlib import Path

import fire
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Map the Bezirk (district) IDs used in the survey data to their Bundesland.
bundesland_dict = {
    "110": "Schleswig-Holstein",
    "112": "Schleswig-Holstein",
    "103": "Schleswig-Holstein",
    "111": "Schleswig-Holstein",
    "109": "Mecklenburg-Vorpommern",
    "105": "Mecklenburg-Vorpommern",
    "108": "Mecklenburg-Vorpommern",
    "213": "Niedersachsen",
    "214": "Bremen",
    "217": "Niedersachsen",
    "215": "Niedersachsen",
    "305": "Berlin",
    "306": "Brandenburg",
    "307": "Brandenburg",
    "308": "Brandenburg",
    "432": "NRW",
    "433": "NRW",
    "435": "NRW",
    "437": "NRW",
    "442": "NRW",
    "443": "NRW",
    "444": "NRW",
    "445": "NRW",
    "446": "NRW",
    "447": "NRW",
    "448": "NRW",
    "506": "Rheinland-Pfalz",
    "507": "Rheinland-Pfalz",
    "508": "Rheinland-Pfalz",
    "601": "Hessen",
    "603": "Hessen",
    "604": "Hessen",
    "608": "Hessen",
    "609": "Hessen",
    "610": "Hessen",
    "701": "Sachsen",
    "706": "Sachsen-Anhalt",
    "707": "Sachsen-Anhalt",
    "712": "Thüringen",
    "713": "Sachsen",
    "801": "Bayern",
    "802": "Bayern",
    "803": "Bayern",
    "804": "Bayern",
    "806": "Bayern",
    "808": "Bayern",
    "809": "Bayern",
    "810": "Bayern",
    "811": "Bayern",
    "812": "Bayern",
    "813": "Bayern",
    "814": "Bayern",
    "904": "Baden-Württemberg",
    "905": "Baden-Württemberg",
    "912": "Baden-Württemberg",
    "914": "Baden-Württemberg",
    "915": "Baden-Württemberg",
    "916": "Baden-Württemberg",
    "917": "Baden-Württemberg",
    "1001": "Hamburg",
}


def main(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
    tag: str = "bez_data_2",
    dry_run: bool = False,
    grouped: bool = False,
) -> None:
    # Fetch the survey page and locate the <script> block that defines the
    # JavaScript variable named by `tag`.
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    for a in soup.find_all("script"):
        script_contents = a.decode_contents()
        if script_contents.find(tag) >= 0:
            break
    else:
        raise RuntimeError(f"No <script> block containing {tag!r} found at {url}")

    # The assignment has the form `<tag> = {...};`: skip past the tag and the
    # ` = ` separator, then drop the trailing `;` before parsing the JSON.
    substring = script_contents[script_contents.find(tag) + len(tag) + 3 :]
    bez_data = json.loads(substring[: substring.find("\n") - 1])

    # Build one row per Bezirk: Bundesland, Bezirk name, and the number of
    # digital survey responses ("tot"), where available.
    data = {}
    data["Bundesland"] = pd.Series(
        [bundesland_dict[k] for k in bez_data], index=list(bez_data.keys())
    )
    data["Bezirk"] = pd.Series(
        [v["name"] for v in bez_data.values()], index=list(bez_data.keys())
    )
    tot_col_data = []
    tot_col_index = []
    for k, v in bez_data.items():
        if "tot" in v:
            tot_col_data.append(v["tot"])
            tot_col_index.append(k)
    data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index)
    df = pd.DataFrame(data=data)

    filename = f"data/{datetime.today().strftime('%Y-%m-%d')}_data.ods"
    if grouped:
        df = df.groupby("Bundesland")[["Digitale Befragung"]].sum()
    if dry_run:
        # Print the full table with a total row instead of writing a file.
        df.loc["Total"] = df.sum(numeric_only=True)
        with pd.option_context("display.max_rows", None):
            print(df.astype({"Digitale Befragung": "Int32"}))
    else:
        # Write one .ods file per day (via pandas' odfpy-based writer);
        # never overwrite an existing export.
        if Path(filename).exists():
            print("File already exists!")
        else:
            df.to_excel(filename, sheet_name="digital")


if __name__ == "__main__":
    fire.Fire(main)
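
# Example invocation (a sketch): python-fire derives command-line flags from
# main()'s keyword arguments, so the flag spellings below are assumed from the
# parameter names above.
#
#   python download_digital.py --dry_run --grouped
#   python download_digital.py --url=https://beschaeftigtenbefragung.verdi.de/ --tag=bez_data_2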