"""Scrape ver.di Beschäftigtenbefragung participation counts and export them to a spreadsheet."""
import json
from datetime import datetime
from pathlib import Path

import fire
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Maps ver.di Bezirk (district) IDs to the German Bundesland (federal state)
# the district belongs to. Keys are string IDs as they appear in the scraped
# site payload; the "hundreds" part of an ID identifies the Landesbezirk
# (see landesbezirk_dict).
bundesland_dict = {
    "110": "Schleswig-Holstein",
    "112": "Schleswig-Holstein",
    "103": "Schleswig-Holstein",
    "111": "Schleswig-Holstein",
    "109": "Mecklenburg-Vorpommern",
    "105": "Mecklenburg-Vorpommern",
    "108": "Mecklenburg-Vorpommern",
    "213": "Niedersachsen",
    "214": "Bremen",
    "217": "Niedersachsen",
    "215": "Niedersachsen",
    "305": "Berlin",
    "306": "Brandenburg",
    "307": "Brandenburg",
    "308": "Brandenburg",
    "432": "NRW",
    "433": "NRW",
    "435": "NRW",
    "437": "NRW",
    "442": "NRW",
    "443": "NRW",
    "444": "NRW",
    "445": "NRW",
    "446": "NRW",
    "447": "NRW",
    "448": "NRW",
    "506": "Rheinland-Pfalz",
    "507": "Rheinland-Pfalz",
    "508": "Rheinland-Pfalz",
    "601": "Hessen",
    "603": "Hessen",
    "604": "Hessen",
    "608": "Hessen",
    "609": "Hessen",
    "610": "Hessen",
    "701": "Sachsen",
    "706": "Sachsen-Anhalt",
    "707": "Sachsen-Anhalt",
    "712": "Thüringen",
    "713": "Sachsen",
    "801": "Bayern",
    "802": "Bayern",
    "803": "Bayern",
    "804": "Bayern",
    "806": "Bayern",
    "808": "Bayern",
    "809": "Bayern",
    "810": "Bayern",
    "811": "Bayern",
    "812": "Bayern",
    "813": "Bayern",
    "814": "Bayern",
    "904": "Baden-Württemberg",
    "905": "Baden-Württemberg",
    "912": "Baden-Württemberg",
    "914": "Baden-Württemberg",
    "915": "Baden-Württemberg",
    "916": "Baden-Württemberg",
    "917": "Baden-Württemberg",
    "1001": "Hamburg",
}
# Maps the "hundreds" component of a Bezirk ID to the name of the ver.di
# Landesbezirk (regional division) it belongs to.
landesbezirk_dict = {
    "100": "Nord",
    "200": "Niedersachsen-Bremen",
    "300": "Berlin-Brandenburg",
    "400": "Nordrhein-Westfalen",
    "500": "Rheinland-Pfalz-Saarland",
    "600": "Hessen",
    "700": "Sachsen, Sachsen-Anhalt, Thüringen",
    "800": "Bayern",
    "900": "Baden-Württemberg",
    "1000": "Hamburg",
}


def get_landesbezirk(id: str):
    """Return the Landesbezirk name for a Bezirk *id*.

    The lookup key is the id rounded down to the nearest hundred,
    e.g. "305" -> "300" -> "Berlin-Brandenburg".
    """
    rounded = int(id) - int(id) % 100
    return landesbezirk_dict[str(rounded)]
def get_bez_data(
    tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/"
) -> list[dict]:
    """Fetch *url* and extract the JSON literal assigned to each tag.

    The page embeds its data as JavaScript assignments of the form
    ``<tag> = {...};`` inside ``<script>`` elements; the payload starts
    three characters after the tag name (the `` = ``) and runs to the end
    of that line, minus the trailing ``;``.

    Args:
        tags: JavaScript variable names to look for (e.g. "bez_data_2").
        url: Page to scrape.

    Returns:
        One parsed JSON object per tag, in the same order as *tags*.

    Raises:
        ValueError: If a tag is not found in any <script> element.
        requests.HTTPError: If the server responds with an error status.
    """
    # Fail fast on network problems instead of hanging forever or trying to
    # parse an HTML error page as data.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    bez_data = []
    for tag in tags:
        for a in soup.find_all("script"):
            script_contents = a.decode_contents()
            if script_contents.find(tag) >= 0:
                break
        else:
            # The original code fell through here and sliced whatever script
            # happened to be last (or raised UnboundLocalError when the page
            # had no scripts at all); make the failure explicit instead.
            raise ValueError(f"tag {tag!r} not found in any <script> element")

        # Skip past '<tag> = ' (tag name plus the 3 characters of ' = '),
        # then take everything up to the newline minus the trailing ';'.
        substring = script_contents[script_contents.find(tag) + len(tag) + 3 :]
        bez_data.append(json.loads(substring[: substring.find("\n") - 1]))
    return bez_data
def construct_dataframe(
    bez_data: dict[str, dict],
    special_tag: str | None = None,
) -> pd.DataFrame:
    """Build a DataFrame of participation counts from scraped *bez_data*.

    Args:
        bez_data: Mapping of region IDs to per-region dicts. Each value is
            expected to carry a ``"name"`` key and either a ``"tot"`` count
            or an ``"sp"`` sub-dict of special counts — assumption based on
            the lookups below; confirm against the live site payload.
        special_tag: When given, take counts from ``v["sp"][special_tag]``
            instead of ``v["tot"]``.

    Returns:
        DataFrame indexed by region ID, with a "Landesbezirk" column when
        the IDs are Landesbezirk keys (otherwise "Bundesland" + "Bezirk"),
        and a nullable-integer "Digitale Befragung" count column.
    """
    # Guard: an empty payload would make next(iter(...)) raise StopIteration.
    if not bez_data:
        return pd.DataFrame(columns=["Digitale Befragung"]).astype(
            {"Digitale Befragung": "Int32"}
        )

    index = list(bez_data.keys())
    names = [v["name"] for v in bez_data.values()]
    data = {}

    # Landesbezirk-level payloads are keyed by the "x00" IDs; Bezirk-level
    # payloads get the Bundesland resolved from the ID instead.
    first_key = next(iter(bez_data))
    if first_key in landesbezirk_dict:
        data["Landesbezirk"] = pd.Series(names, index=index)
    else:
        data["Bundesland"] = pd.Series(
            [bundesland_dict[k] for k in bez_data], index=index
        )
        data["Bezirk"] = pd.Series(names, index=index)

    # Collect counts only for entries that actually carry them, so regions
    # without data end up as <NA> in the final frame.
    tot_col_data = []
    tot_col_index = []
    if special_tag:
        for k, v in bez_data.items():
            if "sp" in v and special_tag in v["sp"]:
                tot_col_data.append(v["sp"][special_tag])
                tot_col_index.append(k)
    else:
        for k, v in bez_data.items():
            if "tot" in v:
                tot_col_data.append(v["tot"])
                tot_col_index.append(k)

    data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index)
    return pd.DataFrame(data=data).astype({"Digitale Befragung": "Int32"})
def main(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
    tag: str = "bez_data_2",
    dry_run: bool = False,
    grouped: bool = False,
    special_tag: str | None = None,
    folder: str = "data",
    name: str = "data",
    sheet_name: str = "digital",
) -> None:
    """Scrape the survey page and write the counts to a dated .ods file.

    Args:
        url: Page to scrape.
        tag: JavaScript variable holding the data to extract.
        dry_run: Print the DataFrame instead of writing a file.
        grouped: Sum counts per Bundesland. Only valid for Bezirk-level
            tags, which produce a "Bundesland" column.
        special_tag: Forwarded to construct_dataframe.
        folder: Output directory; created if missing.
        name: Basename of the output file.
        sheet_name: Sheet name inside the spreadsheet.
    """
    bez_data = get_bez_data([tag], url)[0]
    df = construct_dataframe(bez_data=bez_data, special_tag=special_tag)

    if grouped:
        df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum()

    if dry_run:
        print(df)
        return

    # One file per day; refuse to overwrite an existing export.
    out = Path(folder) / f"{datetime.today().strftime('%Y-%m-%d')}_{name}.ods"
    # The original crashed when the output folder did not exist yet.
    out.parent.mkdir(parents=True, exist_ok=True)
    if out.exists():
        print("File already exists!")
    else:
        df.to_excel(out, sheet_name=sheet_name)
if __name__ == "__main__":
    # Expose main()'s keyword arguments as CLI flags via python-fire.
    fire.Fire(main)