Simplify download script

This commit is contained in:
Felix Blanke 2023-08-28 12:10:05 +02:00
parent 3bb4f432e4
commit ae6beafa3d
2 changed files with 27 additions and 28 deletions

View File

@ -71,6 +71,20 @@ bundesland_dict = {
} }
# Mapping from numeric-string district codes ("100" ... "1000") to display
# names. construct_dataframe() tests whether the first key of its input is a
# key of this mapping to decide between emitting a single "Landesbezirk"
# column and emitting "Bundesland" + "Bezirk" columns. In the code visible
# here only key membership is used; the displayed names are taken from each
# entry's "name" field rather than from these values.
# NOTE(review): presumably these are the ver.di Landesbezirk codes used by the
# survey site - confirm against the scraped data.
landesbezirk_dict = {
"100": "Nord",
"200": "Niedersachsen-Bremen",
"300": "Berlin-Brandenburg",
"400": "Nordrhein-Westfalen",
"500": "Rheinland-Pfalz-Saarland",
"600": "Hessen",
"700": "Sachsen, Sachsen-Anhalt, Thüringen",
"800": "Bayern",
"900": "Baden-Württemberg",
"1000": "Hamburg",
}
def get_bez_data( def get_bez_data(
tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/" tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/"
) -> list[dict]: ) -> list[dict]:
@ -90,22 +104,22 @@ def get_bez_data(
def construct_dataframe( def construct_dataframe(
bez_data: dict[str, dict], bez_data: dict[str, dict],
grouped: bool = False,
special_tag: str | None = None, special_tag: str | None = None,
no_processing: bool = False,
): ):
data = {} data = {}
if not no_processing:
first_key = next(iter(bez_data.keys()))
if first_key in landesbezirk_dict:
data["Landesbezirk"] = pd.Series(
[v["name"] for v in bez_data.values()], index=list(bez_data.keys())
)
else:
data["Bundesland"] = pd.Series( data["Bundesland"] = pd.Series(
[bundesland_dict[k] for k in bez_data], index=list(bez_data.keys()) [bundesland_dict[k] for k in bez_data], index=list(bez_data.keys())
) )
data["Bezirk"] = pd.Series( data["Bezirk"] = pd.Series(
[v["name"] for v in bez_data.values()], index=list(bez_data.keys()) [v["name"] for v in bez_data.values()], index=list(bez_data.keys())
) )
else:
data["Landesbezirk"] = pd.Series(
[v["name"] for v in bez_data.values()], index=list(bez_data.keys())
)
tot_col_data = [] tot_col_data = []
tot_col_index = [] tot_col_index = []
@ -121,15 +135,7 @@ def construct_dataframe(
tot_col_index.append(k) tot_col_index.append(k)
data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index) data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index)
df = pd.DataFrame(data=data) return pd.DataFrame(data=data).astype({"Digitale Befragung": "Int32"})
df = df.astype({"Digitale Befragung": "Int32"})
if grouped and no_processing:
raise ValueError
elif grouped:
df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum()
return df
def main( def main(
@ -138,7 +144,6 @@ def main(
dry_run: bool = False, dry_run: bool = False,
grouped: bool = False, grouped: bool = False,
special_tag: str | None = None, special_tag: str | None = None,
no_processing: bool = False,
folder: str = "data", folder: str = "data",
name: str = "data", name: str = "data",
sheet_name: str = "digital", sheet_name: str = "digital",
@ -146,11 +151,12 @@ def main(
bez_data = get_bez_data([tag], url)[0] bez_data = get_bez_data([tag], url)[0]
df = construct_dataframe( df = construct_dataframe(
bez_data=bez_data, bez_data=bez_data,
grouped=grouped,
special_tag=special_tag, special_tag=special_tag,
no_processing=no_processing,
) )
if grouped:
df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum()
if dry_run: if dry_run:
print(df) print(df)
else: else:

11
wsgi.py
View File

@ -30,15 +30,8 @@ cache = Cache(app)
def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]: def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]:
bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url) bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url)
df = construct_dataframe( df = construct_dataframe(bez_data=bez_data[0], special_tag="stud")
bez_data=bez_data[0], df_state = construct_dataframe(bez_data=bez_data[1])
grouped=False,
special_tag="stud",
)
df_state = construct_dataframe(
bez_data=bez_data[1], grouped=False, no_processing=True
)
return df, df_state return df, df_state