From 195338aa8902ad3573625e8f8e582b667a42a574 Mon Sep 17 00:00:00 2001 From: Felix Blanke Date: Sun, 27 Aug 2023 00:22:44 +0200 Subject: [PATCH] Make downloaded site reusable --- download_digital.py | 32 +++++++++++++++++--------------- wsgi.py | 10 +++++----- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/download_digital.py b/download_digital.py index 60563f3..1f4da91 100644 --- a/download_digital.py +++ b/download_digital.py @@ -70,25 +70,27 @@ bundesland_dict = { "1001": "Hamburg", } +def get_bez_data(tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/") -> list[dict]: + r = requests.get(url) + soup = BeautifulSoup(r.text, "html.parser") + bez_data = [] + for tag in tags: + for a in soup.find_all("script"): + script_contents = a.decode_contents() + if script_contents.find(tag) >= 0: + break + + substring = script_contents[script_contents.find(tag) + len(tag) + 3 :] + bez_data.append(json.loads(substring[: substring.find("\n") - 1])) + return bez_data + def construct_dataframe( - url: str = "https://beschaeftigtenbefragung.verdi.de/", - tag: str = "bez_data_2", + bez_data: dict[str, dict], grouped: bool = False, special_tag: str | None = None, no_processing: bool = False, ): - r = requests.get(url) - soup = BeautifulSoup(r.text, "html.parser") - - for a in soup.find_all("script"): - script_contents = a.decode_contents() - if script_contents.find(tag) >= 0: - break - - substring = script_contents[script_contents.find(tag) + len(tag) + 3 :] - bez_data = json.loads(substring[: substring.find("\n") - 1]) - data = {} if not no_processing: data["Bundesland"] = pd.Series( @@ -135,9 +137,9 @@ def main( special_tag: str | None = None, no_processing: bool = False, ) -> None: + bez_data = get_bez_data([tag], url)[0] df = construct_dataframe( - url=url, - tag=tag, + bez_data=bez_data, grouped=grouped, special_tag=special_tag, no_processing=no_processing, diff --git a/wsgi.py b/wsgi.py index 15308b0..a96acab 100644 --- a/wsgi.py +++ b/wsgi.py @@ -1,7 +1,7 @@ import pandas as pd from flask import Flask, render_template, request -from download_digital import construct_dataframe +from download_digital import construct_dataframe, get_bez_data app = Flask(__name__) @@ -10,16 +10,16 @@ app = Flask(__name__) def tables( url: str = "https://beschaeftigtenbefragung.verdi.de/", ): + bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url) + df = construct_dataframe( - url=url, - tag="bez_data_0", + bez_data=bez_data[0], grouped=False, special_tag="stud", ) df_state = construct_dataframe( - url=url, - tag="bez_data_2", + bez_data=bez_data[1], grouped=False, no_processing=True )