Make downloaded site reusable

This commit is contained in:
Felix Blanke 2023-08-27 00:22:44 +02:00
parent 6bc8c909d5
commit 195338aa89
2 changed files with 22 additions and 20 deletions

View File

@@ -70,25 +70,27 @@ bundesland_dict = {
"1001": "Hamburg",
}
def get_bez_data(tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/") -> list[dict]:
    """Download the survey page once and extract the embedded JSON for each tag.

    The page carries its data as JavaScript assignments inside ``<script>``
    elements (presumably of the shape ``<tag> = {...};\\n`` — the offsets below
    assume exactly that layout).

    :param tags: names of the JS variables to extract (e.g. ``"bez_data_2"``).
    :param url: page to download; defaults to the verdi survey site.
    :return: one parsed JSON object per entry in ``tags``, in input order.
    :raises ValueError: if a tag does not occur in any script on the page.
    :raises requests.HTTPError: if the page download fails.
    """
    r = requests.get(url)
    # Fail here with a clear HTTP error instead of later as a JSON parse error.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # Decode every script once, instead of re-decoding per tag.
    scripts = [a.decode_contents() for a in soup.find_all("script")]
    bez_data = []
    for tag in tags:
        for script_contents in scripts:
            pos = script_contents.find(tag)
            if pos >= 0:
                break
        else:
            # Original fell through with pos == -1 and sliced the last script
            # at a bogus offset (or crashed when there were no scripts at all);
            # fail loudly instead.
            raise ValueError(f"tag {tag!r} not found in any <script> at {url}")
        # Skip past "<tag> = " (tag plus 3 chars for " = "), then take up to
        # the end of the line, dropping the character before the newline
        # (the trailing ';' — matches the original's slicing).
        substring = script_contents[pos + len(tag) + 3 :]
        bez_data.append(json.loads(substring[: substring.find("\n") - 1]))
    return bez_data
def construct_dataframe(
url: str = "https://beschaeftigtenbefragung.verdi.de/",
tag: str = "bez_data_2",
bez_data: dict[str, dict],
grouped: bool = False,
special_tag: str | None = None,
no_processing: bool = False,
):
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
for a in soup.find_all("script"):
script_contents = a.decode_contents()
if script_contents.find(tag) >= 0:
break
substring = script_contents[script_contents.find(tag) + len(tag) + 3 :]
bez_data = json.loads(substring[: substring.find("\n") - 1])
data = {}
if not no_processing:
data["Bundesland"] = pd.Series(
@@ -135,9 +137,9 @@ def main(
special_tag: str | None = None,
no_processing: bool = False,
) -> None:
bez_data = get_bez_data([tag], url)[0]
df = construct_dataframe(
url=url,
tag=tag,
bez_data=bez_data,
grouped=grouped,
special_tag=special_tag,
no_processing=no_processing,

10
wsgi.py
View File

@@ -1,7 +1,7 @@
import pandas as pd
from flask import Flask, render_template, request
from download_digital import construct_dataframe
from download_digital import construct_dataframe, get_bez_data
app = Flask(__name__)
@@ -10,16 +10,16 @@ app = Flask(__name__)
def tables(
url: str = "https://beschaeftigtenbefragung.verdi.de/",
):
bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url)
df = construct_dataframe(
url=url,
tag="bez_data_0",
bez_data=bez_data[0],
grouped=False,
special_tag="stud",
)
df_state = construct_dataframe(
url=url,
tag="bez_data_2",
bez_data=bez_data[1],
grouped=False,
no_processing=True
)