Make downloaded site reusable
This commit is contained in:
parent
6bc8c909d5
commit
195338aa89
@ -70,25 +70,27 @@ bundesland_dict = {
|
|||||||
"1001": "Hamburg",
|
"1001": "Hamburg",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def get_bez_data(tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/") -> list[dict]:
|
||||||
def construct_dataframe(
|
|
||||||
url: str = "https://beschaeftigtenbefragung.verdi.de/",
|
|
||||||
tag: str = "bez_data_2",
|
|
||||||
grouped: bool = False,
|
|
||||||
special_tag: str | None = None,
|
|
||||||
no_processing: bool = False,
|
|
||||||
):
|
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
soup = BeautifulSoup(r.text, "html.parser")
|
soup = BeautifulSoup(r.text, "html.parser")
|
||||||
|
bez_data = []
|
||||||
|
for tag in tags:
|
||||||
for a in soup.find_all("script"):
|
for a in soup.find_all("script"):
|
||||||
script_contents = a.decode_contents()
|
script_contents = a.decode_contents()
|
||||||
if script_contents.find(tag) >= 0:
|
if script_contents.find(tag) >= 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
substring = script_contents[script_contents.find(tag) + len(tag) + 3 :]
|
substring = script_contents[script_contents.find(tag) + len(tag) + 3 :]
|
||||||
bez_data = json.loads(substring[: substring.find("\n") - 1])
|
bez_data.append(json.loads(substring[: substring.find("\n") - 1]))
|
||||||
|
return bez_data
|
||||||
|
|
||||||
|
|
||||||
|
def construct_dataframe(
|
||||||
|
bez_data: dict[str, dict],
|
||||||
|
grouped: bool = False,
|
||||||
|
special_tag: str | None = None,
|
||||||
|
no_processing: bool = False,
|
||||||
|
):
|
||||||
data = {}
|
data = {}
|
||||||
if not no_processing:
|
if not no_processing:
|
||||||
data["Bundesland"] = pd.Series(
|
data["Bundesland"] = pd.Series(
|
||||||
@ -135,9 +137,9 @@ def main(
|
|||||||
special_tag: str | None = None,
|
special_tag: str | None = None,
|
||||||
no_processing: bool = False,
|
no_processing: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
bez_data = get_bez_data([tag], url)[0]
|
||||||
df = construct_dataframe(
|
df = construct_dataframe(
|
||||||
url=url,
|
bez_data=bez_data,
|
||||||
tag=tag,
|
|
||||||
grouped=grouped,
|
grouped=grouped,
|
||||||
special_tag=special_tag,
|
special_tag=special_tag,
|
||||||
no_processing=no_processing,
|
no_processing=no_processing,
|
||||||
|
|||||||
10
wsgi.py
10
wsgi.py
@ -1,7 +1,7 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from flask import Flask, render_template, request
|
from flask import Flask, render_template, request
|
||||||
|
|
||||||
from download_digital import construct_dataframe
|
from download_digital import construct_dataframe, get_bez_data
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
@ -10,16 +10,16 @@ app = Flask(__name__)
|
|||||||
def tables(
|
def tables(
|
||||||
url: str = "https://beschaeftigtenbefragung.verdi.de/",
|
url: str = "https://beschaeftigtenbefragung.verdi.de/",
|
||||||
):
|
):
|
||||||
|
bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url)
|
||||||
|
|
||||||
df = construct_dataframe(
|
df = construct_dataframe(
|
||||||
url=url,
|
bez_data=bez_data[0],
|
||||||
tag="bez_data_0",
|
|
||||||
grouped=False,
|
grouped=False,
|
||||||
special_tag="stud",
|
special_tag="stud",
|
||||||
)
|
)
|
||||||
|
|
||||||
df_state = construct_dataframe(
|
df_state = construct_dataframe(
|
||||||
url=url,
|
bez_data=bez_data[1],
|
||||||
tag="bez_data_2",
|
|
||||||
grouped=False,
|
grouped=False,
|
||||||
no_processing=True
|
no_processing=True
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user