Compare commits

...

11 Commits

Author SHA1 Message Date
Felix Blanke
092a1d7417 Format 2023-08-29 00:27:55 +02:00
Felix Blanke
014217604c Fix timezone 2023-08-29 00:27:26 +02:00
Felix Blanke
c5f6067e8b Encapsulate svg conversion 2023-08-28 17:12:06 +02:00
Felix Blanke
45647def39 Do not plot lines if no datapoint in set 2023-08-28 17:08:50 +02:00
Felix Blanke
11a4cf4248 Allow filtering for landesbezirk 2023-08-28 17:08:03 +02:00
Felix Blanke
dc80671295 Group by Landesbezirk 2023-08-28 16:56:01 +02:00
Felix Blanke
046fce6bb0 Print occurred errors 2023-08-28 12:12:58 +02:00
Felix Blanke
204195ac06 Allow multiple target lines 2023-08-28 12:12:42 +02:00
Felix Blanke
6f29bdc6da Fix index col for read data 2023-08-28 12:12:08 +02:00
Felix Blanke
7aca691596 Encapsulate df creation 2023-08-28 12:11:35 +02:00
Felix Blanke
ae6beafa3d Simplify download script 2023-08-28 12:10:05 +02:00
2 changed files with 132 additions and 84 deletions

View File

@ -71,6 +71,24 @@ bundesland_dict = {
} }
# Lookup table from Landesbezirk id to its display name.
# Keys are strings holding multiples of 100 ("100" .. "1000");
# get_landesbezirk() rounds an id down to such a multiple before looking
# it up here, and construct_dataframe() tests its first key against this
# table to decide whether the data is keyed by Landesbezirk.
landesbezirk_dict = {
"100": "Nord",
"200": "Niedersachsen-Bremen",
"300": "Berlin-Brandenburg",
"400": "Nordrhein-Westfalen",
"500": "Rheinland-Pfalz-Saarland",
"600": "Hessen",
"700": "Sachsen, Sachsen-Anhalt, Thüringen",
"800": "Bayern",
"900": "Baden-Württemberg",
"1000": "Hamburg",
}
def get_landesbezirk(id: str):
    """Return the Landesbezirk name that a numeric id belongs to.

    The id is converted with ``int()``, rounded down to a multiple of 100
    and the resulting number (as a string) is looked up in
    ``landesbezirk_dict``.

    Args:
        id: The id to resolve; anything ``int()`` accepts. The parameter
            name shadows the builtin but is kept so keyword callers
            continue to work.

    Returns:
        The name stored in ``landesbezirk_dict`` for the rounded id.

    Raises:
        ValueError: If ``id`` cannot be converted to an integer.
        KeyError: If the rounded id has no entry in ``landesbezirk_dict``.
    """
    key = str((int(id) // 100) * 100)
    try:
        return landesbezirk_dict[key]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message
        # that still makes sense when a caller only prints the exception.
        raise KeyError(
            f"no Landesbezirk known for id {id!r} (looked up {key!r})"
        ) from None
def get_bez_data( def get_bez_data(
tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/" tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/"
) -> list[dict]: ) -> list[dict]:
@ -90,22 +108,22 @@ def get_bez_data(
def construct_dataframe( def construct_dataframe(
bez_data: dict[str, dict], bez_data: dict[str, dict],
grouped: bool = False,
special_tag: str | None = None, special_tag: str | None = None,
no_processing: bool = False,
): ):
data = {} data = {}
if not no_processing:
first_key = next(iter(bez_data.keys()))
if first_key in landesbezirk_dict:
data["Landesbezirk"] = pd.Series(
[v["name"] for v in bez_data.values()], index=list(bez_data.keys())
)
else:
data["Bundesland"] = pd.Series( data["Bundesland"] = pd.Series(
[bundesland_dict[k] for k in bez_data], index=list(bez_data.keys()) [bundesland_dict[k] for k in bez_data], index=list(bez_data.keys())
) )
data["Bezirk"] = pd.Series( data["Bezirk"] = pd.Series(
[v["name"] for v in bez_data.values()], index=list(bez_data.keys()) [v["name"] for v in bez_data.values()], index=list(bez_data.keys())
) )
else:
data["Landesbezirk"] = pd.Series(
[v["name"] for v in bez_data.values()], index=list(bez_data.keys())
)
tot_col_data = [] tot_col_data = []
tot_col_index = [] tot_col_index = []
@ -121,15 +139,7 @@ def construct_dataframe(
tot_col_index.append(k) tot_col_index.append(k)
data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index) data["Digitale Befragung"] = pd.Series(tot_col_data, index=tot_col_index)
df = pd.DataFrame(data=data) return pd.DataFrame(data=data).astype({"Digitale Befragung": "Int32"})
df = df.astype({"Digitale Befragung": "Int32"})
if grouped and no_processing:
raise ValueError
elif grouped:
df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum()
return df
def main( def main(
@ -138,7 +148,6 @@ def main(
dry_run: bool = False, dry_run: bool = False,
grouped: bool = False, grouped: bool = False,
special_tag: str | None = None, special_tag: str | None = None,
no_processing: bool = False,
folder: str = "data", folder: str = "data",
name: str = "data", name: str = "data",
sheet_name: str = "digital", sheet_name: str = "digital",
@ -146,11 +155,12 @@ def main(
bez_data = get_bez_data([tag], url)[0] bez_data = get_bez_data([tag], url)[0]
df = construct_dataframe( df = construct_dataframe(
bez_data=bez_data, bez_data=bez_data,
grouped=grouped,
special_tag=special_tag, special_tag=special_tag,
no_processing=no_processing,
) )
if grouped:
df = df.groupby("Bundesland", as_index=False)[["Digitale Befragung"]].sum()
if dry_run: if dry_run:
print(df) print(df)
else: else:

134
wsgi.py
View File

@ -1,5 +1,8 @@
import datetime import datetime
import io import io
import locale
import os
import time
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
@ -11,7 +14,7 @@ import pandas as pd
from flask import Flask, Markup, render_template, request from flask import Flask, Markup, render_template, request
from flask_caching import Cache from flask_caching import Cache
from download_digital import construct_dataframe, get_bez_data from download_digital import construct_dataframe, get_bez_data, get_landesbezirk
config = { config = {
"CACHE_TYPE": "FileSystemCache", "CACHE_TYPE": "FileSystemCache",
@ -19,7 +22,9 @@ config = {
"CACHE_THRESHOLD": 50, "CACHE_THRESHOLD": 50,
"CACHE_DIR": "cache", "CACHE_DIR": "cache",
} }
import locale
os.environ["TZ"] = "Europe/Berlin"
time.tzset()
locale.setlocale(locale.LC_ALL, "de_DE.UTF-8") locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")
app = Flask(__name__) app = Flask(__name__)
@ -30,28 +35,18 @@ cache = Cache(app)
def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]: def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]:
bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url) bez_data = get_bez_data(["bez_data_0", "bez_data_2"], url)
df = construct_dataframe( df = construct_dataframe(bez_data=bez_data[0], special_tag="stud")
bez_data=bez_data[0], df_state = construct_dataframe(bez_data=bez_data[1])
grouped=False,
special_tag="stud",
)
df_state = construct_dataframe(
bez_data=bez_data[1], grouped=False, no_processing=True
)
return df, df_state return df, df_state
def plot( def create_plot_df(
current_df: pd.DataFrame | None = None, current_df: pd.DataFrame | None,
data_folder: str = "data", data_folder: str,
sheet_name: str = "digital", sheet_name: str,
total_target: int = 1500, curr_datetime,
alpha: float | None = None, ) -> pd.DataFrame:
) -> str:
curr_datetime = datetime.datetime.now()
data_dict = {} data_dict = {}
## Important: If multiple results are stored for the same date ## Important: If multiple results are stored for the same date
@ -61,31 +56,55 @@ def plot(
for f in sorted(Path(data_folder).iterdir()): for f in sorted(Path(data_folder).iterdir()):
with f.open("rb") as ff: with f.open("rb") as ff:
df = pd.read_excel(ff, sheet_name=sheet_name) df = pd.read_excel(ff, sheet_name=sheet_name, index_col=0)
if "Landesbezirk" not in df.columns:
df["Landesbezirk"] = df.index.map(get_landesbezirk)
df = df.astype({"Digitale Befragung": "Int32"}) df = df.astype({"Digitale Befragung": "Int32"})
sum_val = df[["Digitale Befragung"]].sum().iloc[0] df = df.groupby("Landesbezirk")[["Digitale Befragung"]].sum()
key = f.name[:10] key = f.name[:10]
data_dict[key] = sum_val data_dict[key] = df["Digitale Befragung"]
data_dict["2023-08-15"] = 275 df = pd.DataFrame(data=data_dict).T
series = pd.Series(data_dict.values(), index=data_dict) df.index = df.index.astype("datetime64[ns]") + pd.DateOffset(hours=10)
series.index = series.index.astype("datetime64[ns]") + pd.DateOffset(hours=10)
df = series.to_frame("Digitale Befragung")
df = df.reindex( df = df.reindex(
pd.date_range(start="2023-08-15", end=curr_datetime) pd.date_range(start="2023-08-15", end=curr_datetime) + pd.DateOffset(hours=10)
+ pd.DateOffset(hours=10)
) )
if current_df is not None: if current_df is not None:
if "Landesbezirk" not in current_df.columns:
current_df["Landesbezirk"] = current_df.index.map(get_landesbezirk)
current_df = current_df.astype({"Digitale Befragung": "Int32"}) current_df = current_df.astype({"Digitale Befragung": "Int32"})
sum_val = current_df[["Digitale Befragung"]].sum().iloc[0] current_df = current_df.groupby("Landesbezirk")[["Digitale Befragung"]].sum()
df.loc[curr_datetime] = sum_val
df.loc[curr_datetime] = current_df["Digitale Befragung"]
if pd.isna(df.loc[df.index.max()][0]): if pd.isna(df.loc[df.index.max()][0]):
df = df.drop([df.index.max()]) df = df.drop([df.index.max()])
return df
def plot(
current_df: pd.DataFrame | None = None,
data_folder: str = "data",
sheet_name: str = "digital",
total_targets: tuple[int, ...] = (1500,),
alpha: float | None = None,
landesbez_str: str | None = None,
) -> str:
curr_datetime = datetime.datetime.now()
df = create_plot_df(
current_df=current_df,
data_folder=data_folder,
sheet_name=sheet_name,
curr_datetime=curr_datetime,
)
fig = plt.figure(dpi=300) fig = plt.figure(dpi=300)
# fill weekends # fill weekends
@ -95,39 +114,51 @@ def plot(
if day.weekday() >= 5: if day.weekday() >= 5:
plt.gca().axvspan(days[idx], days[idx + 1], alpha=0.2, color="gray") plt.gca().axvspan(days[idx], days[idx + 1], alpha=0.2, color="gray")
series = df.sum(axis=1) if landesbez_str is None else df[landesbez_str]
plot_df = series.to_frame("Digitale Befragung").replace(0, np.nan)
plot_df = plot_df.astype({"Digitale Befragung": "float32"})
if not pd.isna(plot_df).all().item():
if alpha is not None: if alpha is not None:
plt.fill_between( plt.fill_between(
df.dropna().index, plot_df.dropna().index,
df.dropna()["Digitale Befragung"], plot_df.dropna()["Digitale Befragung"],
color="#e4004e", color="#e4004e",
alpha=alpha, alpha=alpha,
) )
plt.plot( plt.plot(
df.dropna().index, plot_df.dropna().index,
df.dropna()["Digitale Befragung"], plot_df.dropna()["Digitale Befragung"],
ls="--", ls="--",
marker="o", marker="o",
lw=1, lw=1,
color="#e4004e", color="#e4004e",
markersize=4, markersize=4,
label=landesbez_str,
) )
if current_df is not None: if current_df is not None:
plt.annotate( plt.annotate(
"Jetzt", "Jetzt",
(df.dropna().index[-1], df.dropna()["Digitale Befragung"][-1] * 1.03), (
plot_df.dropna().index[-1],
plot_df.dropna()["Digitale Befragung"][-1] * 1.03,
),
fontsize=8, fontsize=8,
ha="center", ha="center",
) )
plt.plot(df.index, df["Digitale Befragung"], lw=1.5, color="#e4004e") plt.plot(
plot_df.index,
plot_df["Digitale Befragung"],
lw=1.5,
color="#e4004e",
label=landesbez_str,
)
plt.title("Teilnahme an Digitaler Beschäftigtenbefragung") plt.title("Teilnahme an Digitaler Beschäftigtenbefragung")
plt.ylabel("# Teilnahmen") plt.ylabel("# Teilnahmen")
plt.ylim(0, total_target + 100) plt.ylim(0, total_targets[0] + 100)
# plt.gcf().autofmt_xdate()
# use timezone offset to center tick labels # use timezone offset to center tick labels
plt.gca().xaxis.set_major_locator( plt.gca().xaxis.set_major_locator(
@ -142,18 +173,24 @@ def plot(
plt.gca().tick_params("x", length=0, which="major") plt.gca().tick_params("x", length=0, which="major")
def val_to_perc(val): def val_to_perc(val):
return 100 * val / total_target return 100 * val / total_targets[0]
def perc_to_val(perc): def perc_to_val(perc):
return perc * total_target / 100 return perc * total_targets[0] / 100
sec_ax = plt.gca().secondary_yaxis("right", functions=(val_to_perc, perc_to_val)) sec_ax = plt.gca().secondary_yaxis("right", functions=(val_to_perc, perc_to_val))
sec_ax.set_ylabel("# Teilnahmen [% Erfolg]") sec_ax.set_ylabel("# Teilnahmen [% Erfolg]")
sec_ax.yaxis.set_major_formatter(mtick.PercentFormatter()) sec_ax.yaxis.set_major_formatter(mtick.PercentFormatter())
for total_target in total_targets:
plt.axhline(y=total_target, color="#48a9be", linestyle="--") plt.axhline(y=total_target, color="#48a9be", linestyle="--")
plt.tight_layout() plt.tight_layout()
return fig
def convert_fig_to_svg(fig: plt.Figure) -> str:
# Convert plot to SVG image # Convert plot to SVG image
imgdata = io.StringIO() imgdata = io.StringIO()
fig.savefig(imgdata, format="svg") fig.savefig(imgdata, format="svg")
@ -209,23 +246,24 @@ def tables(
df_state = df_state.sort_values("Landesbezirk") df_state = df_state.sort_values("Landesbezirk")
image = plot(df_state) fig = plot(df_state)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
except Exception: except Exception as e:
print(e)
last_file = sorted(Path("data").iterdir())[-1] last_file = sorted(Path("data").iterdir())[-1]
key = last_file.name[:10] key = last_file.name[:10]
with (Path("data") / f"{key}_data.ods").open("rb") as ff: with (Path("data") / f"{key}_data.ods").open("rb") as ff:
df = pd.read_excel(ff, sheet_name="digital").astype( df = pd.read_excel(ff, sheet_name="digital", index_col=0).astype(
{"Digitale Befragung": "Int32"} {"Digitale Befragung": "Int32"}
) )
with (Path("data") / f"{key}_state_data.ods").open("rb") as ff: with (Path("data") / f"{key}_state_data.ods").open("rb") as ff:
df_state = pd.read_excel(ff, sheet_name="digital").astype( df_state = pd.read_excel(ff, sheet_name="digital", index_col=0).astype(
{"Digitale Befragung": "Int32"} {"Digitale Befragung": "Int32"}
) )
image = plot() fig = plot()
timestamp = Markup(f'<font color="red">{key} 10:00:00</font>') timestamp = Markup(f'<font color="red">{key} 10:00:00</font>')
_print_as_html(df_state) _print_as_html(df_state)
@ -235,7 +273,7 @@ def tables(
"base.html", "base.html",
tables="\n".join(output_str), tables="\n".join(output_str),
timestamp=timestamp, timestamp=timestamp,
image=image, image=convert_fig_to_svg(fig),
) )