2023-08-27 19:06:34 +02:00

228 lines
6.4 KiB
Python

import base64
import datetime
import io
from itertools import chain
from pathlib import Path
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
from flask import Flask, render_template, request
from flask_caching import Cache
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from download_digital import construct_dataframe, get_bez_data
# Settings handed to the Flask app below and picked up by Flask-Caching:
# a file-system backed cache stored in the "cache" directory.
config = {
    "CACHE_TYPE": "FileSystemCache",
    "CACHE_DEFAULT_TIMEOUT": 300,
    "CACHE_THRESHOLD": 1000,
    "CACHE_DIR": "cache",
}

import locale

# Switch the whole process to the German locale.
# NOTE(review): presumably so that strftime-style labels (e.g. the "%a"
# weekday in the plot's tick formatter) are rendered in German -- confirm.
# This raises locale.Error when "de_DE.UTF-8" is not installed on the host.
locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")

# Application object and its cache; both are module-level singletons used by
# the route decorators further down.
app = Flask(__name__)
app.config.from_mapping(config)
cache = Cache(app)
def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Download the survey data from *url* and build two data frames.

    The data sets ``"bez_data_0"`` and ``"bez_data_2"`` are requested via
    ``get_bez_data``. The first result is turned into a data frame with
    ``special_tag="stud"``; the second one is converted with
    ``no_processing=True``. Neither frame is grouped.

    Args:
        url: Address passed through to ``get_bez_data``.

    Returns:
        A ``(df, df_state)`` pair: the frame built from the first data set
        followed by the frame built from the second one.
        NOTE(review): presumably per-district and per-state results --
        confirm against ``download_digital``.
    """
    raw_tables = get_bez_data(["bez_data_0", "bez_data_2"], url)
    return (
        construct_dataframe(
            bez_data=raw_tables[0],
            grouped=False,
            special_tag="stud",
        ),
        construct_dataframe(
            bez_data=raw_tables[1],
            grouped=False,
            no_processing=True,
        ),
    )
def plot(
    current_df: pd.DataFrame | None = None,
    data_folder: str = "data",
    sheet_name: str = "digital",
    total_target: int = 1500,
    plot_all: bool = False,
    alpha: float | None = None,
) -> str:
    """Render the participation history as a base64 PNG data URI.

    Every file in *data_folder* is read as an Excel workbook; the sum of its
    "Digitale Befragung" column becomes the value for the date given by the
    first ten characters of the file name (``YYYY-MM-DD``). The value for
    2023-08-15 is always set to 275. If *current_df* is given, the sum of
    its "Digitale Befragung" column is appended as a data point at the
    current time and labelled "Jetzt".

    Args:
        current_df: Optional live data whose column sum is plotted as the
            most recent point.
        data_folder: Directory containing the stored Excel snapshots.
        sheet_name: Worksheet to read from each snapshot.
        total_target: Participation target; drawn as a horizontal line and
            used as the 100 % mark of the secondary axis.
        plot_all: Currently unused; kept so existing callers keep working.
        alpha: If given, the area under the curve is filled with this
            opacity.

    Returns:
        A ``data:image/png;base64,...`` string containing the rendered plot.
    """
    column = "Digitale Befragung"

    # Important: if multiple results are stored for the same date, the last
    # one (in sorted file-name order) is used. This relies on the
    # Landesbezirk data being stored under a file name that is
    # lexicographically larger than the single-district results.
    totals_by_date = {}
    for path in sorted(Path(data_folder).iterdir()):
        with path.open("rb") as handle:
            sheet = pd.read_excel(handle, sheet_name=sheet_name)
        sheet = sheet.astype({column: "Int32"})
        totals_by_date[path.name[:10]] = sheet[[column]].sum().iloc[0]
    totals_by_date["2023-08-15"] = 275
    last_date = max(totals_by_date)

    # Place every data point at 10:00 of its day.
    # NOTE(review): presumably the time of day the snapshots were taken.
    history = pd.Series(totals_by_date.values(), index=totals_by_date)
    history.index = history.index.astype("datetime64[ns]") + pd.DateOffset(hours=10)
    df = history.to_frame(column)
    # One row per calendar day; days without a snapshot become NaN.
    df = df.reindex(
        pd.date_range(start="2023-08-15", end=last_date) + pd.DateOffset(hours=10)
    )

    if current_df is not None:
        current_df = current_df.astype({column: "Int32"})
        df.loc[datetime.datetime.now()] = current_df[[column]].sum().iloc[0]

    # Only the days that actually have a value.
    measured = df.dropna()

    fig = plt.figure(dpi=300)
    try:
        ax = plt.gca()

        # Shade weekends.
        span_end = datetime.datetime.strptime(
            last_date, "%Y-%m-%d"
        ) + datetime.timedelta(days=1)
        days = pd.date_range(start="2023-08-14", end=span_end)
        for day, next_day in zip(days[:-1], days[1:]):
            if day.weekday() >= 5:
                ax.axvspan(day, next_day, alpha=0.2, color="gray")

        if alpha is not None:
            plt.fill_between(
                measured.index,
                measured[column],
                color="#e4004e",
                alpha=alpha,
            )
        # Dashed line with markers through the known points; it bridges the
        # gaps that the solid line below leaves at NaN days.
        plt.plot(
            measured.index,
            measured[column],
            ls="--",
            marker="o",
            lw=1,
            color="#e4004e",
            markersize=4,
        )
        if current_df is not None:
            # .iloc is required here: integer keys on a datetime-indexed
            # Series are label lookups, not positions.
            plt.annotate(
                "Jetzt",
                (measured.index[-1], measured[column].iloc[-1] * 1.03),
                fontsize=8,
                ha="center",
            )
        plt.plot(df.index, df[column], lw=1.5, color="#e4004e")

        plt.title("Teilnahme an Digitaler Beschäftigtenbefragung")
        plt.ylabel("# Teilnahmen")
        plt.ylim(0, total_target + 100)

        # Use a timezone offset to center the tick labels.
        ax.xaxis.set_major_locator(
            mdates.WeekdayLocator([mdates.TU], tz="Etc/GMT+12")
        )
        ax.xaxis.set_minor_locator(mdates.DayLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%a %d.%m."))

        plt.grid(True, which="major", axis="y")
        plt.grid(True, which="minor", axis="x")
        ax.tick_params("x", length=0, which="major")

        def val_to_perc(val):
            return 100 * val / total_target

        def perc_to_val(perc):
            return perc * total_target / 100

        # Secondary axis showing the same values as a percentage of the target.
        sec_ax = ax.secondary_yaxis("right", functions=(val_to_perc, perc_to_val))
        sec_ax.set_ylabel("# Teilnahmen [% Erfolg]")
        sec_ax.yaxis.set_major_formatter(mtick.PercentFormatter())

        plt.axhline(y=total_target, color="#48a9be", linestyle="--")
        plt.tight_layout()

        # Convert the plot to a PNG image held in memory.
        png_buffer = io.BytesIO()
        FigureCanvas(fig).print_png(png_buffer)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each call would leak one figure.
        plt.close(fig)

    # Encode the PNG image as a base64 data URI.
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf8")
    return "data:image/png;base64," + encoded
@app.route("/")
@cache.cached(timeout=50)
def tables(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
):
    """Serve the index page with both result tables and the history plot.

    The data frames returned by ``get_tables(url)`` are sorted, rendered as
    HTML tables with an appended totals footer, and passed to the
    ``base.html`` template together with a timestamp and the image produced
    by ``plot``. The response is cached through ``cache.cached(timeout=50)``.

    Args:
        url: Address the survey data is fetched from.

    Returns:
        The rendered ``base.html`` template.
    """
    df, df_state = get_tables(url)
    df = df.sort_values(
        ["Digitale Befragung", "Bundesland", "Bezirk"], ascending=[False, True, True]
    )
    df_state = df_state.sort_values("Landesbezirk")

    def _table_with_total(frame: pd.DataFrame) -> str:
        """Return *frame* as an HTML table with a "Gesamt" footer row."""
        frame = frame.astype({"Digitale Befragung": "Int32"})
        with pd.option_context("display.max_rows", None):
            table = frame.to_html(
                index_names=False,
                justify="left",
                index=False,
                classes="sortable dataframe",
            )
        # Footer row: label in the first cell, the column sum in the last
        # cell, empty cells in between.
        # NOTE(review): assumes "Digitale Befragung" is the last column.
        footer = "\n".join(
            [
                " <tfoot>",
                " <tr>",
                " <td>Gesamt</td>",
                *[" <td/>"] * (len(frame.columns) - 2),
                f" <td>{frame['Digitale Befragung'].sum()}</td>",
                " </tr>",
                " </tfoot>",
            ]
        )
        # Splice the footer in just before the closing tag; "- 1" drops the
        # character preceding "</table>" so joining does not double it.
        closing = table.index("</table>")
        return "\n".join([table[: closing - 1], footer, table[closing:]])

    rendered_tables = [_table_with_total(df_state), _table_with_total(df)]
    image = plot(df_state, plot_all=True)
    return render_template(
        "base.html",
        tables="\n".join(rendered_tables),
        timestamp=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        image=image,
    )
# Script entry point: start the Flask application when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    app.run()