# 2023-08-27 23:21:56 +02:00
#
# 223 lines
# 6.2 KiB
# Python

import datetime
import io
from itertools import chain
from pathlib import Path
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
from flask import Flask, render_template, request
from flask_caching import Cache
from download_digital import construct_dataframe, get_bez_data
import locale

# Settings handed to Flask-Caching: responses are cached on the file system
# in the "cache" directory.
config = {
    "CACHE_TYPE": "FileSystemCache",
    "CACHE_DEFAULT_TIMEOUT": 300,
    "CACHE_THRESHOLD": 50,
    "CACHE_DIR": "cache",
}

# Switch the whole process to the German locale. Presumably this is what makes
# strftime-style output (such as the "%a" weekday labels on the plot's x axis)
# come out in German. Raises locale.Error if the locale is not installed.
locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")

# Application object and the response cache bound to it.
app = Flask(__name__)
app.config.from_mapping(config)
cache = Cache(app)
def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Download the survey data from *url* and build the two result tables.

    Args:
        url: Address passed on to ``get_bez_data``.

    Returns:
        A ``(df, df_state)`` pair. The first frame is built from the
        ``"bez_data_0"`` payload with ``special_tag="stud"``; the second is
        built from the ``"bez_data_2"`` payload with ``no_processing=True``.
        Neither frame is grouped.
    """
    payloads = get_bez_data(["bez_data_0", "bez_data_2"], url)
    district_table = construct_dataframe(
        bez_data=payloads[0], grouped=False, special_tag="stud"
    )
    state_table = construct_dataframe(
        bez_data=payloads[1], grouped=False, no_processing=True
    )
    return district_table, state_table
def plot(
    current_df: pd.DataFrame | None = None,
    data_folder: str = "data",
    sheet_name: str = "digital",
    total_target: int = 1500,
    plot_all: bool = False,
    alpha: float | None = None,
) -> str:
    """Render the participation history as an SVG document.

    One data point per stored Excel file is computed by summing its
    "Digitale Befragung" column; the first ten characters of the file name
    are used as the date (``YYYY-MM-DD``). If *current_df* is given, its sum
    is added as a final point at the current time and labelled "Jetzt".

    Args:
        current_df: Optional live table with a "Digitale Befragung" column.
        data_folder: Directory containing the stored Excel snapshots.
        sheet_name: Sheet to read from every snapshot.
        total_target: Participation target; drawn as a horizontal line and
            used as the 100 % mark of the secondary axis.
        plot_all: Currently unused; kept so existing callers keep working.
        alpha: If not ``None``, the area under the curve is filled with this
            opacity.

    Returns:
        The figure serialized as SVG text.

    Raises:
        OSError: If *data_folder* or one of its entries cannot be read.
    """
    column = "Digitale Befragung"
    line_color = "#e4004e"

    data_dict = {}
    # Important: if several results are stored for the same date, the last
    # one is used. This relies on the Landesbezirk data being stored under a
    # file name that is lexicographically larger than the single-district
    # results.
    for f in sorted(Path(data_folder).iterdir()):
        with f.open("rb") as ff:
            snapshot = pd.read_excel(ff, sheet_name=sheet_name)
        snapshot = snapshot.astype({column: "Int32"})
        data_dict[f.name[:10]] = snapshot[column].sum()
    # Hard-coded value for the first day; overrides any file for that date.
    data_dict["2023-08-15"] = 275

    last_day = max(data_dict)
    series = pd.Series(data_dict)
    # Place every daily value at 10:00 of its day.
    series.index = series.index.astype("datetime64[ns]") + pd.DateOffset(hours=10)
    history = series.to_frame(column).reindex(
        pd.date_range(start="2023-08-15", end=last_day) + pd.DateOffset(hours=10)
    )
    if current_df is not None:
        current_df = current_df.astype({column: "Int32"})
        history.loc[datetime.datetime.now()] = current_df[column].sum()

    # Days without a stored value are NaN after the reindex; only the
    # observed points get markers, while the solid line is drawn over the
    # full index.
    observed = history.dropna()

    imgdata = io.StringIO()
    fig = plt.figure(dpi=300)
    try:
        ax = fig.gca()

        # Shade weekends (Saturday and Sunday) in the background.
        shade_end = datetime.datetime.strptime(
            last_day, "%Y-%m-%d"
        ) + datetime.timedelta(days=1)
        days = pd.date_range(start="2023-08-14", end=shade_end)
        for day, next_day in zip(days[:-1], days[1:]):
            if day.weekday() >= 5:
                ax.axvspan(day, next_day, alpha=0.2, color="gray")

        if alpha is not None:
            ax.fill_between(
                observed.index,
                observed[column],
                color=line_color,
                alpha=alpha,
            )
        ax.plot(
            observed.index,
            observed[column],
            ls="--",
            marker="o",
            lw=1,
            color=line_color,
            markersize=4,
        )
        if current_df is not None:
            # The live value was appended last, so it is the final observed
            # row. Use positional access explicitly: integer keys on a
            # datetime-indexed Series are otherwise treated as labels.
            ax.annotate(
                "Jetzt",
                (observed.index[-1], observed[column].iloc[-1] * 1.03),
                fontsize=8,
                ha="center",
            )
        ax.plot(history.index, history[column], lw=1.5, color=line_color)

        ax.set_title("Teilnahme an Digitaler Beschäftigtenbefragung")
        ax.set_ylabel("# Teilnahmen")
        ax.set_ylim(0, total_target + 100)

        # Use a timezone offset to center the tick labels.
        ax.xaxis.set_major_locator(
            mdates.WeekdayLocator([mdates.TU], tz="Etc/GMT+12")
        )
        ax.xaxis.set_minor_locator(mdates.DayLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%a %d.%m."))
        ax.grid(True, which="major", axis="y")
        ax.grid(True, which="minor", axis="x")
        ax.tick_params("x", length=0, which="major")

        def val_to_perc(val):
            return 100 * val / total_target

        def perc_to_val(perc):
            return perc * total_target / 100

        # Secondary axis showing the same values as a percentage of the target.
        sec_ax = ax.secondary_yaxis("right", functions=(val_to_perc, perc_to_val))
        sec_ax.set_ylabel("# Teilnahmen [% Erfolg]")
        sec_ax.yaxis.set_major_formatter(mtick.PercentFormatter())

        ax.axhline(y=total_target, color="#48a9be", linestyle="--")
        fig.tight_layout()

        # Serialize the figure as SVG into the in-memory text buffer.
        fig.savefig(imgdata, format="svg")
    finally:
        # pyplot keeps a reference to every figure it creates; release it so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
    return imgdata.getvalue()
@app.route("/")
@cache.cached()
def tables(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
):
    """Render the overview page with both result tables and the history plot.

    The state table is sorted by "Landesbezirk"; the district table is sorted
    by descending "Digitale Befragung", then by "Bundesland" and "Bezirk".
    Each table gets a footer row with the column total. The response is
    cached, so the embedded timestamp is the time the page was rendered.

    Args:
        url: Source address handed to ``get_tables``.

    Returns:
        The rendered "base.html" template.
    """
    df, df_state = get_tables(url)
    df = df.sort_values(
        ["Digitale Befragung", "Bundesland", "Bezirk"], ascending=[False, True, True]
    )
    df_state = df_state.sort_values("Landesbezirk")

    def _as_html_with_total(frame: pd.DataFrame) -> str:
        """Return *frame* as an HTML table with a "Gesamt" footer row."""
        frame = frame.astype({"Digitale Befragung": "Int32"})
        with pd.option_context("display.max_rows", None):
            table = frame.to_html(
                index_names=False,
                justify="left",
                index=False,
                classes="sortable dataframe",
            )
        # Footer layout: label in the first cell, total in the last cell and
        # empty cells in between.
        # NOTE(review): assumes "Digitale Befragung" is the last column.
        padding = [" <td></td>"] * (len(frame.columns) - 2)
        tfoot = "\n".join(
            [
                " <tfoot>",
                " <tr>",
                " <td>Gesamt</td>",
                *padding,
                f" <td>{frame['Digitale Befragung'].sum()}</td>",
                " </tr>",
                " </tfoot>",
            ]
        )
        # Splice the footer in just before the closing tag; ``idx - 1`` drops
        # the newline that precedes "</table>".
        idx = table.index("</table>")
        return "\n".join([table[: idx - 1], tfoot, table[idx:]])

    html_tables = [_as_html_with_total(df_state), _as_html_with_total(df)]
    image = plot(df_state, plot_all=True)
    return render_template(
        "base.html",
        tables="\n".join(html_tables),
        timestamp=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        image=image,
    )
# Start the Flask application when this file is executed as a script.
if __name__ == "__main__":
    app.run()