tvstud-download-digital/wsgi.py

import base64
import datetime
import io
from itertools import chain
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
from flask import Flask, render_template, request
from flask_caching import Cache
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

from download_digital import construct_dataframe, get_bez_data

# Settings handed to Flask-Caching through ``app.config`` below: cached
# responses are stored on the file system in the ``cache`` directory.
config = {
    "CACHE_TYPE": "FileSystemCache",
    "CACHE_DEFAULT_TIMEOUT": 300,
    "CACHE_THRESHOLD": 1000,
    "CACHE_DIR": "cache",
}
import locale

# Switch the whole process to the German locale at import time.
# NOTE(review): presumably needed so that the "%a" weekday abbreviations on
# the plot's date axis are rendered in German -- confirm.
# ``setlocale`` raises ``locale.Error`` when "de_DE.UTF-8" is not installed on
# the host, in which case importing this module fails.
locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")
# The cache must be created after the config mapping has been applied to the
# app, because ``Cache(app)`` is initialised from the app it is given.
app = Flask(__name__)
app.config.from_mapping(config)
cache = Cache(app)


def get_tables(url: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Download the survey data from *url* and build the two result tables.

    The ``bez_data_0`` and ``bez_data_2`` payloads are requested in a single
    ``get_bez_data`` call. The first payload is turned into an ungrouped
    frame tagged with ``"stud"``; the second one is turned into an ungrouped
    frame with processing disabled.

    Returns:
        ``(df, df_state)`` -- the frame built from ``bez_data_0`` followed by
        the frame built from ``bez_data_2``.
    """
    payloads = get_bez_data(["bez_data_0", "bez_data_2"], url)

    district_options = {"grouped": False, "special_tag": "stud"}
    state_options = {"grouped": False, "no_processing": True}

    districts = construct_dataframe(bez_data=payloads[0], **district_options)
    states = construct_dataframe(bez_data=payloads[1], **state_options)

    return districts, states


def plot(
    current_df: pd.DataFrame | None = None,
    data_folder: str = "data",
    sheet_name: str = "digital",
    total_target: int = 1500,
    plot_all: bool = False,
    alpha: float | None = None,
) -> str:
    """Render the participation history as a base64-encoded PNG data URI.

    One data point per day is read from the Excel files in *data_folder*
    (the first ten characters of each file name are used as the ISO date of
    that snapshot) and plotted against *total_target*.

    Args:
        current_df: Optional live table. When given, the sum of its
            "Digitale Befragung" column is appended as a data point at the
            current time and labelled "Jetzt".
        data_folder: Directory containing the stored Excel snapshots.
        sheet_name: Sheet to read from every snapshot file.
        total_target: Participation target; drawn as a horizontal line and
            used as the 100 % mark of the secondary axis.
        plot_all: Accepted for backward compatibility; currently unused.
        alpha: When not ``None``, the area below the curve is filled with
            this opacity.

    Returns:
        A ``data:image/png;base64,...`` string that can be used directly as
        the ``src`` of an ``<img>`` element.

    Raises:
        FileNotFoundError: If *data_folder* does not exist.
    """
    column = "Digitale Befragung"
    data_dict = {}

    # Important: If multiple results are stored for the same date the last
    #            one is used. This relies on the Landesbezirk data being
    #            stored with a filename that is lexicographically larger than
    #            the single district results.
    for f in sorted(Path(data_folder).iterdir()):
        with f.open("rb") as ff:
            snapshot = pd.read_excel(ff, sheet_name=sheet_name)
        data_dict[f.name[:10]] = snapshot[column].astype("Int32").sum()

    # Fixed value for the first day of the plotted range.
    data_dict["2023-08-15"] = 275

    last_snapshot_day = max(data_dict)
    # Every daily value is placed at 10:00 of its day.
    time_of_day = pd.DateOffset(hours=10)

    series = pd.Series(
        list(data_dict.values()),
        index=pd.to_datetime(list(data_dict)) + time_of_day,
    )
    df = series.to_frame(column)
    # One row per calendar day; days without a snapshot become NaN.
    df = df.reindex(
        pd.date_range(start="2023-08-15", end=last_snapshot_day) + time_of_day
    )

    if current_df is not None:
        df.loc[datetime.datetime.now()] = current_df[column].astype("Int32").sum()

    observed = df.dropna()

    # Work on explicit figure/axes objects rather than pyplot's implicit
    # "current figure", and always close the figure afterwards: pyplot keeps
    # every figure alive until it is closed, which leaks memory in a
    # long-running server process.
    fig = plt.figure(dpi=300)
    try:
        ax = fig.add_subplot()

        # fill weekends
        shade_end = datetime.datetime.strptime(
            last_snapshot_day, "%Y-%m-%d"
        ) + datetime.timedelta(days=1)
        days = pd.date_range(start="2023-08-14", end=shade_end)
        for day_start, day_end in zip(days[:-1], days[1:]):
            if day_start.weekday() >= 5:
                ax.axvspan(day_start, day_end, alpha=0.2, color="gray")

        if alpha is not None:
            ax.fill_between(
                observed.index,
                observed[column],
                color="#e4004e",
                alpha=alpha,
            )

        # Dashed line with markers through all known values; it also bridges
        # days for which no snapshot exists.
        ax.plot(
            observed.index,
            observed[column],
            ls="--",
            marker="o",
            lw=1,
            color="#e4004e",
            markersize=4,
        )

        if current_df is not None:
            # Positional access must go through ``.iloc``: plain ``[-1]`` on a
            # datetime-indexed Series is a (deprecated) label lookup fallback.
            ax.annotate(
                "Jetzt",
                (observed.index[-1], observed[column].iloc[-1] * 1.03),
                fontsize=8,
                ha="center",
            )

        # Solid line on top; NaN rows interrupt it, so it only connects
        # consecutive days that both have data.
        ax.plot(df.index, df[column], lw=1.5, color="#e4004e")

        ax.set_title("Teilnahme an Digitaler Beschäftigtenbefragung")
        ax.set_ylabel("# Teilnahmen")
        ax.set_ylim(0, total_target + 100)

        # use timezone offset to center tick labels
        ax.xaxis.set_major_locator(
            mdates.WeekdayLocator([mdates.TU], tz="Etc/GMT+12")
        )
        ax.xaxis.set_minor_locator(mdates.DayLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%a %d.%m."))

        ax.grid(True, which="major", axis="y")
        ax.grid(True, which="minor", axis="x")

        ax.tick_params("x", length=0, which="major")

        def val_to_perc(val):
            return 100 * val / total_target

        def perc_to_val(perc):
            return perc * total_target / 100

        # Secondary axis showing the same values as a percentage of the target.
        sec_ax = ax.secondary_yaxis("right", functions=(val_to_perc, perc_to_val))
        sec_ax.set_ylabel("# Teilnahmen [% Erfolg]")
        sec_ax.yaxis.set_major_formatter(mtick.PercentFormatter())

        ax.axhline(y=total_target, color="#48a9be", linestyle="--")
        fig.tight_layout()

        # Convert plot to PNG image
        png_buffer = io.BytesIO()
        FigureCanvas(fig).print_png(png_buffer)
    finally:
        plt.close(fig)

    # Encode PNG image to base64 string
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf8")
    return "data:image/png;base64," + encoded


def _table_with_total_row(df: pd.DataFrame) -> str:
    """Render *df* as an HTML table with a "Gesamt" (total) footer row.

    The footer holds the label in the first cell, the sum of the
    "Digitale Befragung" column in the last cell and empty cells in between,
    so it lines up when "Digitale Befragung" is the last column.

    Args:
        df: Table containing a "Digitale Befragung" column whose values can
            be cast to nullable 32-bit integers.

    Returns:
        The HTML of the table with a ``<tfoot>`` section inserted directly
        before the closing ``</table>`` tag.
    """
    df = df.astype({"Digitale Befragung": "Int32"})
    with pd.option_context("display.max_rows", None):
        table = df.to_html(
            index_names=False,
            justify="left",
            index=False,
            classes="sortable dataframe",
        )

    footer_cells = ["      <td>Gesamt</td>"]
    footer_cells.extend(["      <td></td>"] * (len(df.columns) - 2))
    footer_cells.append(f"      <td>{df['Digitale Befragung'].sum()}</td>")

    # The cells are wrapped in an explicit <tr> so that the closing </tr> has
    # a matching opening tag and the footer is well-formed HTML.
    tfoot = "\n".join(
        ["  <tfoot>", "    <tr>", *footer_cells, "    </tr>", "  </tfoot>"]
    )

    idx = table.index("</table>")
    return "\n".join([table[:idx].rstrip("\n"), tfoot, table[idx:]])


@app.route("/")
@cache.cached(timeout=50)
def tables(
    url: str = "https://beschaeftigtenbefragung.verdi.de/",
):
    """Serve the overview page with both result tables and the history plot.

    The rendered response is cached for 50 seconds.

    Args:
        url: Address the survey data is fetched from.

    Returns:
        The rendered ``base.html`` template, which receives the two HTML
        tables (state table first), a timestamp of the rendering time and the
        plot as a base64 data URI.
    """
    df, df_state = get_tables(url)

    # Highest participation first; ties are ordered alphabetically.
    df = df.sort_values(
        ["Digitale Befragung", "Bundesland", "Bezirk"], ascending=[False, True, True]
    )
    df_state = df_state.sort_values("Landesbezirk")

    rendered_tables = [
        _table_with_total_row(df_state),
        _table_with_total_row(df),
    ]

    image = plot(df_state, plot_all=True)

    return render_template(
        "base.html",
        tables="\n".join(rendered_tables),
        timestamp=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        image=image,
    )


# Start Flask's built-in server only when this file is executed directly;
# importing the module (e.g. to obtain ``app``) does not start a server.
if __name__ == "__main__":
    app.run()