From dc806712953f49bde06aaef9586e3f21ccc5cca9 Mon Sep 17 00:00:00 2001
From: Felix Blanke <felixblanke@uni-bonn.de>
Date: Mon, 28 Aug 2023 16:56:01 +0200
Subject: [PATCH] Group by Landesbezirk

---
 download_digital.py |  4 ++++
 wsgi.py             | 24 +++++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/download_digital.py b/download_digital.py
index ca3035c..c8119da 100644
--- a/download_digital.py
+++ b/download_digital.py
@@ -85,6 +85,10 @@ landesbezirk_dict = {
 }
 
 
+def get_landesbezirk(id: str):
+    return landesbezirk_dict[str((int(id) // 100) * 100)]  # key: id floored to hundreds
+
+
 def get_bez_data(
     tags: list[str], url: str = "https://beschaeftigtenbefragung.verdi.de/"
 ) -> list[dict]:
diff --git a/wsgi.py b/wsgi.py
index 3786245..8e6ed3f 100644
--- a/wsgi.py
+++ b/wsgi.py
@@ -11,7 +11,7 @@ import pandas as pd
 from flask import Flask, Markup, render_template, request
 from flask_caching import Cache
 
-from download_digital import construct_dataframe, get_bez_data
+from download_digital import construct_dataframe, get_bez_data, get_landesbezirk
 
 config = {
     "CACHE_TYPE": "FileSystemCache",
@@ -52,26 +52,32 @@ def create_plot_df(
     for f in sorted(Path(data_folder).iterdir()):
         with f.open("rb") as ff:
             df = pd.read_excel(ff, sheet_name=sheet_name, index_col=0)
+
+        if "Landesbezirk" not in df.columns:
+            df["Landesbezirk"] = df.index.map(get_landesbezirk)
+
         df = df.astype({"Digitale Befragung": "Int32"})
-        sum_val = df[["Digitale Befragung"]].sum().iloc[0]
+        df = df.groupby("Landesbezirk")[["Digitale Befragung"]].sum()
+
         key = f.name[:10]
-        data_dict[key] = sum_val
+        data_dict[key] = df["Digitale Befragung"]
 
-    data_dict["2023-08-15"] = 275
+    df = pd.DataFrame(data=data_dict).T
 
-    series = pd.Series(data_dict.values(), index=data_dict)
-    series.index = series.index.astype("datetime64[ns]") + pd.DateOffset(hours=10)
+    df.index = df.index.astype("datetime64[ns]") + pd.DateOffset(hours=10)
 
-    df = series.to_frame("Digitale Befragung")
     df = df.reindex(
         pd.date_range(start="2023-08-15", end=curr_datetime)
         + pd.DateOffset(hours=10)
     )
 
     if current_df is not None:
+        if "Landesbezirk" not in current_df.columns:
+            current_df["Landesbezirk"] = current_df.index.map(get_landesbezirk)
         current_df = current_df.astype({"Digitale Befragung": "Int32"})
-        sum_val = current_df[["Digitale Befragung"]].sum().iloc[0]
-        df.loc[curr_datetime] = sum_val
+        current_df = current_df.groupby("Landesbezirk")[["Digitale Befragung"]].sum()
+
+        df.loc[curr_datetime] = current_df["Digitale Befragung"]
 
         if pd.isna(df.loc[df.index.max()][0]):
             df = df.drop([df.index.max()])