import datetime
from pathlib import Path

import fire
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # explicit: do not rely on lazy submodule loading

from wsgi import create_fig, create_plot_df, get_tables, plot


def _to_epoch_ns(timestamps):
    """Return datetime-like *timestamps* as int64 nanoseconds since the epoch.

    The resolution is pinned to nanoseconds so the integers agree with
    ``pd.Timestamp(<number>)``, which interprets plain numbers as nanoseconds.
    """
    return np.asarray(timestamps).astype("datetime64[ns]").astype(np.int64)


def _read_snapshot(path):
    """Read the "digital" sheet of the spreadsheet at *path*.

    The first column becomes the index and the "Digitale Befragung" column is
    cast to the nullable ``Int32`` dtype.
    """
    with path.open("rb") as ff:
        return pd.read_excel(ff, sheet_name="digital", index_col=0).astype(
            {"Digitale Befragung": "Int32"}
        )


def create_dfs(url: str = "https://beschaeftigtenbefragung.verdi.de/"):
    """Load the survey tables, falling back to the newest local snapshot.

    The tables are first requested via ``get_tables(url)`` and sorted. If
    anything in that path raises, the exception is printed and the tables are
    read from the ``data`` directory instead.

    Args:
        url: Address passed to ``get_tables``.

    Returns:
        A tuple ``(df, df_state, plot_df)``.
    """
    try:
        df, df_state, curr_datetime = get_tables(url)
        df = df.sort_values(
            ["Digitale Befragung", "Bundesland", "Bezirk"],
            ascending=[False, True, True],
        )
        df_state = df_state.sort_values("Landesbezirk")
        plot_df = create_plot_df(curr_datetime, df_state)
    except Exception as e:
        # Deliberate best-effort fallback: report the failure and continue
        # with the snapshot whose file name sorts last in ``data``.
        print(e)
        data_dir = Path("data")
        last_file = sorted(data_dir.iterdir())[-1]
        # The first 10 characters of the file name select the snapshot
        # (presumably an ISO date prefix such as "2023-09-01" -- confirm).
        key = last_file.name[:10]
        df = _read_snapshot(data_dir / f"{key}_data.ods")
        df_state = _read_snapshot(data_dir / f"{key}_state_data.ods")
        # NOTE(review): what create_plot_df does with (None, None) is defined
        # in wsgi; presumably it rebuilds the frame from stored data -- confirm.
        plot_df = create_plot_df(None, None)
    return df, df_state, plot_df


def main():
    """Plot participation with a linear projection and save it as a PNG.

    Fits a linear regression to the summed participation over time, prints
    the projected participation and the dates at which selected targets would
    be reached, draws the regression line and a straight "target line" up to
    the final target, and writes the figure to ``plots/regression.png``.
    """
    final_target = 3500
    target_time = pd.Timestamp("2023-10-01")

    _, _, plot_df = create_dfs()
    plot(plot_df, landesbez_str=[None], max_shading_date="2023-10-02")
    plt.gcf().set_size_inches(10, 5)

    xlim = plt.xlim()
    plt.xlim(xlim[0], pd.Timestamp("2023-10-02"))
    plt.ylim(0, final_target * 1.025)

    # Sum over all columns per time point; the first three complete rows are
    # excluded from the fit.
    data = plot_df.dropna().sum(axis=1)
    data = data.iloc[3:]

    casted_timepoints = _to_epoch_ns(data.index.to_numpy())
    reg = scipy.stats.linregress(casted_timepoints, data)
    print(f"Regression R^2: {reg.rvalue**2:.6f}")

    # Daily time points from the start date, with target_time appended as the
    # final entry so the projection ends exactly there.
    date_range = pd.date_range(start="2023-08-21 10:00:00", end=target_time)
    date_range = date_range.to_series(index=np.arange(len(date_range)))
    date_range.loc[len(date_range)] = target_time

    def regression_curve(x):
        """Evaluate the fitted line at the datetime array *x*."""
        return reg.intercept + reg.slope * _to_epoch_ns(x)

    vals = regression_curve(date_range.to_numpy())
    print(f"Projizierte Teilnahme am {target_time}: {vals[-1]:.2f}")
    now = pd.Timestamp.now()
    print(
        f"Projizierte Teilnahme jetzt: {regression_curve(pd.Series([now]).to_numpy()).item():.2f}"
    )
    print()

    for target in [1500, 2500, 3500]:
        # Invert the fitted line: epoch nanoseconds at which `target` is hit.
        target_reached_date = (target - reg.intercept) / reg.slope
        print(
            f"Ziel {target} erreicht am {pd.Timestamp(target_reached_date).strftime('%Y-%m-%d %X')}"
        )

    num_skipped_days = 2
    x = _to_epoch_ns(date_range.to_numpy())
    # Position in date_range where the target line starts: the position of
    # the latest observation in `data`, shifted by num_skipped_days.
    start_pos = data.index.argmax() + num_skipped_days
    curr_time = x[start_pos]
    # .iloc makes the positional access explicit; plain data[-1] on a
    # datetime-indexed Series is deprecated integer-as-position indexing.
    last_value = data.iloc[-1]
    delta = final_target - last_value
    # Straight line from the last observed value to the final target.
    target_line = last_value + delta / (x[-1] - curr_time) * (
        x[start_pos:] - curr_time
    )

    plt.plot(
        date_range,
        vals,
        label=f"Lineare Regression ($R^2={reg.rvalue**2:.3f}$)",
        color="tab:green",
        zorder=1,
    )
    plt.plot(
        date_range.iloc[start_pos:],
        target_line,
        label="Ziellinie",
        color="tab:orange",
        linestyle=":",
        zorder=1,
    )
    # plt.gca().relim()  # make sure all the data fits
    # plt.gca().autoscale()  # auto-scale
    plt.xlabel("Zeit in Tagen ab dem 15.08.")
    plt.axvline(x=target_time, color="tab:red", linestyle="--")
    plt.legend()

    ax = plt.gca()
    ax.xaxis.set_major_locator(matplotlib.ticker.NullLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%d.%m."))
    # Only the target date is shown as a tick on the x axis.
    ax.set_xticks([target_time])
    plt.title("Projektion Teilnahme an Digitaler Beschäftigtenbefragung")

    # Make sure the output directory exists before writing the figure.
    Path("plots").mkdir(parents=True, exist_ok=True)
    plt.savefig("plots/regression.png", bbox_inches="tight", dpi=300)


if __name__ == "__main__":
    fire.Fire(main)