"""Fit a linear regression to survey participation counts and plot a projection.

Running this module as a script loads the newest data files from ``data/``,
draws the base plot via the project's ``wsgi.plot`` helper, overlays a linear
regression plus a straight "target line" towards the participation goal, and
writes the figure to ``plots/regression.png``.
"""
import datetime
from pathlib import Path

import fire
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # `import scipy` alone does not reliably expose `scipy.stats`

from wsgi import create_fig, create_plot_df, plot

# Date the projection is evaluated at (marked with a vertical line in the plot).
TARGET_TIME = pd.Timestamp("2023-10-01")
# Right edge of the plot; also handed to `plot` as `max_shading_date`.
PLOT_END_DATE = "2023-10-02"
# First timestamp of the daily grid on which the regression line is drawn.
REGRESSION_START = "2023-08-21 10:00:00"
# Participation count the target line ends at; also determines the y-axis limit.
PARTICIPATION_GOAL = 3500
# Participation counts for which the projected date of reaching them is printed.
MILESTONES = (1500, 2500, PARTICIPATION_GOAL)


def _read_digital_sheet(path):
    """Read the ``digital`` sheet of *path*, casting ``Digitale Befragung`` to Int32."""
    with path.open("rb") as handle:
        return pd.read_excel(handle, sheet_name="digital", index_col=0).astype(
            {"Digitale Befragung": "Int32"}
        )


def create_dfs():
    """Load the most recent data files and build the plotting frame.

    The lexicographically last entry in ``data/`` determines a 10-character
    key (the first 10 characters of its file name); ``<key>_data.ods`` and
    ``<key>_state_data.ods`` are then read from the same directory.

    Returns:
        A tuple ``(df, df_state, plot_df)`` where ``df`` and ``df_state`` are
        the two spreadsheets and ``plot_df`` is the result of
        ``create_plot_df(None, None)``.

    Raises:
        FileNotFoundError: If ``data/`` does not exist, is empty, or one of
            the two expected ``.ods`` files is missing.
    """
    data_dir = Path("data")
    entries = sorted(data_dir.iterdir())
    if not entries:
        raise FileNotFoundError(f"No data files found in {data_dir}")
    key = entries[-1].name[:10]

    df = _read_digital_sheet(data_dir / f"{key}_data.ods")
    df_state = _read_digital_sheet(data_dir / f"{key}_state_data.ods")

    # NOTE(review): create_plot_df is defined in wsgi; what the two None
    # arguments select is not visible from this file.
    plot_df = create_plot_df(None, None)

    return df, df_state, plot_df


def main():
    """Plot the participation data with a regression projection and save it.

    Prints the regression's R^2, the projected participation at
    ``TARGET_TIME`` and at the current time, and the projected dates at which
    each value in ``MILESTONES`` is reached. The figure is written to
    ``plots/regression.png`` (the directory is created if missing).
    """
    # The spreadsheets are still loaded by create_dfs, but only the plotting
    # frame is needed here.
    _, _, plot_df = create_dfs()

    plot(plot_df, landesbez_str=[None], max_shading_date=PLOT_END_DATE)

    plt.gcf().set_size_inches(10, 5)

    # Keep the current left limit and extend the x-axis to the plot end date.
    left_limit, _ = plt.xlim()
    plt.xlim(left_limit, pd.Timestamp(PLOT_END_DATE))
    plt.ylim(0, PARTICIPATION_GOAL * 1.025)

    # Sum across columns per row, then skip the first three rows before fitting.
    data = plot_df.dropna().sum(axis=1)
    data = data.iloc[3:]

    # Regress on the index converted to int64.
    # NOTE(review): assumes a datetime index, so that these are epoch
    # nanoseconds matching the grid below; verify against create_plot_df.
    casted_timepoints = data.index.to_numpy().astype(np.int64)
    reg = scipy.stats.linregress(casted_timepoints, data)
    r_squared = reg.rvalue**2

    print(f"Regression R^2: {r_squared:.6f}")

    # Daily grid from REGRESSION_START, with TARGET_TIME appended as last point.
    date_range = pd.date_range(start=REGRESSION_START, end=TARGET_TIME)
    date_range = date_range.to_series(index=np.arange(len(date_range)))
    date_range.loc[len(date_range)] = TARGET_TIME

    def regression_curve(timestamps):
        """Evaluate the fitted line at an array of datetime64 values."""
        return reg.intercept + reg.slope * timestamps.astype(np.int64)

    vals = regression_curve(date_range.to_numpy())
    print(f"Projizierte Teilnahme am {TARGET_TIME}: {vals[-1]:.2f}")
    now = pd.Timestamp.now()
    print(f"Projizierte Teilnahme jetzt: {regression_curve(pd.Series([now]).to_numpy()).item():.2f}")

    print()

    for target in MILESTONES:
        # Invert the fitted line: the time value at which it reaches `target`.
        target_reached_date = (target - reg.intercept) / reg.slope
        print(f"Ziel {target} erreicht am {pd.Timestamp(target_reached_date).strftime('%Y-%m-%d %X')}")

    num_skipped_days = 2

    x = date_range.to_numpy().astype(np.int64)
    # NOTE(review): a position in `data` is used as a position in the daily
    # grid here; this presumably relies on `data` having one row per day.
    start = data.index.argmax() + num_skipped_days
    curr_time = x[start]

    # `.iloc` makes the positional access explicit; plain `data[-1]` on a
    # non-integer index relies on a deprecated pandas fallback.
    latest = data.iloc[-1]
    delta = PARTICIPATION_GOAL - latest

    # Straight line from the latest value at `curr_time` up to the goal at
    # the last grid point.
    target_line = latest + delta / (x[-1] - curr_time) * (x[start:] - curr_time)

    plt.plot(
        date_range,
        vals,
        label=f"Lineare Regression ($R^2={r_squared:.3f}$)",
        color="tab:blue",
        zorder=1,
    )
    plt.plot(
        date_range.iloc[start:],
        target_line,
        label="Ziellinie",
        color="tab:orange",
        linestyle=":",
        zorder=1,
    )
    plt.xlabel("Zeit in Tagen ab dem 15.08.")
    plt.axvline(x=TARGET_TIME, color="tab:red", linestyle="--")
    plt.legend()

    axis = plt.gca().xaxis
    axis.set_major_locator(mtick.NullLocator())
    axis.set_major_formatter(mdates.DateFormatter("%d.%m."))

    # Show a single tick at the target date.
    plt.gca().set_xticks([TARGET_TIME])
    plt.title("Projektion Teilnahme an Digitaler Beschäftigtenbefragung")

    output_path = Path("plots") / "regression.png"
    # savefig does not create missing directories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path)


if __name__ == "__main__":
    fire.Fire(main)