Add regression script

This commit is contained in:
Felix Blanke 2023-09-07 13:10:42 +02:00
parent 387b976f81
commit 06c68c5167

99
regression.py Normal file
View File

@@ -0,0 +1,99 @@
import datetime
from pathlib import Path
import fire
import matplotlib
import matplotlib.pyplot as plt
import scipy
import numpy as np
import pandas as pd
from wsgi import create_fig, create_plot_df, plot
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
def create_dfs():
    """Load the newest survey tables from ``data/`` and build the plot frame.

    The entries of ``data/`` are sorted by path and the first ten characters
    of the last entry's file name are used as a common prefix (presumably an
    ISO date such as ``2023-09-07`` -- verify against the data directory).
    The sheet ``"digital"`` is then read from ``<prefix>_data.ods`` and
    ``<prefix>_state_data.ods``, with the column ``"Digitale Befragung"``
    cast to the nullable ``Int32`` dtype.

    Returns:
        A tuple ``(df, df_state, plot_df)`` where ``plot_df`` is whatever
        ``create_plot_df(None, None)`` returns.

    Raises:
        IndexError: if ``data/`` contains no entries.
    """
    data_dir = Path("data")
    newest_entry = sorted(data_dir.iterdir())[-1]
    prefix = newest_entry.name[:10]

    frames = []
    for suffix in ("data", "state_data"):
        with (data_dir / f"{prefix}_{suffix}.ods").open("rb") as handle:
            sheet = pd.read_excel(handle, sheet_name="digital", index_col=0)
        # The sheet is fully in memory here, so the cast can happen after
        # the file handle has been closed.
        frames.append(sheet.astype({"Digitale Befragung": "Int32"}))
    df, df_state = frames

    plot_df = create_plot_df(None, None)
    return df, df_state, plot_df
def main():
    """Fit a linear trend to the participation totals and save a projection plot.

    Steps:
      1. Draw the base figure via ``plot`` and fix the axis limits.
      2. Sum ``plot_df`` across columns (rows containing NaN dropped), skip
         the first three rows and run a linear regression of the totals
         against the index cast to ``int64`` (presumably nanoseconds since
         the epoch -- assumes a ``DatetimeIndex``, TODO confirm).
      3. Print R^2, the projected value at ``target_time`` and at the
         current time, and the projected dates at which 1500, 2500 and 3500
         are reached.
      4. Overlay the regression line and a straight "target line" from the
         last observed total to 3500 at ``target_time``.
      5. Save the figure to ``plots/regression.png`` (the directory is
         created if missing).

    Side effects: prints to stdout, mutates the current matplotlib figure and
    writes the PNG file.
    """
    _, _, plot_df = create_dfs()

    final_target = 3500
    target_time = pd.Timestamp("2023-10-01")

    plot(plot_df, landesbez_str=[None], max_shading_date="2023-10-02")
    plt.gcf().set_size_inches(10, 5)
    xlim = plt.xlim()
    plt.xlim(xlim[0], pd.Timestamp("2023-10-02"))
    # Leave 2.5 % head room above the final target.
    plt.ylim(0, final_target * 1.025)

    # Row-wise total over all columns; the first three rows are excluded
    # from the fit.
    data = plot_df.dropna().sum(1)
    data = data.iloc[3:]

    casted_timepoints = data.index.to_numpy().astype(np.int64)
    reg = scipy.stats.linregress(casted_timepoints, data)
    print(f"Regression R^2: {reg.rvalue**2:.6f}")

    # One point per day starting 2023-08-21 10:00, plus target_time itself
    # appended as the final point.
    date_range = pd.date_range(start="2023-08-21 10:00:00", end=target_time)
    date_range = date_range.to_series(index=np.arange(len(date_range)))
    date_range.loc[len(date_range)] = target_time

    def regression_curve(x):
        """Evaluate the fitted line at the datetime64 array ``x``."""
        return reg.intercept + reg.slope * x.astype(np.int64)

    vals = regression_curve(date_range.to_numpy())
    print(f"Projizierte Teilnahme am {target_time}: {vals[-1]:.2f}")
    now = pd.Timestamp.now()
    print(f"Projizierte Teilnahme jetzt: {regression_curve(pd.Series([now]).to_numpy()).item():.2f}")
    print()

    for target in [1500, 2500, 3500]:
        # Invert the fitted line to find when the total hits ``target``.
        target_reached_date = (target - reg.intercept) / reg.slope
        print(f"Ziel {target} erreicht am {pd.Timestamp(target_reached_date).strftime('%Y-%m-%d %X')}")

    num_skipped_days = 2
    x = date_range.to_numpy().astype(np.int64)
    # NOTE(review): the position of the latest observation in ``data`` is
    # reused as a position in ``date_range``; ``num_skipped_days`` appears to
    # compensate for the offset between the two -- confirm when the data
    # start date changes.
    line_start = data.index.argmax() + num_skipped_days
    curr_time = x[line_start]
    # Positional access must go through .iloc: integer keys on a
    # non-integer index are deprecated as positions in pandas >= 2.1.
    last_value = data.iloc[-1]
    delta = final_target - last_value
    target_line = last_value + delta / (x[-1] - curr_time) * (x[line_start:] - curr_time)

    plt.plot(date_range, vals, label=f"Lineare Regression ($R^2={reg.rvalue**2:.3f}$)", color="tab:blue", zorder=1)
    plt.plot(date_range.iloc[line_start:], target_line, label="Ziellinie", color="tab:orange", linestyle=":", zorder=1)

    plt.xlabel("Zeit in Tagen ab dem 15.08.")
    plt.axvline(x=target_time, color="tab:red", linestyle="--")
    plt.legend()

    # Show a single tick at the target date, formatted as day.month.
    x_axis = plt.gca().xaxis
    x_axis.set_major_locator(matplotlib.ticker.NullLocator())
    x_axis.set_major_formatter(mdates.DateFormatter("%d.%m."))
    plt.gca().set_xticks([target_time])

    plt.title("Projektion Teilnahme an Digitaler Beschäftigtenbefragung")

    # savefig does not create missing directories.
    Path("plots").mkdir(parents=True, exist_ok=True)
    plt.savefig("plots/regression.png")
# Script entry point: expose ``main`` as a command-line interface via fire.
if __name__ == "__main__":
    fire.Fire(main)