Compare commits

...

2 Commits

Author SHA1 Message Date
Felix Blanke
76d890980c Format 2023-09-07 13:11:32 +02:00
Felix Blanke
06c68c5167 Add regression script 2023-09-07 13:10:42 +02:00

115
regression.py Normal file
View File

@@ -0,0 +1,115 @@
import datetime
from pathlib import Path
import fire
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import scipy
from wsgi import create_fig, create_plot_df, plot
def create_dfs():
    """Load the newest survey spreadsheets and build the plotting frame.

    The lexicographically last entry of ``data/`` selects the data set: the
    first ten characters of its file name are used as the common prefix of
    the two spreadsheets that are read (presumably a ``YYYY-MM-DD`` date
    prefix -- confirm against the layout of the data directory).

    Returns:
        A tuple ``(df, df_state, plot_df)``: the "digital" sheet of
        ``<prefix>_data.ods``, the "digital" sheet of
        ``<prefix>_state_data.ods`` and the result of
        ``create_plot_df(None, None)``.
    """
    data_dir = Path("data")
    newest_entry = sorted(data_dir.iterdir())[-1]
    prefix = newest_entry.name[:10]

    def load_digital_sheet(file_name):
        # Both spreadsheets share sheet name, index column and dtype cast.
        with (data_dir / file_name).open("rb") as handle:
            sheet = pd.read_excel(handle, sheet_name="digital", index_col=0)
        return sheet.astype({"Digitale Befragung": "Int32"})

    df = load_digital_sheet(f"{prefix}_data.ods")
    df_state = load_digital_sheet(f"{prefix}_state_data.ods")
    plot_df = create_plot_df(None, None)

    return df, df_state, plot_df
def main():
    """Fit a linear trend to the participation counts and plot a projection.

    The function draws the base plot via ``plot``, fits a linear regression
    (participation over time) to the row sums of the plot frame, prints the
    fit quality, the projected participation at the target date and at the
    current time, and the dates at which the targets 1500, 2500 and 3500
    would be reached. It then adds the regression line and a straight
    "target line" (from the latest observed value to 3500 at the target
    date) to the figure and writes it to ``plots/regression.png``.
    """
    # Only the plot frame is used here; the raw sheets are loaded as a side
    # effect of create_dfs() and discarded.
    _, _, plot_df = create_dfs()

    plot(plot_df, landesbez_str=[None], max_shading_date="2023-10-02")
    plt.gcf().set_size_inches(10, 5)

    target_time = pd.Timestamp("2023-10-01")
    final_target = 3500

    # Keep the left limit chosen by plot(); extend the right edge one day
    # past the target date and leave a little headroom above the target.
    left_limit, _ = plt.xlim()
    plt.xlim(left_limit, pd.Timestamp("2023-10-02"))
    plt.ylim(0, final_target * 1.025)

    # Total participation per time point; the first three rows are excluded
    # from the fit.
    data = plot_df.dropna().sum(axis=1)
    data = data.iloc[3:]

    # Regress on the integer representation of the timestamps.
    # NOTE(review): assumes the index and date_range below share the same
    # datetime resolution (nanoseconds) -- confirm for the pandas version used.
    casted_timepoints = data.index.to_numpy().astype(np.int64)
    reg = scipy.stats.linregress(casted_timepoints, data)
    r_squared = reg.rvalue**2
    print(f"Regression R^2: {r_squared:.6f}")

    # Daily grid from the start of the projection up to the target date, with
    # the exact target time appended as the final entry.
    date_range = pd.date_range(start="2023-08-21 10:00:00", end=target_time)
    date_range = date_range.to_series(index=np.arange(len(date_range)))
    date_range.loc[len(date_range)] = target_time

    def regression_curve(timestamps):
        """Evaluate the fitted line at an array of datetime64 values."""
        return reg.intercept + reg.slope * timestamps.astype(np.int64)

    vals = regression_curve(date_range.to_numpy())
    print(f"Projizierte Teilnahme am {target_time}: {vals[-1]:.2f}")

    now = pd.Timestamp.now()
    projected_now = regression_curve(pd.Series([now]).to_numpy()).item()
    print(f"Projizierte Teilnahme jetzt: {projected_now:.2f}")
    print()

    for target in [1500, 2500, final_target]:
        # Invert the fitted line to find when it crosses the target value.
        target_reached_date = (target - reg.intercept) / reg.slope
        reached = pd.Timestamp(target_reached_date).strftime("%Y-%m-%d %X")
        print(f"Ziel {target} erreicht am {reached}")

    num_skipped_days = 2
    x = date_range.to_numpy().astype(np.int64)
    # Position of the latest observation within `data`, shifted by
    # num_skipped_days and reused as a position in date_range.
    # NOTE(review): this assumes the rows of `data` and the entries of
    # date_range line up after the offset -- confirm against the data.
    line_start = data.index.argmax() + num_skipped_days
    curr_time = x[line_start]

    # Use .iloc for positional access: integer keys on a Series with a
    # non-integer index are deprecated as positional lookups in pandas.
    last_value = data.iloc[-1]
    delta = final_target - last_value
    # Straight line from the latest observed value to the final target.
    target_line = last_value + delta / (x[-1] - curr_time) * (
        x[line_start:] - curr_time
    )

    plt.plot(
        date_range,
        vals,
        label=f"Lineare Regression ($R^2={r_squared:.3f}$)",
        color="tab:blue",
        zorder=1,
    )
    plt.plot(
        date_range.iloc[line_start:],
        target_line,
        label="Ziellinie",
        color="tab:orange",
        linestyle=":",
        zorder=1,
    )

    plt.xlabel("Zeit in Tagen ab dem 15.08.")
    plt.axvline(x=target_time, color="tab:red", linestyle="--")
    plt.legend()

    # Show a single labelled tick at the target date.
    axis = plt.gca().xaxis
    axis.set_major_locator(mtick.NullLocator())
    axis.set_major_formatter(mdates.DateFormatter("%d.%m."))
    plt.gca().set_xticks([target_time])

    plt.title("Projektion Teilnahme an Digitaler Beschäftigtenbefragung")

    # Make sure the output directory exists before saving.
    output_path = Path("plots") / "regression.png"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path)
# Script entry point: hand `main` to python-fire when run directly.
if __name__ == "__main__":
    fire.Fire(main)