# tvstud-download-digital/regression.py
# 2023-09-19 19:46:01 +02:00

# 129 lines
# 3.9 KiB
# Python

import datetime
from pathlib import Path
import fire
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import scipy
from wsgi import create_fig, create_plot_df, get_tables, plot
def create_dfs(url: str = "https://beschaeftigtenbefragung.verdi.de/"):
    """Return ``(df, df_state, plot_df)`` for the digital survey.

    The tables are first requested through ``get_tables(url)`` and sorted.
    If anything in that step raises, the exception is printed and the two
    spreadsheets belonging to the last entry (in sorted order) of the local
    ``data`` directory are loaded instead.

    Args:
        url: Address handed to ``get_tables``.

    Returns:
        A tuple ``(df, df_state, plot_df)``; ``plot_df`` always comes from
        ``create_plot_df``.
    """
    try:
        districts, states, fetched_at = get_tables(url)
        districts = districts.sort_values(
            ["Digitale Befragung", "Bundesland", "Bezirk"],
            ascending=[False, True, True],
        )
        states = states.sort_values("Landesbezirk")
        plot_df = create_plot_df(fetched_at, states)
    except Exception as e:
        # Best-effort fallback: report the problem and use the cached files.
        print(e)
        data_dir = Path("data")
        newest_entry = sorted(data_dir.iterdir())[-1]
        # The first ten characters of the file name identify the snapshot
        # (presumably a YYYY-MM-DD prefix — verify against the writer).
        key = newest_entry.name[:10]

        cached = []
        for suffix in ("_data.ods", "_state_data.ods"):
            with (data_dir / f"{key}{suffix}").open("rb") as handle:
                cached.append(
                    pd.read_excel(
                        handle, sheet_name="digital", index_col=0
                    ).astype({"Digitale Befragung": "Int32"})
                )
        districts, states = cached
        plot_df = create_plot_df(None, None)

    return districts, states, plot_df
def main():
    """Plot survey participation with a linear projection and save the figure.

    Steps performed:

    * draw the base plot via ``plot`` and fix the axis limits,
    * fit a linear regression to the row sums of ``plot_df`` (rows containing
      NaN dropped, first three remaining rows skipped),
    * print R^2, the projected participation at the target date and now, and
      the dates at which 1500 / 2500 / 3500 participants would be reached,
    * overlay the regression line and a straight "target line" that ends at
      3500 on the target date,
    * write the figure to ``plots/regression.png``.
    """
    participation_goal = 3500

    def to_epoch_ns(timestamps):
        # Pin the resolution to nanoseconds before casting to integers so
        # that every x-value shares one unit, matching pd.Timestamp(<number>)
        # below, which interprets a bare number as nanoseconds.
        return timestamps.astype("datetime64[ns]").astype(np.int64)

    _, _, plot_df = create_dfs()
    plot(plot_df, landesbez_str=[None], max_shading_date="2023-10-02")
    plt.gcf().set_size_inches(10, 5)

    target_time = pd.Timestamp("2023-10-01")
    xlim = plt.xlim()
    plt.xlim(xlim[0], pd.Timestamp("2023-10-02"))
    plt.ylim(0, participation_goal * 1.025)

    # Total participation per time point; the first three rows are excluded
    # from the fit (reason not visible here — TODO confirm).
    data = plot_df.dropna().sum(axis=1)
    data = data.iloc[3:]

    casted_timepoints = to_epoch_ns(data.index.to_numpy())
    reg = scipy.stats.linregress(casted_timepoints, data)
    print(f"Regression R^2: {reg.rvalue**2:.6f}")

    # Daily grid from the start date up to the target date, with the exact
    # target time appended as the final point.
    date_range = pd.date_range(start="2023-08-21 10:00:00", end=target_time)
    date_range = date_range.to_series(index=np.arange(len(date_range)))
    date_range.loc[len(date_range)] = target_time

    def regression_curve(timestamps):
        """Evaluate the fitted line at an array of datetime64 values."""
        return reg.intercept + reg.slope * to_epoch_ns(timestamps)

    vals = regression_curve(date_range.to_numpy())
    print(f"Projizierte Teilnahme am {target_time}: {vals[-1]:.2f}")

    now = pd.Timestamp.now()
    projected_now = regression_curve(pd.Series([now]).to_numpy()).item()
    print(f"Projizierte Teilnahme jetzt: {projected_now:.2f}")
    print()

    for target in [1500, 2500, participation_goal]:
        # Invert the regression line: x (epoch ns) at which y == target.
        target_reached_date = (target - reg.intercept) / reg.slope
        reached = pd.Timestamp(target_reached_date).strftime("%Y-%m-%d %X")
        print(f"Ziel {target} erreicht am {reached}")

    num_skipped_days = 2
    x = to_epoch_ns(date_range.to_numpy())
    # Position in the date grid where the target line starts.
    # NOTE(review): this reuses a position within `data` as a position within
    # `date_range`, i.e. assumes both are aligned — confirm.
    start_pos = data.index.argmax() + num_skipped_days
    curr_time = x[start_pos]

    # Use .iloc for the last observation: plain data[-1] on a non-integer
    # index relies on deprecated positional fallback.
    latest_value = data.iloc[-1]
    delta = participation_goal - latest_value
    target_line = latest_value + delta / (x[-1] - curr_time) * (
        x[start_pos:] - curr_time
    )

    plt.plot(
        date_range,
        vals,
        label=f"Lineare Regression ($R^2={reg.rvalue**2:.3f}$)",
        color="tab:green",
        zorder=1,
    )
    plt.plot(
        date_range[start_pos:],
        target_line,
        label="Ziellinie",
        color="tab:orange",
        linestyle=":",
        zorder=1,
    )

    plt.xlabel("Zeit in Tagen ab dem 15.08.")
    plt.axvline(x=target_time, color="tab:red", linestyle="--")
    plt.legend()

    # Show a single x tick, at the target date.
    # NOTE(review): only the major locator is cleared; confirm whether minor
    # ticks were meant to be suppressed as well.
    x_axis = plt.gca().xaxis
    x_axis.set_major_locator(matplotlib.ticker.NullLocator())
    x_axis.set_major_formatter(mdates.DateFormatter("%d.%m."))
    plt.gca().set_xticks([target_time])

    plt.title("Projektion Teilnahme an Digitaler Beschäftigtenbefragung")
    plt.savefig("plots/regression.png", bbox_inches="tight", dpi=300)
# When executed as a script, hand `main` to fire.Fire to run it from the
# command line.
if __name__ == "__main__":
    fire.Fire(main)