129 lines
3.9 KiB
Python
129 lines
3.9 KiB
Python
import datetime
|
|
from pathlib import Path
|
|
|
|
import fire
|
|
import matplotlib
|
|
import matplotlib.dates as mdates
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.ticker as mtick
|
|
import numpy as np
|
|
import pandas as pd
|
|
import scipy
|
|
|
|
from wsgi import create_fig, create_plot_df, get_tables, plot
|
|
|
|
|
|
def create_dfs(url: str = "https://beschaeftigtenbefragung.verdi.de/"):
    """Build the district table, the state table and the plot table.

    The tables are first requested through ``get_tables(url)``. If any step
    of that path raises, the exception is printed and the two tables are
    instead read from ``.ods`` files in the ``data`` directory, selected by
    the name prefix of the lexicographically last entry in that directory.

    Args:
        url: Address handed to ``get_tables``.

    Returns:
        A ``(df, df_state, plot_df)`` tuple.
    """

    def _read_digital_sheet(ods_path):
        # Load the "digital" sheet (first column as index) and store the
        # "Digitale Befragung" column with the nullable Int32 dtype.
        with ods_path.open("rb") as handle:
            sheet = pd.read_excel(handle, sheet_name="digital", index_col=0)
        return sheet.astype({"Digitale Befragung": "Int32"})

    try:
        df, df_state, curr_datetime = get_tables(url)

        # Highest "Digitale Befragung" first; ties ordered by Bundesland,
        # then Bezirk.
        sort_columns = ["Digitale Befragung", "Bundesland", "Bezirk"]
        df = df.sort_values(by=sort_columns, ascending=[False, True, True])

        df_state = df_state.sort_values("Landesbezirk")
        plot_df = create_plot_df(curr_datetime, df_state)

    except Exception as err:
        # Best-effort fallback: report the problem and use the stored files.
        print(err)

        data_dir = Path("data")
        newest_entry = sorted(data_dir.iterdir())[-1]
        # First ten characters of the file name select the snapshot
        # (presumably a YYYY-MM-DD date prefix -- TODO confirm).
        key = newest_entry.name[:10]

        df = _read_digital_sheet(data_dir / f"{key}_data.ods")
        df_state = _read_digital_sheet(data_dir / f"{key}_state_data.ods")

        # NOTE(review): what create_plot_df does with (None, None) is not
        # visible here; presumably it falls back to stored data -- confirm.
        plot_df = create_plot_df(None, None)

    return df, df_state, plot_df
|
|
|
|
|
|
def main() -> None:
    """Plot participation with a linear-regression projection and save it.

    Draws the base plot via ``plot``, fits a straight line to the summed
    participation over time, prints the projected participation (at the
    target date and now) and the dates at which 1500 / 2500 / 3500 would be
    reached, overlays the regression line and a straight "target line" up to
    3500, and writes the figure to ``plots/regression.png``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on older SciPy releases.
    from scipy import stats

    def _epoch_ns(values: np.ndarray) -> np.ndarray:
        # Normalise to nanosecond resolution before the integer cast so the
        # numbers agree with ``pd.Timestamp(<number>)``, which reads
        # nanoseconds, whatever resolution the array was stored with.
        return values.astype("datetime64[ns]").astype(np.int64)

    goal = 3500
    target_time = pd.Timestamp("2023-10-01")

    _, _, plot_df = create_dfs()

    plot(plot_df, landesbez_str=[None], max_shading_date="2023-10-02")

    plt.gcf().set_size_inches(10, 5)

    # Keep the left limit chosen by ``plot``; extend the right one past the
    # target date and leave a little headroom above the goal.
    left_limit = plt.xlim()[0]
    plt.xlim(left_limit, pd.Timestamp("2023-10-02"))
    plt.ylim(0, goal * 1.025)

    # Row-wise total over all columns; the first three remaining rows are
    # excluded from the fit.
    data = plot_df.dropna().sum(1)
    data = data.iloc[3:]
    reg = stats.linregress(_epoch_ns(data.index.to_numpy()), data)

    print(f"Regression R^2: {reg.rvalue**2:.6f}")

    # Daily projection axis, with the target date appended as the last point.
    date_range = pd.date_range(start="2023-08-21 10:00:00", end=target_time)
    date_range = date_range.to_series(index=np.arange(len(date_range)))
    date_range.loc[len(date_range)] = target_time

    def regression_curve(timestamps: np.ndarray) -> np.ndarray:
        # Evaluate the fitted line at the given datetime64 values.
        return reg.intercept + reg.slope * _epoch_ns(timestamps)

    vals = regression_curve(date_range.to_numpy())
    print(f"Projizierte Teilnahme am {target_time}: {vals[-1]:.2f}")
    now = pd.Timestamp.now()
    projected_now = regression_curve(pd.Series([now]).to_numpy()).item()
    print(f"Projizierte Teilnahme jetzt: {projected_now:.2f}")

    print()

    for target in [1500, 2500, 3500]:
        # Invert the fitted line: the time (in ns) at which it hits `target`.
        target_reached_date = (target - reg.intercept) / reg.slope
        print(
            f"Ziel {target} erreicht am {pd.Timestamp(target_reached_date).strftime('%Y-%m-%d %X')}"
        )

    # NOTE(review): the target line starts two positions after the position
    # of the latest observation; the reason for this offset is not visible
    # here -- confirm.
    num_skipped_days = 2

    x = _epoch_ns(date_range.to_numpy())
    start = data.index.argmax() + num_skipped_days
    curr_time = x[start]

    # Positional access: a plain ``data[-1]`` on a non-integer index relies on
    # a deprecated positional fallback.
    last_value = data.iloc[-1]
    delta = goal - last_value

    # Straight line from the last observed value to the goal at the final
    # point of the projection axis.
    target_line = last_value + delta / (x[-1] - curr_time) * (x[start:] - curr_time)

    plt.plot(
        date_range,
        vals,
        label=f"Lineare Regression ($R^2={reg.rvalue**2:.3f}$)",
        color="tab:green",
        zorder=1,
    )
    plt.plot(
        date_range.iloc[start:],
        target_line,
        label="Ziellinie",
        color="tab:orange",
        linestyle=":",
        zorder=1,
    )
    plt.xlabel("Zeit in Tagen ab dem 15.08.")
    plt.axvline(x=target_time, color="tab:red", linestyle="--")
    plt.legend()

    # Show a single x tick, at the target date, formatted as day.month.
    axis = plt.gca()
    axis.xaxis.set_major_locator(matplotlib.ticker.NullLocator())
    axis.xaxis.set_major_formatter(mdates.DateFormatter("%d.%m."))
    axis.set_xticks([target_time])

    plt.title("Projektion Teilnahme an Digitaler Beschäftigtenbefragung")

    # Make sure the output directory exists before saving.
    Path("plots").mkdir(parents=True, exist_ok=True)
    plt.savefig("plots/regression.png", bbox_inches="tight", dpi=300)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run ``main`` through python-fire when executed as a script.
    fire.Fire(main)
|