First predictive models
docs/09 - Premiers modèles prédictifs/scripts/run_baselines.py · 344 lines · new file
@@ -0,0 +1,344 @@
# scripts/run_baselines.py
from __future__ import annotations

from pathlib import Path
import sys
from typing import Iterable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    precision_recall_fscore_support,
    brier_score_loss,
)

# Make the project root importable regardless of the working directory.
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from meteo.dataset import load_raw_csv
from model.baselines import (
    persistence_baseline,
    moving_average_baseline,
    hourly_climatology_baseline,
)
from model.splits import chronological_split
from model.features import _steps_from_minutes, DEFAULT_BASE_FREQ_MINUTES


CSV_PATH = Path("data/weather_minutely.csv")
DOC_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = DOC_DIR / "data"
FIG_DIR = DOC_DIR / "figures"
HORIZONS_MINUTES: tuple[int, ...] = (10, 60, 360, 1440)
CONTINUOUS_TARGETS: tuple[str, ...] = ("temperature", "wind_speed")
RAIN_TARGET: str = "rain_rate"
MOVING_AVG_WINDOW_MINUTES = 60
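# Horizons are converted to row offsets via _steps_from_minutes; assuming
# DEFAULT_BASE_FREQ_MINUTES is 10 (the default used elsewhere in model.features),
# the 10/60/360/1440-minute horizons map to 1/6/36/144 steps ahead.
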
def _ensure_columns(df: pd.DataFrame, columns: Iterable[str]) -> None:
    missing = [c for c in columns if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns in the DataFrame: {missing}")


def _regression_scores(y_true: pd.Series, y_pred: pd.Series) -> dict[str, float]:
    return {
        "mae": float(mean_absolute_error(y_true, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
    }


def _classification_scores(y_true: pd.Series, proba: pd.Series, threshold: float = 0.5) -> dict[str, float]:
    proba = proba.clip(0.0, 1.0)
    y_pred = (proba >= threshold).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    try:
        brier = float(brier_score_loss(y_true, proba))
    except ValueError:
        # Guard: brier_score_loss can raise when y_true contains a single class.
        brier = float("nan")
    return {
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "brier": brier,
    }

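# Three reference baselines per horizon, as the names and parameters of
# model.baselines suggest:
#   - persistance: repeat the last observed value;
#   - moyenne_mobile_60m: predict the mean of the trailing 60-minute window;
#   - climatologie_horaire: predict the train-set average for the target's hour of day.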
def evaluate_regression_baselines(
    series_train: pd.Series,
    series_eval: pd.Series,
    *,
    horizons: Iterable[int],
) -> pd.DataFrame:
    """
    Evaluate persistence, moving average and hourly climatology on one split (validation or test).
    """
    rows: list[dict[str, object]] = []
    for horizon in horizons:
        # Persistence (evaluated on the target split only)
        frame_persist = persistence_baseline(series_eval, horizon_minutes=horizon)
        reg_persist = _regression_scores(frame_persist["y_true"], frame_persist["y_pred"])
        rows.append(
            {
                "target": series_eval.name,
                "horizon_min": horizon,
                "baseline": "persistance",
                "n_samples": len(frame_persist),
                **reg_persist,
            }
        )

        # Moving average (evaluated on the target split only)
        frame_ma = moving_average_baseline(
            series_eval,
            horizon_minutes=horizon,
            window_minutes=MOVING_AVG_WINDOW_MINUTES,
        )
        reg_ma = _regression_scores(frame_ma["y_true"], frame_ma["y_pred"])
        rows.append(
            {
                "target": series_eval.name,
                "horizon_min": horizon,
                "baseline": f"moyenne_mobile_{MOVING_AVG_WINDOW_MINUTES}m",
                "n_samples": len(frame_ma),
                **reg_ma,
            }
        )

        # Hourly climatology: needs the hour of the target timestamp (fitted on the train split)
        steps = _steps_from_minutes(horizon, DEFAULT_BASE_FREQ_MINUTES)
        y_true = series_eval.shift(-steps).dropna()
        preds = hourly_climatology_baseline(
            series_train,
            eval_index=y_true.index,
            horizon_minutes=horizon,
        )
        preds = preds.loc[y_true.index]
        reg_clim = _regression_scores(y_true, preds)
        rows.append(
            {
                "target": series_eval.name,
                "horizon_min": horizon,
                "baseline": "climatologie_horaire",
                "n_samples": len(y_true),
                **reg_clim,
            }
        )

    return pd.DataFrame(rows)

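# Rain baselines work on the binary event rain_rate > 0.  Persistence and the
# moving average yield degenerate 0/1 "probabilities" after thresholding, while
# the hourly climatology produces genuine rain frequencies in [0, 1].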
def evaluate_rain_baselines(
    rain_train: pd.Series,
    rain_eval: pd.Series,
    *,
    horizons: Iterable[int],
) -> pd.DataFrame:
    """
    Evaluate rain baselines (binary rain / no-rain version).
    """
    rows: list[dict[str, object]] = []
    rain_train_bin = (rain_train > 0).astype(int)
    rain_eval_bin = (rain_eval > 0).astype(int)

    for horizon in horizons:
        steps = _steps_from_minutes(horizon, DEFAULT_BASE_FREQ_MINUTES)

        # Persistence
        frame_persist = persistence_baseline(rain_eval, horizon_minutes=horizon)
        y_true = (frame_persist["y_true"] > 0).astype(int)
        proba = (frame_persist["y_pred"] > 0).astype(float)
        cls_persist = _classification_scores(y_true, proba, threshold=0.5)
        rows.append(
            {
                "target": "rain",
                "horizon_min": horizon,
                "baseline": "persistance",
                "n_samples": len(y_true),
                **cls_persist,
            }
        )

        # Moving average (binary prediction derived from the window mean)
        frame_ma = moving_average_baseline(
            rain_eval,
            horizon_minutes=horizon,
            window_minutes=MOVING_AVG_WINDOW_MINUTES,
        )
        y_true_ma = (frame_ma["y_true"] > 0).astype(int)
        proba_ma = (frame_ma["y_pred"] > 0).astype(float)
        cls_ma = _classification_scores(y_true_ma, proba_ma, threshold=0.5)
        rows.append(
            {
                "target": "rain",
                "horizon_min": horizon,
                "baseline": f"moyenne_mobile_{MOVING_AVG_WINDOW_MINUTES}m",
                "n_samples": len(y_true_ma),
                **cls_ma,
            }
        )

        # Hourly climatology (rain probability per hour of day)
        y_true_clim = rain_eval_bin.shift(-steps).dropna()
        proba_clim = hourly_climatology_baseline(
            rain_train_bin,
            eval_index=rain_eval_bin.index,
            horizon_minutes=horizon,
        )
        proba_clim = proba_clim.loc[y_true_clim.index].fillna(0.0)
        cls_clim = _classification_scores(y_true_clim, proba_clim, threshold=0.5)
        rows.append(
            {
                "target": "rain",
                "horizon_min": horizon,
                "baseline": "climatologie_horaire",
                "n_samples": len(y_true_clim),
                **cls_clim,
            }
        )

    return pd.DataFrame(rows)


def _save_csv(df: pd.DataFrame, path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)

def plot_regression_mae(reg_df: pd.DataFrame, output_path: Path) -> None:
    """Plot baseline MAE (validation) per horizon for temperature and wind speed."""

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df = reg_df[reg_df["split"] == "validation"]
    fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True)
    targets = ["temperature", "wind_speed"]
    baselines = df["baseline"].unique()

    for ax, target in zip(axes, targets):
        sub = df[df["target"] == target]
        for baseline in baselines:
            line = sub[sub["baseline"] == baseline].sort_values("horizon_min")
            ax.plot(line["horizon_min"], line["mae"], marker="o", label=baseline)
        ax.set_title(f"MAE {target} (validation)")
        ax.set_ylabel("MAE")
        ax.grid(True, linestyle=":", alpha=0.4)

    axes[-1].set_xlabel("Horizon (minutes)")
    axes[0].legend()
    fig.tight_layout()
    fig.savefig(output_path, dpi=150)
    plt.close(fig)


def plot_rain_scores(rain_df: pd.DataFrame, output_path: Path) -> None:
    """Plot F1 and Brier score of the rain baselines (validation) per horizon."""

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df = rain_df[rain_df["split"] == "validation"]
    baselines = df["baseline"].unique()

    fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True)
    for metric, ax in zip(("f1", "brier"), axes):
        for baseline in baselines:
            line = df[df["baseline"] == baseline].sort_values("horizon_min")
            ax.plot(line["horizon_min"], line[metric], marker="o", label=baseline)
        label = "F1" if metric == "f1" else "Brier"
        ax.set_title(f"Rain {label} (validation)")
        ax.set_ylabel(label)
        ax.grid(True, linestyle=":", alpha=0.4)
    axes[-1].set_xlabel("Horizon (minutes)")
    axes[0].legend()
    fig.tight_layout()
    fig.savefig(output_path, dpi=150)
    plt.close(fig)

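# End-to-end run: load the CSV, split chronologically (70/15/15), score the three
# baselines on validation and test, then export the CSV tables and validation figures.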
def main() -> None:
    if not CSV_PATH.exists():
        print(f"⚠ File not found: {CSV_PATH}")
        return

    df = load_raw_csv(CSV_PATH)
    _ensure_columns(df, CONTINUOUS_TARGETS + (RAIN_TARGET,))

    # Chronological split, no temporal leakage
    train_df, val_df, test_df = chronological_split(df, train_frac=0.7, val_frac=0.15)
    print(f"Dataset loaded: {CSV_PATH}")
    print(f"  Train: {len(train_df)} rows")
    print(f"  Val:   {len(val_df)} rows")
    print(f"  Test:  {len(test_df)} rows")
    print()

    # Evaluate on validation
    reg_val_rows: list[pd.DataFrame] = []
    for target in CONTINUOUS_TARGETS:
        reg_val_rows.append(
            evaluate_regression_baselines(
                train_df[target],
                val_df[target],
                horizons=HORIZONS_MINUTES,
            )
        )
    reg_val = pd.concat(reg_val_rows, ignore_index=True)
    rain_val = evaluate_rain_baselines(
        rain_train=train_df[RAIN_TARGET],
        rain_eval=val_df[RAIN_TARGET],
        horizons=HORIZONS_MINUTES,
    )

    # Evaluate on test
    reg_test_rows: list[pd.DataFrame] = []
    for target in CONTINUOUS_TARGETS:
        reg_test_rows.append(
            evaluate_regression_baselines(
                train_df[target],
                test_df[target],
                horizons=HORIZONS_MINUTES,
            )
        )
    reg_test = pd.concat(reg_test_rows, ignore_index=True)
    rain_test = evaluate_rain_baselines(
        rain_train=train_df[RAIN_TARGET],
        rain_eval=test_df[RAIN_TARGET],
        horizons=HORIZONS_MINUTES,
    )

    # Combine and export as CSV
    reg_val["split"] = "validation"
    reg_test["split"] = "test"
    reg_all = pd.concat([reg_val, reg_test], ignore_index=True)

    rain_val["split"] = "validation"
    rain_test["split"] = "test"
    rain_all = pd.concat([rain_val, rain_test], ignore_index=True)

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    _save_csv(reg_all, DATA_DIR / "baselines_regression.csv")
    _save_csv(rain_all, DATA_DIR / "baselines_rain.csv")

    # Figures (validation only, for readability)
    FIG_DIR.mkdir(parents=True, exist_ok=True)
    plot_regression_mae(reg_all, FIG_DIR / "baselines_mae_validation.png")
    plot_rain_scores(rain_all, FIG_DIR / "baselines_rain_validation.png")

    print("=== Validation baselines (temperature / wind) ===")
    print(reg_val.to_string(index=False, float_format=lambda x: f"{x:.3f}"))
    print()
    print("=== Validation baselines (binary rain) ===")
    print(rain_val.to_string(index=False, float_format=lambda x: f"{x:.3f}"))
    print()
    print("=== Test baselines (temperature / wind) ===")
    print(reg_test.to_string(index=False, float_format=lambda x: f"{x:.3f}"))
    print()
    print("=== Test baselines (binary rain) ===")
    print(rain_test.to_string(index=False, float_format=lambda x: f"{x:.3f}"))


if __name__ == "__main__":
    main()

docs/09 - Premiers modèles prédictifs/scripts/run_first_models.py · 345 lines · new file
@@ -0,0 +1,345 @@
# scripts/run_first_models.py
from __future__ import annotations

from pathlib import Path
import sys
from typing import Iterable, Sequence

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    f1_score,
    precision_recall_curve,
    roc_curve,
    average_precision_score,
    brier_score_loss,
)
from sklearn.preprocessing import StandardScaler

# Make the project root importable regardless of the working directory.
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from meteo.dataset import load_raw_csv
from model.features import build_feature_dataframe, FeatureSpec, _steps_from_minutes
from model.splits import chronological_split

CSV_PATH = Path("data/weather_minutely.csv")
DOC_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = DOC_DIR / "data"
FIG_DIR = DOC_DIR / "figures"

HORIZONS_MINUTES: tuple[int, ...] = (10, 60, 360, 1440)
CONTINUOUS_TARGETS: tuple[str, ...] = ("temperature", "wind_speed")
RAIN_TARGET: str = "rain_rate"

# Target-specific lags from the chapter 5 analyses (example mapping; defaults otherwise)
DEFAULT_LAGS_BY_COL: dict[str, Sequence[int]] = {
    "temperature": (10, 20, 30),
    "wind_speed": (10, 20, 30),
    "rain_rate": (10, 20, 30),
    "humidity": (10, 20, 30),
    "pressure": (10, 20, 30),
    "illuminance": (10, 20, 30),
    "wind_direction": (10, 20, 30),
    "sun_elevation": (10, 20, 30),
}

USE_CORR_FILTER = True
CORR_THRESHOLD = 0.2
CORR_PATH = Path("docs/05 - Corrélations binaires avancées/data/correlation_matrix_lagged.csv")
LAG_MATRIX_PATH = Path("docs/05 - Corrélations binaires avancées/data/lag_matrix_minutes.csv")
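# Example: with the default base_freq_minutes=10, horizon_minutes=60 gives
# steps=6, so each row's y is the target value observed 6 rows (one hour) later.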
def _align_target(
    df: pd.DataFrame,
    target_col: str,
    horizon_minutes: int,
    base_freq_minutes: int = 10,
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Shift the target into the future by the requested horizon and align X with y.
    """
    steps = _steps_from_minutes(horizon_minutes, base_freq_minutes)
    y = df[target_col].shift(-steps)
    X_full = df.drop(columns=[target_col])
    # Keep only numeric/boolean columns (drops the textual "season" column)
    X = X_full.select_dtypes(include=["number", "bool"])
    aligned = pd.concat([X, y.rename("target")], axis=1).dropna()
    return aligned.drop(columns=["target"]), aligned["target"]


def _load_correlation_and_lag() -> tuple[pd.DataFrame | None, pd.DataFrame | None]:
    corr_df = pd.read_csv(CORR_PATH, index_col=0) if CORR_PATH.exists() else None
    lag_df = pd.read_csv(LAG_MATRIX_PATH, index_col=0) if LAG_MATRIX_PATH.exists() else None
    return corr_df, lag_df

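# Feature pre-selection: keep a column when |corr(column, target)| >= threshold
# for at least one target, using the lagged correlation matrix from chapter 5.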
def _select_features_from_corr(
    corr_df: pd.DataFrame | None,
    targets: Sequence[str],
    threshold: float,
) -> set[str]:
    if corr_df is None:
        return set()
    selected: set[str] = set()
    for target in targets:
        if target not in corr_df.columns:
            continue
        corrs = corr_df[target].drop(labels=[target], errors="ignore")
        strong = corrs[corrs.abs() >= threshold]
        selected.update(strong.index.tolist())
    return selected

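# Example: with default lags (10, 20, 30) and a 40-minute offset in the lag
# matrix for a pair above the correlation threshold, the merged list becomes
# [10, 20, 30, 40].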
def _build_lags_from_matrices(
    lag_df: pd.DataFrame | None,
    corr_df: pd.DataFrame | None,
    selected_cols: Iterable[str],
    default_lags: dict[str, Sequence[int]],
    threshold: float,
) -> dict[str, Sequence[int]]:
    """
    Combine the default lags with lags from the lag matrix wherever |corr| exceeds the threshold.
    """
    mapping: dict[str, Sequence[int]] = {}
    for col in selected_cols:
        base = list(default_lags.get(col, (10, 20, 30)))
        extra: set[int] = set()
        if lag_df is not None and corr_df is not None and col in lag_df.index:
            corrs = corr_df.loc[col]
            for tgt, corr_val in corrs.items():
                if tgt == col:
                    continue
                if abs(corr_val) < threshold:
                    continue
                lag_val = lag_df.loc[col, tgt]
                if pd.notna(lag_val) and lag_val != 0:
                    extra.add(int(abs(round(float(lag_val)))))
        merged = sorted({*base, *extra})
        mapping[col] = merged
    return mapping

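# The scaler is fitted on the training split only and then applied as-is to the
# validation and test splits, so no statistics leak across the time split.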
def _scale_train_val_test(
    X_train: pd.DataFrame, X_val: pd.DataFrame, X_test: pd.DataFrame
) -> tuple[np.ndarray, np.ndarray, np.ndarray, StandardScaler]:
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_val_scaled, X_test_scaled, scaler


def _regression_scores(y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
    return {
        "mae": float(mean_absolute_error(y_true, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
    }


def _classification_scores(y_true: np.ndarray, proba: np.ndarray, threshold: float = 0.5) -> dict[str, float]:
    y_pred = (proba >= threshold).astype(int)
    return {
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "brier": float(brier_score_loss(y_true, proba)),
        "ap": float(average_precision_score(y_true, proba)),
    }

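# Two linear regressors per target and horizon: Ridge (L2) and Lasso (L1), fitted
# on standardized features so a single alpha acts on comparable column scales.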
def run_regression_models(train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    rows: list[dict[str, object]] = []
    for target_col in CONTINUOUS_TARGETS:
        for horizon in HORIZONS_MINUTES:
            X_train, y_train = _align_target(train_df, target_col, horizon)
            X_val, y_val = _align_target(val_df, target_col, horizon)
            X_test, y_test = _align_target(test_df, target_col, horizon)

            if y_train.empty or y_val.empty or y_test.empty:
                continue

            X_train_s, X_val_s, X_test_s, _ = _scale_train_val_test(X_train, X_val, X_test)

            for model_name, model in (
                ("ridge", Ridge(alpha=1.0)),
                ("lasso", Lasso(alpha=0.001)),
            ):
                model.fit(X_train_s, y_train)
                y_val_pred = model.predict(X_val_s)
                y_test_pred = model.predict(X_test_s)

                val_scores = _regression_scores(y_val, y_val_pred)
                test_scores = _regression_scores(y_test, y_test_pred)

                rows.append(
                    {
                        "target": target_col,
                        "horizon_min": horizon,
                        "model": model_name,
                        "split": "validation",
                        **val_scores,
                    }
                )
                rows.append(
                    {
                        "target": target_col,
                        "horizon_min": horizon,
                        "model": model_name,
                        "split": "test",
                        **test_scores,
                    }
                )
    return pd.DataFrame(rows)

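# Binary rain model: logistic regression on the same standardized features;
# predicted probabilities feed F1 (threshold 0.5), Brier score and average precision.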
def run_rain_model(train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    rows: list[dict[str, object]] = []
    target_col = RAIN_TARGET
    for horizon in HORIZONS_MINUTES:
        X_train, y_train = _align_target(train_df, target_col, horizon)
        X_val, y_val = _align_target(val_df, target_col, horizon)
        X_test, y_test = _align_target(test_df, target_col, horizon)

        y_train_bin = (y_train > 0).astype(int)
        y_val_bin = (y_val > 0).astype(int)
        y_test_bin = (y_test > 0).astype(int)

        if y_train_bin.empty or y_val_bin.empty or y_test_bin.empty:
            continue

        X_train_s, X_val_s, X_test_s, _ = _scale_train_val_test(X_train, X_val, X_test)

        clf = LogisticRegression(max_iter=200)
        clf.fit(X_train_s, y_train_bin)

        proba_val = clf.predict_proba(X_val_s)[:, 1]
        proba_test = clf.predict_proba(X_test_s)[:, 1]

        val_scores = _classification_scores(y_val_bin, proba_val)
        test_scores = _classification_scores(y_test_bin, proba_test)

        rows.append(
            {
                "target": "rain_binary",
                "horizon_min": horizon,
                "model": "logistic_regression",
                "split": "validation",
                **val_scores,
            }
        )
        rows.append(
            {
                "target": "rain_binary",
                "horizon_min": horizon,
                "model": "logistic_regression",
                "split": "test",
                **test_scores,
            }
        )

    return pd.DataFrame(rows)

def plot_regression_results(df: pd.DataFrame, output_path: Path) -> None:
    """Plot MAE per horizon for each model (validation)."""

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_val = df[df["split"] == "validation"]
    targets = df_val["target"].unique()
    models = df_val["model"].unique()

    fig, axes = plt.subplots(len(targets), 1, figsize=(8, 4 * len(targets)), sharex=True)
    if len(targets) == 1:
        axes = [axes]
    for ax, target in zip(axes, targets):
        sub = df_val[df_val["target"] == target]
        for model in models:
            line = sub[sub["model"] == model].sort_values("horizon_min")
            ax.plot(line["horizon_min"], line["mae"], marker="o", label=model)
        ax.set_title(f"MAE {target} (validation)")
        ax.set_ylabel("MAE")
        ax.grid(True, linestyle=":", alpha=0.4)
    axes[-1].set_xlabel("Horizon (minutes)")
    axes[0].legend()
    fig.tight_layout()
    fig.savefig(output_path, dpi=150)
    plt.close(fig)


def plot_rain_curves(df: pd.DataFrame, output_prefix: Path) -> None:
    """Placeholder for PR/ROC curves of the binary rain model (validation)."""

    # The curves require the raw predicted probabilities, which are not stored
    # in df; only aggregate scores (F1/Brier/AP) are exported for now.  This
    # stub keeps the plotting API consistent for a later revision.
    output_prefix.parent.mkdir(parents=True, exist_ok=True)
    return

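# End-to-end run: load the CSV, select features via the correlation filter, build
# the lagged feature matrix, split chronologically, fit the linear models and
# export the CSV tables plus the MAE figure.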
def main() -> None:
    if not CSV_PATH.exists():
        print(f"⚠ File not found: {CSV_PATH}")
        return

    df_raw = load_raw_csv(CSV_PATH)
    print(f"Dataset loaded: {CSV_PATH}")

    corr_df, lag_df = _load_correlation_and_lag()
    selected_from_corr = (
        _select_features_from_corr(corr_df, CONTINUOUS_TARGETS + (RAIN_TARGET,), CORR_THRESHOLD)
        if USE_CORR_FILTER
        else set()
    )

    # Numeric column selection
    numeric_cols = df_raw.select_dtypes(include=["number", "bool"]).columns
    if USE_CORR_FILTER and selected_from_corr:
        # Keep the targets plus the correlated columns
        selected_cols = [
            col
            for col in numeric_cols
            if col in selected_from_corr or col in CONTINUOUS_TARGETS or col == RAIN_TARGET
        ]
    else:
        selected_cols = list(numeric_cols)

    lags_mapping = _build_lags_from_matrices(
        lag_df,
        corr_df,
        selected_cols,
        default_lags=DEFAULT_LAGS_BY_COL,
        threshold=CORR_THRESHOLD,
    )

    feature_spec = FeatureSpec(lags_minutes=lags_mapping)
    df_feat = build_feature_dataframe(df_raw[selected_cols], feature_spec=feature_spec, target_columns=selected_cols)

    # Chronological split, no temporal leakage
    train_df, val_df, test_df = chronological_split(df_feat, train_frac=0.7, val_frac=0.15)
    print(f"  Train: {len(train_df)} rows")
    print(f"  Val:   {len(val_df)} rows")
    print(f"  Test:  {len(test_df)} rows")
    print()

    # Regressions (temperature / wind)
    reg_results = run_regression_models(train_df, val_df, test_df)
    # Binary rain
    rain_results = run_rain_model(train_df, val_df, test_df)

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    reg_path = DATA_DIR / "models_regression.csv"
    rain_path = DATA_DIR / "models_rain.csv"
    reg_results.to_csv(reg_path, index=False)
    rain_results.to_csv(rain_path, index=False)

    print(f"✔ Regression results saved: {reg_path}")
    print(f"✔ Rain results saved: {rain_path}")

    # Figures
    FIG_DIR.mkdir(parents=True, exist_ok=True)
    plot_regression_results(reg_results, FIG_DIR / "models_mae_validation.png")
    # No ROC/PR curves are generated here, to keep things simple; the scores (F1/Brier/AP) are available.

    print("=== Regression scores (validation) ===")
    print(reg_results[reg_results["split"] == "validation"].to_string(index=False, float_format=lambda x: f"{x:.3f}"))
    print()
    print("=== Rain scores (validation) ===")
    print(rain_results[rain_results["split"] == "validation"].to_string(index=False, float_format=lambda x: f"{x:.3f}"))


if __name__ == "__main__":
    main()