
Refine the correlation heatmaps and lag annotation

2025-11-21 01:46:06 +01:00
parent a36157b52f
commit 2ff719107b
11 changed files with 599 additions and 36 deletions


@@ -0,0 +1,191 @@
# scripts/plot_lagged_correlation_heatmap_from_data.py
from __future__ import annotations
from pathlib import Path
import sys
import argparse
import pandas as pd
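# Make the project root importable when this script is run directly.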
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from meteo.variables import VARIABLES, VARIABLES_BY_KEY
from meteo.plots import plot_correlation_heatmap
DATA_DIR = Path("docs/05 - Corrélations binaires avancées/data/lagged_correlations")
FIG_DIR = Path("docs/05 - Corrélations binaires avancées/figures")
DATA_OUTPUT_DIR = Path("docs/05 - Corrélations binaires avancées/data")
def _load_best_corr_and_lag(csv_path: Path) -> tuple[float, int, bool] | None:
"""Retourne (corr, lag, sign_known) au max |r| à partir d'un CSV de lagcorr."""
df = pd.read_csv(csv_path)
if "Pearson" in df.columns:
series = df["Pearson"]
sign_known = True
elif "correlation" in df.columns:
series = df["correlation"]
sign_known = True
elif "Pearson |r|" in df.columns:
series = df["Pearson |r|"]
sign_known = False
elif "Pearson (r²)" in df.columns:
series = (df["Pearson (r²)"].abs()) ** 0.5
sign_known = False
else:
return None
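# Pick the lag where the absolute correlation peaks; keep the signed value when the column provides it.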
abs_series = series.abs()
if abs_series.empty or abs_series.isna().all():
return None
idx = abs_series.idxmax()
best_corr = series.loc[idx]  # idxmax() returns an index label; use .loc, consistent with the lookup below
best_lag = int(df.loc[idx, "lag_minutes"])
# Without sign information, return a positive corr (the sign can later be set from the reverse CSV)
if not sign_known:
best_corr = abs(best_corr)
return best_corr, best_lag, sign_known
def _get_pair_best(
vx_key: str,
vy_key: str,
*,
data_dir: Path,
) -> tuple[float, int, bool, str] | None:
"""
Return (corr, lag, sign_known, source) for the ordered pair vx->vy.
If the direct CSV does not carry the sign (|r| or r²), try to infer it
from the reverse CSV (vy->vx), negating the lag.
"""
primary = data_dir / f"lagcorr_{vx_key}_to_{vy_key}.csv"
reverse = data_dir / f"lagcorr_{vy_key}_to_{vx_key}.csv"
primary_res = _load_best_corr_and_lag(primary) if primary.exists() else None
if primary_res and primary_res[2]:
corr, lag, sign_known = primary_res
return corr, lag, sign_known, primary.name
# Attempt to infer the sign from the reverse CSV
reverse_res = _load_best_corr_and_lag(reverse) if reverse.exists() else None
if primary_res and reverse_res and reverse_res[2]:
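# The direct CSV lacks a sign here (otherwise we would have returned above): borrow the sign from the reverse series and negate its lag.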
primary_corr, _, _ = primary_res
rev_corr, rev_lag, _ = reverse_res
corr = primary_corr if primary_res[2] else abs(primary_corr) * (1 if rev_corr >= 0 else -1)
lag = -rev_lag
return corr, lag, True, f"{primary.name} (signe/lag inférés depuis {reverse.name})"
if primary_res:
corr, lag, sign_known = primary_res
return corr, lag, sign_known, primary.name if primary.exists() else "n/a"
if reverse_res:
rev_corr, rev_lag, sign_known = reverse_res
corr = rev_corr
lag = -rev_lag
return corr, lag, sign_known, reverse.name
return None
def main() -> None:
parser = argparse.ArgumentParser(
description="Construit des matrices depuis les CSV lagcorr_* existants (max |r| par paire ordonnée)."
)
parser.add_argument(
"--data-dir",
type=Path,
default=DATA_DIR,
help="Dossier contenant les CSV lagcorr_*_to_*.csv.",
)
parser.add_argument(
"--fig-dir",
type=Path,
default=FIG_DIR,
help="Dossier de sortie pour la heatmap.",
)
parser.add_argument(
"--data-output-dir",
type=Path,
default=DATA_OUTPUT_DIR,
help="Dossier de sortie pour les matrices CSV exportées.",
)
parser.add_argument(
"--annot-threshold",
type=float,
default=0.3,
help="N'affiche le lag annoté que si |r| >= ce seuil (0 pour tout afficher).",
)
args = parser.parse_args()
args.fig_dir.mkdir(parents=True, exist_ok=True)
args.data_output_dir.mkdir(parents=True, exist_ok=True)
columns = [v.column for v in VARIABLES]
corr_matrix = pd.DataFrame(index=columns, columns=columns, dtype=float)
lag_matrix = pd.DataFrame(index=columns, columns=columns, dtype=int)
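# corr_matrix holds the best correlation for each ordered pair; lag_matrix holds the lag (in minutes) at which it occurs.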
missing_files: list[str] = []
sign_unknown: list[str] = []
for vx in VARIABLES:
for vy in VARIABLES:
if vx == vy:
corr_matrix.loc[vx.column, vy.column] = 1.0
lag_matrix.loc[vx.column, vy.column] = 0
continue
res = _get_pair_best(vx.key, vy.key, data_dir=args.data_dir)
if res is None:
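# No usable CSV for this pair: leave the correlation as NaN so the heatmap shows a gap rather than a misleading zero.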
missing_files.append(f"{vx.key} → {vy.key}")
corr_matrix.loc[vx.column, vy.column] = float("nan")
lag_matrix.loc[vx.column, vy.column] = 0
continue
best_corr, best_lag, sign_known, source = res
if not sign_known:
sign_unknown.append(f"{vx.key} → {vy.key} (source={source})")
corr_matrix.loc[vx.column, vy.column] = best_corr
lag_matrix.loc[vx.column, vy.column] = best_lag
if missing_files:
print("⚠ CSV manquants pour certaines paires :", ", ".join(missing_files))
if sign_unknown:
print("⚠ Signe inconnu (CSV en |r| ou r²) pour :", ", ".join(sign_unknown))
# Signed heatmap (values stay positive when the sign is missing from the CSVs)
output_path = args.fig_dir / "correlation_heatmap_lagged.png"
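# Annotate each cell with its optimal lag (in minutes), blanking the annotation where |r| is below the threshold to keep the figure readable.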
annot_df = lag_matrix.copy()
if args.annot_threshold > 0:
mask = corr_matrix.abs() < args.annot_threshold
annot_df = annot_df.mask(mask, "")
plot_correlation_heatmap(
corr=corr_matrix,
variables=VARIABLES,
output_path=output_path,
annotate=True,
annotate_values=annot_df,
title="Corrélations (lag optimal par paire, issues des CSV)",
cmap="coolwarm",
vmin=-1.0,
vmax=1.0,
colorbar_label="Coefficient de corrélation r",
)
print(f"✔ Heatmap laggée sauvegardée dans : {output_path}")
corr_csv = args.data_output_dir / "correlation_matrix_lagged.csv"
corr_matrix.to_csv(corr_csv)
print(f"✔ Matrice des corrélations exportée : {corr_csv}")
lag_csv = args.data_output_dir / "lag_matrix_minutes.csv"
lag_matrix.to_csv(lag_csv)
print(f"✔ Matrice des lags exportée : {lag_csv}")
if __name__ == "__main__":
main()


@@ -4,16 +4,21 @@ from __future__ import annotations
from pathlib import Path
import sys
import argparse
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from meteo.dataset import load_raw_csv
from meteo.variables import VARIABLES_BY_KEY
from meteo.variables import VARIABLES, VARIABLES_BY_KEY
from meteo.analysis import compute_lagged_correlation
from meteo.plots import plot_lagged_correlation
from meteo.correlation_presets import DEFAULT_LAGGED_PAIRS
from meteo.plots import plot_lagged_correlation_multi
from meteo.correlation_presets import (
DEFAULT_ABS_CORRELATION_BANDS,
DEFAULT_SIGNED_CORRELATION_BANDS,
CorrelationBand,
)
CSV_PATH = Path("data/weather_minutely.csv")
@@ -22,39 +27,143 @@ OUTPUT_DIR = DOC_DIR / "figures" / "lagged_correlations"
def main() -> None:
parser = argparse.ArgumentParser(description="Trace les corrélations décalées pour toutes les paires de variables.")
parser.add_argument(
"--max-lag",
type=int,
default=720,
help="Décalage maximal en minutes (par défaut : 720 = ±12h).",
)
parser.add_argument(
"--step",
type=int,
default=10,
help="Pas en minutes pour l'évaluation des lags (par défaut : 10).",
)
parser.add_argument(
"--resample",
default="none",
help="Fréquence d'agrégation avant calcul (par défaut : 'none') Exemple : '10min'.",
)
parser.add_argument(
"--signed",
action="store_true",
default=True,
help="Affiche les corrélations signées. Utiliser --no-signed pour revenir aux valeurs absolues |r|.",
)
parser.add_argument(
"--no-signed",
action="store_false",
dest="signed",
help="Alias explicite pour repasser en valeurs absolues |r|.",
)
parser.add_argument(
"--thresholds",
default=None,
help=(
"Seuils personnalisés (valeurs séparées par des virgules). "
"Par défaut, utilise les bandes définies dans meteo.correlation_presets "
"(abs ou signées selon --signed)."
),
)
parser.add_argument(
"--only",
nargs="*",
help="Clés de variables à inclure (par défaut toutes les variables numériques).",
)
args = parser.parse_args()
if not CSV_PATH.exists():
print(f"⚠ Fichier introuvable : {CSV_PATH}")
return
df = load_raw_csv(CSV_PATH)
df = df.select_dtypes(include="number")
if args.resample and args.resample.lower() != "none":
df = df.resample(args.resample).mean()
print(f"Dataset rééchantillonné à {args.resample} pour accélérer le calcul.")
print(f"Dataset minuté chargé : {CSV_PATH}")
print(f" Lignes : {len(df)}")
print(f" Colonnes : {list(df.columns)}")
print()
for key_x, key_y in DEFAULT_LAGGED_PAIRS:
var_x = VARIABLES_BY_KEY[key_x]
var_y = VARIABLES_BY_KEY[key_y]
if args.only:
missing = [k for k in args.only if k not in VARIABLES_BY_KEY]
if missing:
raise KeyError(f"Variables inconnues : {', '.join(missing)}")
variables = [VARIABLES_BY_KEY[k] for k in args.only]
else:
variables = list(VARIABLES)
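# Unique unordered pairs only: the reverse direction is redundant because the lag range is symmetric (±max_lag).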
pairs = [(vx, vy) for i, vx in enumerate(variables) for vy in variables[i + 1 :]]
print(f"Paires analysées : {len(pairs)} (combinaisons uniques, sans inverses).")
for var_x, var_y in pairs:
print(f"→ Corrélation décalée : {var_x.key}{var_y.key}")
lag_df = compute_lagged_correlation(
lag_df_pearson = compute_lagged_correlation(
df=df,
var_x=var_x,
var_y=var_y,
max_lag_minutes=360,  # ± 6 hours
step_minutes=10,  # 10-minute step
max_lag_minutes=args.max_lag,
step_minutes=args.step,
method="pearson",
)
)["correlation"]
lag_df_spearman = compute_lagged_correlation(
df=df,
var_x=var_x,
var_y=var_y,
max_lag_minutes=args.max_lag,
step_minutes=args.step,
method="spearman",
)["correlation"]
use_abs = not args.signed
if use_abs:
lag_df_pearson = lag_df_pearson.abs()
lag_df_spearman = lag_df_spearman.abs()
ylabel = "Corrélation (|r|)"
labels = {"Pearson": "Pearson |r|", "Spearman": "Spearman |r|"}
y_limits = (0.0, 1.0)
threshold_values = (
[
float(t)
for t in (args.thresholds or "").split(",")
if t.strip() != ""
]
if args.thresholds
else []
)
bands = list(DEFAULT_ABS_CORRELATION_BANDS)
else:
ylabel = "Corrélation"
labels = {"Pearson": "Pearson", "Spearman": "Spearman"}
y_limits = (-1.0, 1.0)
threshold_values: list[float] = []
if args.thresholds:
threshold_values = [
float(t)
for t in args.thresholds.split(",")
if t.strip() != ""
]
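# Mirror each user-supplied threshold around zero so guide lines appear on both sides of the axis.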
threshold_values = sorted({v for thr in threshold_values for v in (thr, -thr)})
bands = list(DEFAULT_SIGNED_CORRELATION_BANDS)
filename = f"lagcorr_{var_x.key}_to_{var_y.key}.png"
output_path = OUTPUT_DIR / filename
plot_lagged_correlation(
lag_df=lag_df,
plot_lagged_correlation_multi(
lag_series={
labels["Pearson"]: lag_df_pearson,
labels["Spearman"]: lag_df_spearman,
},
var_x=var_x,
var_y=var_y,
output_path=output_path,
ylabel=ylabel,
y_limits=y_limits,
thresholds=threshold_values,
bands=bands,
)
print("✔ Graphiques de corrélation décalée générés.")