Exploration des relations binaires

2025-11-22 01:46:57 +01:00
parent 2ff719107b
commit 1932938fd6
135 changed files with 778 additions and 136 deletions
--- a/avancées/scripts/plot_all_pairwise_scatter.py
+++ b/avancées/scripts/plot_all_pairwise_scatter.py
@@ -0,0 +1,177 @@
+# scripts/plot_all_pairwise_scatter.py
+from __future__ import annotations
+
+from pathlib import Path
+import sys
+
+import pandas as pd
+
+
+# Put the project root at the front of sys.path (once) so that the ``meteo``
+# imports that follow resolve when this file is executed directly as a script.
+# parents[3] is the directory three levels above the folder holding this file.
+# NOTE(review): presumably that is the repository root containing ``meteo/`` —
+# confirm against the actual directory layout.
+PROJECT_ROOT = Path(__file__).resolve().parents[3]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from meteo.dataset import load_raw_csv
+from meteo.variables import iter_variable_pairs
+from meteo.plots import plot_scatter_pair
+
+
+# Input dataset loaded by main(); the path is relative, so it is resolved
+# against the current working directory at run time.
+CSV_PATH = Path("data/weather_minutely.csv")
+# Parent of the directory that contains this script.
+DOC_DIR = Path(__file__).resolve().parent.parent
+# Folder in which every generated scatter plot is written.
+OUTPUT_DIR = DOC_DIR / "figures" / "pairwise_scatter"
+# Pairwise matrices read by the loaders below (first CSV column used as index).
+LAG_MATRIX_PATH = DOC_DIR / "data" / "lag_matrix_minutes.csv"
+CORR_MATRIX_PATH = DOC_DIR / "data" / "correlation_matrix_lagged.csv"
+# Minimum absolute correlation a pair must reach for a lagged plot to be made.
+MIN_ABS_CORR_FOR_LAGGED = 0.3
+
+
+def _load_lag_matrix(path: Path) -> pd.DataFrame | None:
+    """Read the lag matrix stored in the CSV file at *path*.
+
+    The first CSV column is used as the index. Returns the resulting
+    DataFrame, or ``None`` (after printing a warning) when the file does not
+    exist or the parsed frame is empty.
+    """
+    if path.exists():
+        matrix = pd.read_csv(path, index_col=0)
+        if not matrix.empty:
+            return matrix
+        print(f"⚠ Matrice de lags vide : {path}")
+        return None
+
+    print(f"⚠ Aucun lag appliqué : fichier absent ({path}).")
+    return None
+
+
+def _load_corr_matrix(path: Path) -> pd.DataFrame | None:
+    """Read the lagged-correlation matrix stored in the CSV file at *path*.
+
+    The first CSV column is used as the index. A warning is printed and
+    ``None`` is returned when the file is missing or the parsed frame is
+    empty; otherwise the DataFrame is returned.
+    """
+    loaded: pd.DataFrame | None = None
+
+    if not path.exists():
+        print(f"⚠ Matrice de corrélation laggée absente ({path}) : les versions décalées seront ignorées.")
+    else:
+        frame = pd.read_csv(path, index_col=0)
+        if frame.empty:
+            print(f"⚠ Matrice de corrélation laggée vide : {path}")
+        else:
+            loaded = frame
+
+    return loaded
+
+
+def _get_optimal_lag(lag_df: pd.DataFrame, var_x, var_y) -> int | None:
+    """Look up the lag stored in *lag_df* for the (var_x, var_y) cell.
+
+    The row label is ``var_x.column`` and the column label is
+    ``var_y.column``. Returns the cell value converted with
+    ``int(round(float(...)))``, or ``None`` when the cell cannot be found
+    (``KeyError``) or holds a missing value.
+    """
+    row_label, col_label = var_x.column, var_y.column
+    try:
+        cell = lag_df.loc[row_label, col_label]
+    except KeyError:
+        return None
+
+    return None if pd.isna(cell) else int(round(float(cell)))
+
+
+def _should_generate_lagged(
+    lag_df: pd.DataFrame | None,
+    corr_df: pd.DataFrame | None,
+    var_x,
+    var_y,
+) -> tuple[bool, int | None, float | None, str | None]:
+    """Decide whether a lagged scatter plot should be produced for a pair.
+
+    Returns ``(do_generate, applied_lag_minutes, corr_value, source)`` where:
+
+    - ``applied_lag_minutes`` is expressed for the pair in (var_x, var_y)
+      order (lag > 0: var_x leads var_y; lag < 0: var_y leads var_x);
+    - ``source`` names the matrix orientation that was retained, either
+      ``"x->y"`` or ``"y->x"``.
+
+    An orientation is eligible only when its stored lag, once rounded, is
+    strictly positive and ``|corr| >= MIN_ABS_CORR_FOR_LAGGED``. Both
+    orientations are examined so that a positive lag available only in the
+    reverse order is still detected; its sign is then flipped so the result
+    applies to the original (var_x, var_y) order. When both orientations are
+    eligible, the one with the larger absolute correlation wins, and x->y is
+    kept on a tie. ``(False, None, None, None)`` is returned when either
+    matrix is ``None`` or no orientation is eligible.
+    """
+    nothing = (False, None, None, None)
+    if lag_df is None or corr_df is None:
+        return nothing
+
+    # Each entry: (lag signed for the (x, y) order, correlation, orientation).
+    eligible: list[tuple[int, float, str]] = []
+    orientations = (("x->y", var_x, var_y, 1), ("y->x", var_y, var_x, -1))
+    for label, lead, follow, sign in orientations:
+        try:
+            raw_lag = lag_df.loc[lead.column, follow.column]
+            raw_corr = corr_df.loc[lead.column, follow.column]
+        except KeyError:
+            # Pair not present in one of the matrices for this orientation.
+            continue
+
+        if pd.isna(raw_lag) or pd.isna(raw_corr):
+            continue
+
+        minutes = int(round(float(raw_lag)))
+        corr = float(raw_corr)
+        if minutes > 0 and abs(corr) >= MIN_ABS_CORR_FOR_LAGGED:
+            eligible.append((sign * minutes, corr, label))
+
+    if not eligible:
+        return nothing
+
+    # max() yields the first maximal entry, so a tie keeps the x->y orientation.
+    applied_lag, best_corr, source = max(eligible, key=lambda entry: abs(entry[1]))
+    return True, applied_lag, best_corr, source
+
+
+def main() -> None:
+    """Generate a scatter plot for every variable pair, plus lagged variants.
+
+    Steps:
+
+    1. Load the dataset at ``CSV_PATH`` (prints a warning and returns if the
+       file does not exist).
+    2. Load the lag and lagged-correlation matrices; either may be ``None``.
+    3. For each pair returned by ``iter_variable_pairs()``, write the plain
+       scatter plot under ``OUTPUT_DIR``; when ``_should_generate_lagged``
+       accepts the pair, also write a ``*_lagged.png`` variant using the
+       selected lag.
+
+    Progress and skip reasons are printed to standard output.
+    """
+    if not CSV_PATH.exists():
+        print(f"⚠ Fichier introuvable : {CSV_PATH}")
+        return
+
+    df = load_raw_csv(CSV_PATH)
+    print(f"Dataset minuté chargé : {CSV_PATH}")
+    print(f"  Lignes   : {len(df)}")
+    print(f"  Colonnes : {list(df.columns)}")
+
+    lag_matrix = _load_lag_matrix(LAG_MATRIX_PATH)
+    if lag_matrix is not None:
+        print(f"Matrice de lags optimale chargée depuis : {LAG_MATRIX_PATH}")
+    corr_matrix = _load_corr_matrix(CORR_MATRIX_PATH)
+    if corr_matrix is not None:
+        print(f"Matrice de corrélations (lag optimal) chargée depuis : {CORR_MATRIX_PATH}")
+
+    # Lagged variants need both matrices; computed once, outside the loop.
+    lagged_inputs_available = lag_matrix is not None and corr_matrix is not None
+
+    pairs = iter_variable_pairs()
+    print(f"Nombre de paires de variables : {len(pairs)}")
+
+    for var_x, var_y in pairs:
+        filename = f"scatter_{var_x.key}_vs_{var_y.key}.png"
+        output_path = OUTPUT_DIR / filename
+
+        print(f"→ Trace {var_y.key} en fonction de {var_x.key} → {output_path}")
+        plot_scatter_pair(
+            df=df,
+            var_x=var_x,
+            var_y=var_y,
+            output_path=output_path,
+            sample_step=10,  # one point out of 10: ≈ 32k points instead of 320k
+        )
+
+        if not lagged_inputs_available:
+            # Report the real cause instead of blaming the lag or correlation
+            # value when a matrix could not be loaded in the first place.
+            print("   (pas de version décalée : matrice de lags ou de corrélations indisponible)")
+            continue
+
+        do_lagged, lag_minutes, corr_value, orientation = _should_generate_lagged(
+            lag_matrix, corr_matrix, var_x, var_y
+        )
+        # The None checks cannot trigger when do_lagged is True; they only make
+        # the formatting below (``:+d`` and abs()) safe by construction.
+        if not do_lagged or lag_minutes is None or corr_value is None or orientation is None:
+            print("   (pas de version décalée : lag <= 0 ou |r| insuffisant)")
+            continue
+
+        lagged_filename = f"scatter_{var_x.key}_vs_{var_y.key}_lagged.png"
+        lagged_output = OUTPUT_DIR / lagged_filename
+        print(
+            f"   + Version décalée ({lag_minutes:+d} min appliqué sur {var_x.key} vs {var_y.key}, "
+            f"|r|={abs(corr_value):.3f}, source {orientation}) → {lagged_output}"
+        )
+        plot_scatter_pair(
+            df=df,
+            var_x=var_x,
+            var_y=var_y,
+            output_path=lagged_output,
+            sample_step=10,
+            lag_minutes=lag_minutes,
+        )
+
+    print("✔ Tous les graphiques de nuages de points ont été générés.")
+
+
+# Run the plot generation only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()