
Refine the correlation heatmaps and lag annotation

2025-11-21 01:46:06 +01:00
parent a36157b52f
commit 2ff719107b
11 changed files with 599 additions and 36 deletions


@@ -0,0 +1,191 @@
# scripts/plot_lagged_correlation_heatmap_from_data.py
from __future__ import annotations
from pathlib import Path
import sys
import argparse
import pandas as pd
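# Make the project root importable when this script is run directly.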
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from meteo.variables import VARIABLES, VARIABLES_BY_KEY
from meteo.plots import plot_correlation_heatmap
DATA_DIR = Path("docs/05 - Corrélations binaires avancées/data/lagged_correlations")
FIG_DIR = Path("docs/05 - Corrélations binaires avancées/figures")
DATA_OUTPUT_DIR = Path("docs/05 - Corrélations binaires avancées/data")
def _load_best_corr_and_lag(csv_path: Path) -> tuple[float, int, bool] | None:
"""Retourne (corr, lag, sign_known) au max |r| à partir d'un CSV de lagcorr."""
df = pd.read_csv(csv_path)
if "Pearson" in df.columns:
series = df["Pearson"]
sign_known = True
elif "correlation" in df.columns:
series = df["correlation"]
sign_known = True
elif "Pearson |r|" in df.columns:
series = df["Pearson |r|"]
sign_known = False
elif "Pearson (r²)" in df.columns:
series = (df["Pearson (r²)"].abs()) ** 0.5
sign_known = False
else:
return None
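# Pick the lag where the absolute correlation peaks; keep the signed value when the column provides it.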
abs_series = series.abs()
if abs_series.empty or abs_series.isna().all():
return None
idx = abs_series.idxmax()
best_corr = series.loc[idx]  # idxmax() returns an index label; use .loc, consistent with the lookup below
best_lag = int(df.loc[idx, "lag_minutes"])
# Without sign information, return a positive corr (the sign can later be set from the reverse CSV)
if not sign_known:
best_corr = abs(best_corr)
return best_corr, best_lag, sign_known
def _get_pair_best(
vx_key: str,
vy_key: str,
*,
data_dir: Path,
) -> tuple[float, int, bool, str] | None:
"""
Return (corr, lag, sign_known, source) for the ordered pair vx->vy.
If the direct CSV does not carry the sign (|r| or r²), try to infer it
from the reverse CSV (vy->vx), negating the lag.
"""
primary = data_dir / f"lagcorr_{vx_key}_to_{vy_key}.csv"
reverse = data_dir / f"lagcorr_{vy_key}_to_{vx_key}.csv"
primary_res = _load_best_corr_and_lag(primary) if primary.exists() else None
if primary_res and primary_res[2]:
corr, lag, sign_known = primary_res
return corr, lag, sign_known, primary.name
# Attempt to infer the sign from the reverse CSV
reverse_res = _load_best_corr_and_lag(reverse) if reverse.exists() else None
if primary_res and reverse_res and reverse_res[2]:
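# The direct CSV lacks a sign here (otherwise we would have returned above): borrow the sign from the reverse series and negate its lag.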
primary_corr, _, _ = primary_res
rev_corr, rev_lag, _ = reverse_res
corr = primary_corr if primary_res[2] else abs(primary_corr) * (1 if rev_corr >= 0 else -1)
lag = -rev_lag
return corr, lag, True, f"{primary.name} (signe/lag inférés depuis {reverse.name})"
if primary_res:
corr, lag, sign_known = primary_res
return corr, lag, sign_known, primary.name if primary.exists() else "n/a"
if reverse_res:
rev_corr, rev_lag, sign_known = reverse_res
corr = rev_corr
lag = -rev_lag
return corr, lag, sign_known, reverse.name
return None
def main() -> None:
parser = argparse.ArgumentParser(
description="Construit des matrices depuis les CSV lagcorr_* existants (max |r| par paire ordonnée)."
)
parser.add_argument(
"--data-dir",
type=Path,
default=DATA_DIR,
help="Dossier contenant les CSV lagcorr_*_to_*.csv.",
)
parser.add_argument(
"--fig-dir",
type=Path,
default=FIG_DIR,
help="Dossier de sortie pour la heatmap.",
)
parser.add_argument(
"--data-output-dir",
type=Path,
default=DATA_OUTPUT_DIR,
help="Dossier de sortie pour les matrices CSV exportées.",
)
parser.add_argument(
"--annot-threshold",
type=float,
default=0.3,
help="N'affiche le lag annoté que si |r| >= ce seuil (0 pour tout afficher).",
)
args = parser.parse_args()
args.fig_dir.mkdir(parents=True, exist_ok=True)
args.data_output_dir.mkdir(parents=True, exist_ok=True)
columns = [v.column for v in VARIABLES]
corr_matrix = pd.DataFrame(index=columns, columns=columns, dtype=float)
lag_matrix = pd.DataFrame(index=columns, columns=columns, dtype=int)
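# corr_matrix holds the best correlation for each ordered pair; lag_matrix holds the lag (in minutes) at which it occurs.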
missing_files: list[str] = []
sign_unknown: list[str] = []
for vx in VARIABLES:
for vy in VARIABLES:
if vx == vy:
corr_matrix.loc[vx.column, vy.column] = 1.0
lag_matrix.loc[vx.column, vy.column] = 0
continue
res = _get_pair_best(vx.key, vy.key, data_dir=args.data_dir)
if res is None:
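# No usable CSV for this pair: leave the correlation as NaN so the heatmap shows a gap rather than a misleading zero.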
missing_files.append(f"{vx.key} → {vy.key}")
corr_matrix.loc[vx.column, vy.column] = float("nan")
lag_matrix.loc[vx.column, vy.column] = 0
continue
best_corr, best_lag, sign_known, source = res
if not sign_known:
sign_unknown.append(f"{vx.key} → {vy.key} (source={source})")
corr_matrix.loc[vx.column, vy.column] = best_corr
lag_matrix.loc[vx.column, vy.column] = best_lag
if missing_files:
print("⚠ CSV manquants pour certaines paires :", ", ".join(missing_files))
if sign_unknown:
print("⚠ Signe inconnu (CSV en |r| ou r²) pour :", ", ".join(sign_unknown))
# Signed heatmap (values stay positive when the sign is missing from the CSVs)
output_path = args.fig_dir / "correlation_heatmap_lagged.png"
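# Annotate each cell with its optimal lag (in minutes), blanking the annotation where |r| is below the threshold to keep the figure readable.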
annot_df = lag_matrix.copy()
if args.annot_threshold > 0:
mask = corr_matrix.abs() < args.annot_threshold
annot_df = annot_df.mask(mask, "")
plot_correlation_heatmap(
corr=corr_matrix,
variables=VARIABLES,
output_path=output_path,
annotate=True,
annotate_values=annot_df,
title="Corrélations (lag optimal par paire, issues des CSV)",
cmap="coolwarm",
vmin=-1.0,
vmax=1.0,
colorbar_label="Coefficient de corrélation r",
)
print(f"✔ Heatmap laggée sauvegardée dans : {output_path}")
corr_csv = args.data_output_dir / "correlation_matrix_lagged.csv"
corr_matrix.to_csv(corr_csv)
print(f"✔ Matrice des corrélations exportée : {corr_csv}")
lag_csv = args.data_output_dir / "lag_matrix_minutes.csv"
lag_matrix.to_csv(lag_csv)
print(f"✔ Matrice des lags exportée : {lag_csv}")
if __name__ == "__main__":
main()


@@ -4,16 +4,21 @@ from __future__ import annotations
from pathlib import Path
import sys
import argparse
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from meteo.dataset import load_raw_csv
from meteo.variables import VARIABLES_BY_KEY
from meteo.variables import VARIABLES, VARIABLES_BY_KEY
from meteo.analysis import compute_lagged_correlation
from meteo.plots import plot_lagged_correlation
from meteo.correlation_presets import DEFAULT_LAGGED_PAIRS
from meteo.plots import plot_lagged_correlation_multi
from meteo.correlation_presets import (
DEFAULT_ABS_CORRELATION_BANDS,
DEFAULT_SIGNED_CORRELATION_BANDS,
CorrelationBand,
)
CSV_PATH = Path("data/weather_minutely.csv")
@@ -22,39 +27,143 @@ OUTPUT_DIR = DOC_DIR / "figures" / "lagged_correlations"
def main() -> None:
parser = argparse.ArgumentParser(description="Trace les corrélations décalées pour toutes les paires de variables.")
parser.add_argument(
"--max-lag",
type=int,
default=720,
help="Décalage maximal en minutes (par défaut : 720 = ±12h).",
)
parser.add_argument(
"--step",
type=int,
default=10,
help="Pas en minutes pour l'évaluation des lags (par défaut : 10).",
)
parser.add_argument(
"--resample",
default="none",
help="Fréquence d'agrégation avant calcul (par défaut : 'none') Exemple : '10min'.",
)
parser.add_argument(
"--signed",
action="store_true",
default=True,
help="Affiche les corrélations signées. Utiliser --no-signed pour revenir aux valeurs absolues |r|.",
)
parser.add_argument(
"--no-signed",
action="store_false",
dest="signed",
help="Alias explicite pour repasser en valeurs absolues |r|.",
)
parser.add_argument(
"--thresholds",
default=None,
help=(
"Seuils personnalisés (valeurs séparées par des virgules). "
"Par défaut, utilise les bandes définies dans meteo.correlation_presets "
"(abs ou signées selon --signed)."
),
)
parser.add_argument(
"--only",
nargs="*",
help="Clés de variables à inclure (par défaut toutes les variables numériques).",
)
args = parser.parse_args()
if not CSV_PATH.exists():
print(f"⚠ Fichier introuvable : {CSV_PATH}")
return
df = load_raw_csv(CSV_PATH)
df = df.select_dtypes(include="number")
if args.resample and args.resample.lower() != "none":
df = df.resample(args.resample).mean()
print(f"Dataset rééchantillonné à {args.resample} pour accélérer le calcul.")
print(f"Dataset minuté chargé : {CSV_PATH}")
print(f" Lignes : {len(df)}")
print(f" Colonnes : {list(df.columns)}")
print()
for key_x, key_y in DEFAULT_LAGGED_PAIRS:
var_x = VARIABLES_BY_KEY[key_x]
var_y = VARIABLES_BY_KEY[key_y]
if args.only:
missing = [k for k in args.only if k not in VARIABLES_BY_KEY]
if missing:
raise KeyError(f"Variables inconnues : {', '.join(missing)}")
variables = [VARIABLES_BY_KEY[k] for k in args.only]
else:
variables = list(VARIABLES)
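# Unique unordered pairs only: the reverse direction is redundant because the lag range is symmetric (±max_lag).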
pairs = [(vx, vy) for i, vx in enumerate(variables) for vy in variables[i + 1 :]]
print(f"Paires analysées : {len(pairs)} (combinaisons uniques, sans inverses).")
for var_x, var_y in pairs:
print(f"→ Corrélation décalée : {var_x.key}{var_y.key}")
lag_df = compute_lagged_correlation(
lag_df_pearson = compute_lagged_correlation(
df=df,
var_x=var_x,
var_y=var_y,
max_lag_minutes=360,  # ± 6 hours
step_minutes=10,  # 10-minute step
max_lag_minutes=args.max_lag,
step_minutes=args.step,
method="pearson",
)
)["correlation"]
lag_df_spearman = compute_lagged_correlation(
df=df,
var_x=var_x,
var_y=var_y,
max_lag_minutes=args.max_lag,
step_minutes=args.step,
method="spearman",
)["correlation"]
use_abs = not args.signed
if use_abs:
lag_df_pearson = lag_df_pearson.abs()
lag_df_spearman = lag_df_spearman.abs()
ylabel = "Corrélation (|r|)"
labels = {"Pearson": "Pearson |r|", "Spearman": "Spearman |r|"}
y_limits = (0.0, 1.0)
threshold_values = (
[
float(t)
for t in (args.thresholds or "").split(",")
if t.strip() != ""
]
if args.thresholds
else []
)
bands = list(DEFAULT_ABS_CORRELATION_BANDS)
else:
ylabel = "Corrélation"
labels = {"Pearson": "Pearson", "Spearman": "Spearman"}
y_limits = (-1.0, 1.0)
threshold_values: list[float] = []
if args.thresholds:
threshold_values = [
float(t)
for t in args.thresholds.split(",")
if t.strip() != ""
]
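# Mirror each user-supplied threshold around zero so guide lines appear on both sides of the axis.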
threshold_values = sorted({v for thr in threshold_values for v in (thr, -thr)})
bands = list(DEFAULT_SIGNED_CORRELATION_BANDS)
filename = f"lagcorr_{var_x.key}_to_{var_y.key}.png"
output_path = OUTPUT_DIR / filename
plot_lagged_correlation(
lag_df=lag_df,
plot_lagged_correlation_multi(
lag_series={
labels["Pearson"]: lag_df_pearson,
labels["Spearman"]: lag_df_spearman,
},
var_x=var_x,
var_y=var_y,
output_path=output_path,
ylabel=ylabel,
y_limits=y_limits,
thresholds=threshold_values,
bands=bands,
)
print("✔ Graphiques de corrélation décalée générés.")