You've already forked donnees_meteo
Affiner les heatmaps de corrélation et l'annotation des lags
This commit is contained in:
@@ -0,0 +1,191 @@
|
||||
# scripts/plot_lagged_correlation_heatmap_from_data.py
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
import pandas as pd
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[3]
|
||||
if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from meteo.variables import VARIABLES, VARIABLES_BY_KEY
|
||||
from meteo.plots import plot_correlation_heatmap
|
||||
|
||||
|
||||
DATA_DIR = Path("docs/05 - Corrélations binaires avancées/data/lagged_correlations")
|
||||
FIG_DIR = Path("docs/05 - Corrélations binaires avancées/figures")
|
||||
DATA_OUTPUT_DIR = Path("docs/05 - Corrélations binaires avancées/data")
|
||||
|
||||
|
||||
def _load_best_corr_and_lag(csv_path: Path) -> tuple[float, int, bool] | None:
|
||||
"""Retourne (corr, lag, sign_known) au max |r| à partir d'un CSV de lagcorr."""
|
||||
|
||||
df = pd.read_csv(csv_path)
|
||||
|
||||
if "Pearson" in df.columns:
|
||||
series = df["Pearson"]
|
||||
sign_known = True
|
||||
elif "correlation" in df.columns:
|
||||
series = df["correlation"]
|
||||
sign_known = True
|
||||
elif "Pearson |r|" in df.columns:
|
||||
series = df["Pearson |r|"]
|
||||
sign_known = False
|
||||
elif "Pearson (r²)" in df.columns:
|
||||
series = (df["Pearson (r²)"].abs()) ** 0.5
|
||||
sign_known = False
|
||||
else:
|
||||
return None
|
||||
|
||||
abs_series = series.abs()
|
||||
if abs_series.empty or abs_series.isna().all():
|
||||
return None
|
||||
|
||||
idx = abs_series.idxmax()
|
||||
best_corr = series.iloc[idx]
|
||||
best_lag = int(df.loc[idx, "lag_minutes"])
|
||||
# Si pas de signe, on retourne un corr positif (le signe pourra être posé via l'inverse)
|
||||
if not sign_known:
|
||||
best_corr = abs(best_corr)
|
||||
return best_corr, best_lag, sign_known
|
||||
|
||||
|
||||
def _get_pair_best(
    vx_key: str,
    vy_key: str,
    *,
    data_dir: Path,
) -> tuple[float, int, bool, str] | None:
    """
    Return ``(corr, lag, sign_known, source)`` for the ordered pair vx->vy.

    If the direct CSV does not carry the sign (|r| or r² columns), try to
    infer it from the reverse CSV (vy->vx), negating the lag.  Returns
    ``None`` when neither CSV yields a usable correlation.
    """
    primary = data_dir / f"lagcorr_{vx_key}_to_{vy_key}.csv"
    reverse = data_dir / f"lagcorr_{vy_key}_to_{vx_key}.csv"

    primary_res = _load_best_corr_and_lag(primary) if primary.exists() else None
    if primary_res and primary_res[2]:
        # Signed primary CSV: nothing to infer.
        corr, lag, sign_known = primary_res
        return corr, lag, sign_known, primary.name

    # Try to infer the missing sign from the reverse-direction CSV.
    reverse_res = _load_best_corr_and_lag(reverse) if reverse.exists() else None
    if primary_res and reverse_res and reverse_res[2]:
        # CLEANUP: primary_res[2] is necessarily False here (the signed
        # case returned above), so the original's ternary on it was dead.
        # Trust the primary magnitude; take sign and lag from the reverse
        # CSV, flipping the lag direction.
        primary_corr, _, _ = primary_res
        rev_corr, rev_lag, _ = reverse_res
        corr = abs(primary_corr) * (1 if rev_corr >= 0 else -1)
        lag = -rev_lag
        return corr, lag, True, f"{primary.name} (signe/lag inférés depuis {reverse.name})"

    if primary_res:
        # Unsigned primary and no signed reverse: report it as-is.
        # (primary_res non-None implies the file existed, so the original's
        # `if primary.exists() else "n/a"` fallback was unreachable.)
        corr, lag, sign_known = primary_res
        return corr, lag, sign_known, primary.name

    if reverse_res:
        # Only the reverse CSV exists: correlation is symmetric under lag
        # negation, so reuse its value with the lag sign flipped.
        rev_corr, rev_lag, sign_known = reverse_res
        corr = rev_corr
        lag = -rev_lag
        return corr, lag, sign_known, reverse.name

    return None
|
||||
|
||||
|
||||
def main() -> None:
    """Build corr/lag matrices from existing lagcorr_* CSVs and plot the heatmap.

    Reads one CSV per ordered variable pair from ``--data-dir``, keeps the
    correlation at the lag of max |r|, exports both matrices as CSV and
    renders an annotated signed heatmap.
    """
    parser = argparse.ArgumentParser(
        description="Construit des matrices depuis les CSV lagcorr_* existants (max |r| par paire ordonnée)."
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=DATA_DIR,
        help="Dossier contenant les CSV lagcorr_*_to_*.csv.",
    )
    parser.add_argument(
        "--fig-dir",
        type=Path,
        default=FIG_DIR,
        help="Dossier de sortie pour la heatmap.",
    )
    parser.add_argument(
        "--data-output-dir",
        type=Path,
        default=DATA_OUTPUT_DIR,
        help="Dossier de sortie pour les matrices CSV exportées.",
    )
    parser.add_argument(
        "--annot-threshold",
        type=float,
        default=0.3,
        help="N'affiche le lag annoté que si |r| >= ce seuil (0 pour tout afficher).",
    )
    args = parser.parse_args()

    args.fig_dir.mkdir(parents=True, exist_ok=True)
    args.data_output_dir.mkdir(parents=True, exist_ok=True)

    columns = [v.column for v in VARIABLES]
    corr_matrix = pd.DataFrame(index=columns, columns=columns, dtype=float)
    # BUG FIX: an empty frame cannot actually hold dtype=int (the NaN fill
    # forces a fallback dtype); initialise with zeros so the lag matrix is
    # genuinely integer-typed (0 is also the documented "no data" value).
    lag_matrix = pd.DataFrame(0, index=columns, columns=columns, dtype=int)

    missing_files: list[str] = []
    sign_unknown: list[str] = []

    for vx in VARIABLES:
        for vy in VARIABLES:
            if vx == vy:
                # Diagonal: perfect self-correlation at zero lag.
                corr_matrix.loc[vx.column, vy.column] = 1.0
                lag_matrix.loc[vx.column, vy.column] = 0
                continue
            res = _get_pair_best(vx.key, vy.key, data_dir=args.data_dir)
            if res is None:
                # No usable CSV in either direction: NaN corr, neutral lag.
                missing_files.append(f"{vx.key}→{vy.key}")
                corr_matrix.loc[vx.column, vy.column] = float("nan")
                lag_matrix.loc[vx.column, vy.column] = 0
                continue

            best_corr, best_lag, sign_known, source = res
            if not sign_known:
                sign_unknown.append(f"{vx.key}→{vy.key} (source={source})")
            corr_matrix.loc[vx.column, vy.column] = best_corr
            lag_matrix.loc[vx.column, vy.column] = best_lag

    if missing_files:
        print("⚠ CSV manquants pour certaines paires :", ", ".join(missing_files))
    if sign_unknown:
        print("⚠ Signe inconnu (CSV en |r| ou r²) pour :", ", ".join(sign_unknown))

    # Signed heatmap (values stay positive when the CSVs lack the sign).
    output_path = args.fig_dir / "correlation_heatmap_lagged.png"
    annot_df = lag_matrix.copy()
    if args.annot_threshold > 0:
        # Blank out lag annotations for pairs below the |r| threshold.
        mask = corr_matrix.abs() < args.annot_threshold
        annot_df = annot_df.mask(mask, "")
    plot_correlation_heatmap(
        corr=corr_matrix,
        variables=VARIABLES,
        output_path=output_path,
        annotate=True,
        annotate_values=annot_df,
        title="Corrélations (lag optimal par paire, issues des CSV)",
        cmap="coolwarm",
        vmin=-1.0,
        vmax=1.0,
        colorbar_label="Coefficient de corrélation r",
    )
    print(f"✔ Heatmap laggée sauvegardée dans : {output_path}")

    corr_csv = args.data_output_dir / "correlation_matrix_lagged.csv"
    corr_matrix.to_csv(corr_csv)
    print(f"✔ Matrice des corrélations exportée : {corr_csv}")

    lag_csv = args.data_output_dir / "lag_matrix_minutes.csv"
    lag_matrix.to_csv(lag_csv)
    print(f"✔ Matrice des lags exportée : {lag_csv}")


if __name__ == "__main__":
    main()
|
||||
@@ -4,16 +4,21 @@ from __future__ import annotations
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import argparse
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[3]
|
||||
if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from meteo.dataset import load_raw_csv
|
||||
from meteo.variables import VARIABLES_BY_KEY
|
||||
from meteo.variables import VARIABLES, VARIABLES_BY_KEY
|
||||
from meteo.analysis import compute_lagged_correlation
|
||||
from meteo.plots import plot_lagged_correlation
|
||||
from meteo.correlation_presets import DEFAULT_LAGGED_PAIRS
|
||||
from meteo.plots import plot_lagged_correlation_multi
|
||||
from meteo.correlation_presets import (
|
||||
DEFAULT_ABS_CORRELATION_BANDS,
|
||||
DEFAULT_SIGNED_CORRELATION_BANDS,
|
||||
CorrelationBand,
|
||||
)
|
||||
|
||||
|
||||
CSV_PATH = Path("data/weather_minutely.csv")
|
||||
@@ -22,39 +27,143 @@ OUTPUT_DIR = DOC_DIR / "figures" / "lagged_correlations"
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Trace les corrélations décalées pour toutes les paires de variables.")
|
||||
parser.add_argument(
|
||||
"--max-lag",
|
||||
type=int,
|
||||
default=720,
|
||||
help="Décalage maximal en minutes (par défaut : 720 = ±12h).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--step",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Pas en minutes pour l'évaluation des lags (par défaut : 10).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resample",
|
||||
default="none",
|
||||
help="Fréquence d'agrégation avant calcul (par défaut : 'none') Exemple : '10min'.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--signed",
|
||||
action="store_true",
|
||||
default=True,
|
||||
help="Affiche les corrélations signées. Utiliser --no-signed pour revenir aux valeurs absolues |r|.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-signed",
|
||||
action="store_false",
|
||||
dest="signed",
|
||||
help="Alias explicite pour repasser en valeurs absolues |r|.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--thresholds",
|
||||
default=None,
|
||||
help=(
|
||||
"Seuils personnalisés (valeurs séparées par des virgules). "
|
||||
"Par défaut, utilise les bandes définies dans meteo.correlation_presets "
|
||||
"(abs ou signées selon --signed)."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--only",
|
||||
nargs="*",
|
||||
help="Clés de variables à inclure (par défaut toutes les variables numériques).",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not CSV_PATH.exists():
|
||||
print(f"⚠ Fichier introuvable : {CSV_PATH}")
|
||||
return
|
||||
|
||||
df = load_raw_csv(CSV_PATH)
|
||||
df = df.select_dtypes(include="number")
|
||||
if args.resample and args.resample.lower() != "none":
|
||||
df = df.resample(args.resample).mean()
|
||||
print(f"Dataset rééchantillonné à {args.resample} pour accélérer le calcul.")
|
||||
print(f"Dataset minuté chargé : {CSV_PATH}")
|
||||
print(f" Lignes : {len(df)}")
|
||||
print(f" Colonnes : {list(df.columns)}")
|
||||
print()
|
||||
|
||||
for key_x, key_y in DEFAULT_LAGGED_PAIRS:
|
||||
var_x = VARIABLES_BY_KEY[key_x]
|
||||
var_y = VARIABLES_BY_KEY[key_y]
|
||||
if args.only:
|
||||
missing = [k for k in args.only if k not in VARIABLES_BY_KEY]
|
||||
if missing:
|
||||
raise KeyError(f"Variables inconnues : {', '.join(missing)}")
|
||||
variables = [VARIABLES_BY_KEY[k] for k in args.only]
|
||||
else:
|
||||
variables = list(VARIABLES)
|
||||
pairs = [(vx, vy) for i, vx in enumerate(variables) for vy in variables[i + 1 :]]
|
||||
print(f"Paires analysées : {len(pairs)} (combinaisons uniques, sans inverses).")
|
||||
|
||||
for var_x, var_y in pairs:
|
||||
print(f"→ Corrélation décalée : {var_x.key} → {var_y.key}")
|
||||
|
||||
lag_df = compute_lagged_correlation(
|
||||
lag_df_pearson = compute_lagged_correlation(
|
||||
df=df,
|
||||
var_x=var_x,
|
||||
var_y=var_y,
|
||||
max_lag_minutes=360, # ± 6 heures
|
||||
step_minutes=10, # pas de 10 minutes
|
||||
max_lag_minutes=args.max_lag,
|
||||
step_minutes=args.step,
|
||||
method="pearson",
|
||||
)
|
||||
)["correlation"]
|
||||
|
||||
lag_df_spearman = compute_lagged_correlation(
|
||||
df=df,
|
||||
var_x=var_x,
|
||||
var_y=var_y,
|
||||
max_lag_minutes=args.max_lag,
|
||||
step_minutes=args.step,
|
||||
method="spearman",
|
||||
)["correlation"]
|
||||
|
||||
use_abs = not args.signed
|
||||
if use_abs:
|
||||
lag_df_pearson = lag_df_pearson.abs()
|
||||
lag_df_spearman = lag_df_spearman.abs()
|
||||
ylabel = "Corrélation (|r|)"
|
||||
labels = {"Pearson": "Pearson |r|", "Spearman": "Spearman |r|"}
|
||||
y_limits = (0.0, 1.0)
|
||||
threshold_values = (
|
||||
[
|
||||
float(t)
|
||||
for t in (args.thresholds or "").split(",")
|
||||
if t.strip() != ""
|
||||
]
|
||||
if args.thresholds
|
||||
else []
|
||||
)
|
||||
bands = list(DEFAULT_ABS_CORRELATION_BANDS)
|
||||
else:
|
||||
ylabel = "Corrélation"
|
||||
labels = {"Pearson": "Pearson", "Spearman": "Spearman"}
|
||||
y_limits = (-1.0, 1.0)
|
||||
threshold_values: list[float] = []
|
||||
if args.thresholds:
|
||||
threshold_values = [
|
||||
float(t)
|
||||
for t in args.thresholds.split(",")
|
||||
if t.strip() != ""
|
||||
]
|
||||
threshold_values = sorted({v for thr in threshold_values for v in (thr, -thr)})
|
||||
bands = list(DEFAULT_SIGNED_CORRELATION_BANDS)
|
||||
|
||||
filename = f"lagcorr_{var_x.key}_to_{var_y.key}.png"
|
||||
output_path = OUTPUT_DIR / filename
|
||||
|
||||
plot_lagged_correlation(
|
||||
lag_df=lag_df,
|
||||
plot_lagged_correlation_multi(
|
||||
lag_series={
|
||||
labels["Pearson"]: lag_df_pearson,
|
||||
labels["Spearman"]: lag_df_spearman,
|
||||
},
|
||||
var_x=var_x,
|
||||
var_y=var_y,
|
||||
output_path=output_path,
|
||||
ylabel=ylabel,
|
||||
y_limits=y_limits,
|
||||
thresholds=threshold_values,
|
||||
bands=bands,
|
||||
)
|
||||
|
||||
print("✔ Graphiques de corrélation décalée générés.")
|
||||
|
||||
Reference in New Issue
Block a user