You've already forked donnees_meteo
Exploration des relations binaires
This commit is contained in:
@@ -0,0 +1,177 @@
|
||||
# scripts/plot_all_pairwise_scatter.py
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# Directory three levels above the folder containing this script; it is put at
# the front of sys.path so that the `meteo` package imported below resolves
# when the script is launched directly.
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
    # Insert only once so repeated imports do not pile up duplicate entries.
    sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from meteo.dataset import load_raw_csv
|
||||
from meteo.variables import iter_variable_pairs
|
||||
from meteo.plots import plot_scatter_pair
|
||||
|
||||
|
||||
# Input dataset; the path is relative, so it is resolved against the current
# working directory at run time.
CSV_PATH = Path("data/weather_minutely.csv")

# Directory two levels above this script file; figures and matrices live under it.
DOC_DIR = Path(__file__).resolve().parent.parent

# Destination folder for every generated scatter plot.
OUTPUT_DIR = DOC_DIR.joinpath("figures", "pairwise_scatter")

# CSV matrices read by main(): per-pair lags (in minutes) and the matching
# lagged correlations.
LAG_MATRIX_PATH = DOC_DIR.joinpath("data", "lag_matrix_minutes.csv")
CORR_MATRIX_PATH = DOC_DIR.joinpath("data", "correlation_matrix_lagged.csv")

# Minimum absolute correlation a pair must reach before a lag-shifted plot is
# produced (see _should_generate_lagged).
MIN_ABS_CORR_FOR_LAGGED = 0.3
|
||||
|
||||
|
||||
def _load_lag_matrix(path: Path) -> pd.DataFrame | None:
    """Load the lag matrix from a CSV file.

    The first CSV column is used as the row index.

    Args:
        path: Location of the lag-matrix CSV file.

    Returns:
        The parsed matrix, or ``None`` (after printing a warning) when the
        file does not exist, has no content at all, or parses to an empty
        DataFrame.
    """
    if not path.exists():
        print(f"⚠ Aucun lag appliqué : fichier absent ({path}).")
        return None

    try:
        lag_df = pd.read_csv(path, index_col=0)
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise instead of returning an empty
        # frame; route it through the same "empty matrix" warning below.
        lag_df = pd.DataFrame()

    if lag_df.empty:
        print(f"⚠ Matrice de lags vide : {path}")
        return None
    return lag_df
|
||||
|
||||
|
||||
def _load_corr_matrix(path: Path) -> pd.DataFrame | None:
    """Load the lagged correlation matrix from a CSV file.

    The first CSV column is used as the row index.

    Args:
        path: Location of the correlation-matrix CSV file.

    Returns:
        The parsed matrix, or ``None`` (after printing a warning) when the
        file does not exist, has no content at all, or parses to an empty
        DataFrame.
    """
    if not path.exists():
        print(f"⚠ Matrice de corrélation laggée absente ({path}) : les versions décalées seront ignorées.")
        return None

    try:
        corr_df = pd.read_csv(path, index_col=0)
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise instead of returning an empty
        # frame; route it through the same "empty matrix" warning below.
        corr_df = pd.DataFrame()

    if corr_df.empty:
        print(f"⚠ Matrice de corrélation laggée vide : {path}")
        return None
    return corr_df
|
||||
|
||||
|
||||
def _get_optimal_lag(lag_df: pd.DataFrame, var_x, var_y) -> int | None:
    """Look up the lag stored for the ordered pair (var_x, var_y).

    The matrix is addressed by ``var_x.column`` (row label) and
    ``var_y.column`` (column label).

    Returns:
        The stored value rounded to the nearest integer, or ``None`` when
        either label is missing from the matrix or the cell is NaN.
    """
    try:
        cell = lag_df.loc[var_x.column, var_y.column]
    except KeyError:
        # One of the labels is not present in the matrix.
        return None

    return None if pd.isna(cell) else int(round(float(cell)))
|
||||
|
||||
|
||||
def _should_generate_lagged(
    lag_df: pd.DataFrame | None,
    corr_df: pd.DataFrame | None,
    var_x,
    var_y,
) -> tuple[bool, int | None, float | None, str | None]:
    """Decide whether a lag-shifted scatter plot is worth generating.

    Returns ``(do_generate, applied_lag_minutes, corr_value, source)`` where:

    - ``applied_lag_minutes`` applies to the pair (var_x, var_y) in that order
      (lag > 0: var_x leads var_y; lag < 0: var_y leads var_x).
    - ``source`` names the matrix orientation that was retained
      (``"x->y"`` or ``"y->x"``).

    An orientation qualifies only when:

    - its lag, rounded to an integer, is strictly positive, and
    - ``|corr| >= MIN_ABS_CORR_FOR_LAGGED``.

    Both orientations (x->y and y->x) are examined so that a positive lag
    stored only in the reverse order is still detected; the result is then
    expressed back in the original (var_x, var_y) order by negating the lag.
    When both orientations qualify, the one with the larger absolute
    correlation wins, and x->y wins a tie.
    """
    if lag_df is None or corr_df is None:
        return False, None, None, None

    # (orientation label, leading variable, following variable)
    orientations = (("x->y", var_x, var_y), ("y->x", var_y, var_x))

    best: tuple[int, float, str] | None = None
    for label, lead, follow in orientations:
        try:
            raw_lag = lag_df.loc[lead.column, follow.column]
            raw_corr = corr_df.loc[lead.column, follow.column]
        except KeyError:
            # This orientation is absent from one of the matrices.
            continue

        if pd.isna(raw_lag) or pd.isna(raw_corr):
            continue

        lag = int(round(float(raw_lag)))
        corr = float(raw_corr)
        qualifies = lag > 0 and abs(corr) >= MIN_ABS_CORR_FOR_LAGGED
        if not qualifies:
            continue

        # Strict comparison keeps the first orientation found on a tie.
        if best is None or abs(corr) > abs(best[1]):
            best = (lag, corr, label)

    if best is None:
        return False, None, None, None

    lag, corr, label = best
    # A reverse-orientation lag is negated so it reads in (var_x, var_y) order.
    signed_lag = lag if label == "x->y" else -lag
    return True, signed_lag, corr, label
|
||||
|
||||
|
||||
def main() -> None:
    """Plot every variable pair, plus a lag-shifted variant when warranted.

    Steps:
      1. Load the dataset from ``CSV_PATH``; print a warning and return if
         the file is missing.
      2. Load the lag and lagged-correlation matrices (either may be absent).
      3. For each pair yielded by ``iter_variable_pairs()``, write a scatter
         plot to ``OUTPUT_DIR``; when ``_should_generate_lagged`` accepts the
         pair, also write a ``*_lagged.png`` plot with ``lag_minutes``
         forwarded to ``plot_scatter_pair``.
    """
    if not CSV_PATH.exists():
        print(f"⚠ Fichier introuvable : {CSV_PATH}")
        return

    df = load_raw_csv(CSV_PATH)
    print(f"Dataset minuté chargé : {CSV_PATH}")
    print(f" Lignes : {len(df)}")
    print(f" Colonnes : {list(df.columns)}")

    lag_matrix = _load_lag_matrix(LAG_MATRIX_PATH)
    if lag_matrix is not None:
        print(f"Matrice de lags optimale chargée depuis : {LAG_MATRIX_PATH}")
    corr_matrix = _load_corr_matrix(CORR_MATRIX_PATH)
    if corr_matrix is not None:
        print(f"Matrice de corrélations (lag optimal) chargée depuis : {CORR_MATRIX_PATH}")

    # Materialise the pairs so len() below works whether the helper returns a
    # list or a lazy iterator.
    pairs = list(iter_variable_pairs())
    print(f"Nombre de paires de variables : {len(pairs)}")

    # Ensure the destination exists before anything is written; this is a
    # no-op if plot_scatter_pair already creates it.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for var_x, var_y in pairs:
        filename = f"scatter_{var_x.key}_vs_{var_y.key}.png"
        output_path = OUTPUT_DIR / filename

        print(f"→ Trace {var_y.key} en fonction de {var_x.key} → {output_path}")
        plot_scatter_pair(
            df=df,
            var_x=var_x,
            var_y=var_y,
            output_path=output_path,
            sample_step=10,  # one point in 10: ≈ 32k points instead of 320k
        )

        do_lagged, lag_minutes, corr_value, orientation = _should_generate_lagged(
            lag_matrix, corr_matrix, var_x, var_y
        )
        # corr_value is checked too because abs() is applied to it below.
        if not do_lagged or lag_minutes is None or corr_value is None or orientation is None:
            print(" (pas de version décalée : lag <= 0 ou |r| insuffisant)")
            continue

        lagged_filename = f"scatter_{var_x.key}_vs_{var_y.key}_lagged.png"
        lagged_output = OUTPUT_DIR / lagged_filename
        print(
            f" + Version décalée ({lag_minutes:+d} min appliqué sur {var_x.key} vs {var_y.key}, "
            f"|r|={abs(corr_value):.3f}, source {orientation}) → {lagged_output}"
        )
        plot_scatter_pair(
            df=df,
            var_x=var_x,
            var_y=var_y,
            output_path=lagged_output,
            sample_step=10,
            lag_minutes=lag_minutes,
        )

    print("✔ Tous les graphiques de nuages de points ont été générés.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user