diff --git a/figures/event_composites/rain_event_composites.png b/figures/event_composites/rain_event_composites.png
new file mode 100644
index 0000000..6d3fe7a
Binary files /dev/null and b/figures/event_composites/rain_event_composites.png differ
diff --git a/figures/hexbin_explorations/hexbin_lux_humidity_color_temp.png b/figures/hexbin_explorations/hexbin_lux_humidity_color_temp.png
new file mode 100644
index 0000000..45805e0
Binary files /dev/null and b/figures/hexbin_explorations/hexbin_lux_humidity_color_temp.png differ
diff --git a/figures/hexbin_explorations/hexbin_pressure_rain_color_wind.png b/figures/hexbin_explorations/hexbin_pressure_rain_color_wind.png
new file mode 100644
index 0000000..d2c9196
Binary files /dev/null and b/figures/hexbin_explorations/hexbin_pressure_rain_color_wind.png differ
diff --git a/figures/hexbin_explorations/hexbin_temp_humidity_color_rain.png b/figures/hexbin_explorations/hexbin_temp_humidity_color_rain.png
new file mode 100644
index 0000000..24e13aa
Binary files /dev/null and b/figures/hexbin_explorations/hexbin_temp_humidity_color_rain.png differ
diff --git a/figures/rolling_correlations/rolling_correlation_heatmap.png b/figures/rolling_correlations/rolling_correlation_heatmap.png
new file mode 100644
index 0000000..08fb4ce
Binary files /dev/null and b/figures/rolling_correlations/rolling_correlation_heatmap.png differ
diff --git a/meteo/analysis.py b/meteo/analysis.py
index 72afa52..e667142 100644
--- a/meteo/analysis.py
+++ b/meteo/analysis.py
@@ -1,7 +1,7 @@
 # meteo/analysis.py
 from __future__ import annotations
 
-from typing import Literal
+from typing import Literal, Sequence
 
 import numpy as np
 import pandas as pd
@@ -115,3 +115,199 @@ def compute_lagged_correlation(
     lag_df = lag_df.set_index("lag_minutes")
 
     return lag_df
+
+
+def _ensure_datetime_index(df: pd.DataFrame) -> pd.DatetimeIndex:
+    if not isinstance(df.index, pd.DatetimeIndex):
+        raise TypeError("This function requires a time-indexed DataFrame.")
+    return df.index
+
+
+def compute_rolling_correlation_series(
+    df: pd.DataFrame,
+    var_x: Variable,
+    var_y: Variable,
+    *,
+    window_minutes: int,
+    min_valid_fraction: float = 0.6,
+    step_minutes: int | None = None,
+    method: Literal["pearson", "spearman"] = "pearson",
+) -> pd.Series:
+    """
+    Compute the rolling X/Y correlation over a time-based window.
+    Returns a series indexed by the window end time. Only 'pearson' is
+    implemented for now; 'spearman' is reserved in the signature.
+    """
+    if not 0 < min_valid_fraction <= 1:
+        raise ValueError("min_valid_fraction must lie in the interval (0, 1].")
+
+    if method != "pearson":
+        raise NotImplementedError(
+            "Rolling correlations currently support only the 'pearson' method."
+        )
+
+    for col in (var_x.column, var_y.column):
+        if col not in df.columns:
+            raise KeyError(f"Column missing from DataFrame: {col}")
+
+    _ensure_datetime_index(df)
+    pair = df[[var_x.column, var_y.column]].dropna().sort_index()
+
+    if pair.empty:
+        return pd.Series(dtype=float, name=f"{var_x.key}→{var_y.key}")
+
+    window = f"{window_minutes}min"
+    # min_periods assumes a ~1-minute sampling step, as in the raw dataset.
+    min_periods = max(1, int(window_minutes * min_valid_fraction))
+
+    rolling_corr = pair[var_x.column].rolling(
+        window=window,
+        min_periods=min_periods,
+    ).corr(pair[var_y.column])
+
+    rolling_corr = rolling_corr.dropna()
+    rolling_corr.name = f"{var_x.key}→{var_y.key}"
+
+    if step_minutes and step_minutes > 1:
+        rolling_corr = rolling_corr.resample(f"{step_minutes}min").mean().dropna()
+
+    return rolling_corr
+
+
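# Reviewer sketch (not part of the patch): minimal use of the new rolling
# correlation helper, mirroring the CSV path and variable keys used by the
# scripts further down. Illustrative only.
from pathlib import Path

from meteo.dataset import load_raw_csv
from meteo.variables import VARIABLES_BY_KEY
from meteo.analysis import compute_rolling_correlation_series

df = load_raw_csv(Path("data/weather_minutely.csv"))
corr = compute_rolling_correlation_series(
    df,
    VARIABLES_BY_KEY["temperature"],
    VARIABLES_BY_KEY["humidity"],
    window_minutes=180,
    min_valid_fraction=0.7,
    step_minutes=30,
)
print(corr.describe())  # one correlation value per 30-minute step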
+def compute_rolling_correlations_for_pairs(
+    df: pd.DataFrame,
+    pairs: Sequence[tuple[Variable, Variable]],
+    *,
+    window_minutes: int,
+    min_valid_fraction: float = 0.6,
+    step_minutes: int | None = None,
+    method: Literal["pearson", "spearman"] = "pearson",
+) -> pd.DataFrame:
+    """
+    Compute rolling correlations for several pairs and align the results
+    in one DataFrame (time index, one 'x→y' column per pair).
+    """
+    series_list: list[pd.Series] = []
+    for var_x, var_y in pairs:
+        corr = compute_rolling_correlation_series(
+            df=df,
+            var_x=var_x,
+            var_y=var_y,
+            window_minutes=window_minutes,
+            min_valid_fraction=min_valid_fraction,
+            step_minutes=step_minutes,
+            method=method,
+        )
+        if not corr.empty:
+            series_list.append(corr)
+
+    if not series_list:
+        return pd.DataFrame()
+
+    result = pd.concat(series_list, axis=1)
+    result = result.sort_index()
+    return result
+
+
+def _infer_time_step(index: pd.DatetimeIndex) -> pd.Timedelta:
+    diffs = index.to_series().diff().dropna()
+    if diffs.empty:
+        # Degenerate single-point index: fall back to the nominal 1-minute step.
+        return pd.Timedelta(minutes=1)
+    return diffs.median()
+
+
+def detect_threshold_events(
+    series: pd.Series,
+    *,
+    threshold: float,
+    min_duration: pd.Timedelta,
+    min_gap: pd.Timedelta,
+) -> list[tuple[pd.Timestamp, pd.Timestamp]]:
+    """
+    Detect events where `series > threshold` (NaN counts as False) lasting
+    at least `min_duration`. Events separated by a gap shorter than
+    `min_gap` are merged into one.
+    """
+    if not isinstance(series.index, pd.DatetimeIndex):
+        raise TypeError("series must have a DatetimeIndex.")
+
+    mask = (series > threshold).fillna(False)
+    if not mask.any():
+        return []
+
+    # Label each run of consecutive True/False values with its own id.
+    groups = (mask != mask.shift()).cumsum()
+    time_step = _infer_time_step(series.index)
+    raw_events: list[tuple[pd.Timestamp, pd.Timestamp]] = []
+
+    for _group_id, group_mask in mask.groupby(groups):
+        if not group_mask.iloc[0]:
+            continue
+        start = group_mask.index[0]
+        end = group_mask.index[-1] + time_step
+        duration = end - start
+        if duration >= min_duration:
+            raw_events.append((start, end))
+
+    if not raw_events:
+        return []
+
+    merged: list[tuple[pd.Timestamp, pd.Timestamp]] = []
+    for start, end in raw_events:
+        if not merged:
+            merged.append((start, end))
+            continue
+
+        prev_start, prev_end = merged[-1]
+        if start - prev_end < min_gap:
+            merged[-1] = (prev_start, max(prev_end, end))
+        else:
+            merged.append((start, end))
+
+    return merged
+
+
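# Reviewer sketch (not part of the patch): sanity check of the event
# detector on synthetic data, showing that two bursts separated by less
# than min_gap are merged into a single event.
import pandas as pd

from meteo.analysis import detect_threshold_events

index = pd.date_range("2024-01-01", periods=60, freq="1min")
rain = pd.Series(0.0, index=index)
rain.iloc[10:20] = 1.0  # first burst: 10 minutes
rain.iloc[25:32] = 0.8  # second burst, only 5 quiet minutes later

events = detect_threshold_events(
    rain,
    threshold=0.2,
    min_duration=pd.Timedelta(minutes=5),
    min_gap=pd.Timedelta(minutes=20),
)
print(events)  # a single merged (start, end) tuple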
+ """ + if not events: + return pd.DataFrame(columns=columns) + + index = _ensure_datetime_index(df) + data = df[columns].sort_index() + + freq = pd.Timedelta(minutes=resample_minutes) + if resample_minutes > 1: + data = data.resample(freq).mean() + + before = pd.Timedelta(minutes=window_before_minutes) + after = pd.Timedelta(minutes=window_after_minutes) + + segments: list[pd.DataFrame] = [] + + for event_id, (start, _end) in enumerate(events): + window_start = start - before + window_end = start + after + window_index = pd.date_range(window_start, window_end, freq=freq) + segment = data.reindex(window_index) + if segment.empty: + continue + offsets = ((segment.index - start) / pd.Timedelta(minutes=1)).astype(float) + multi_index = pd.MultiIndex.from_arrays( + [np.full(len(segment), event_id), offsets], + names=["event_id", "offset_minutes"], + ) + segment.index = multi_index + segments.append(segment) + + if not segments: + return pd.DataFrame(columns=columns) + + aligned = pd.concat(segments) + return aligned diff --git a/meteo/plots.py b/meteo/plots.py index fa70fa3..7c570cf 100644 --- a/meteo/plots.py +++ b/meteo/plots.py @@ -2,7 +2,7 @@ from __future__ import annotations from pathlib import Path -from typing import Sequence +from typing import Callable, Sequence import matplotlib.pyplot as plt from matplotlib.colors import Normalize @@ -168,6 +168,71 @@ def plot_scatter_pair( return output_path.resolve() +def plot_hexbin_with_third_variable( + df: pd.DataFrame, + var_x: Variable, + var_y: Variable, + var_color: Variable, + output_path: str | Path, + *, + gridsize: int = 60, + mincnt: int = 5, + reduce_func: Callable[[np.ndarray], float] | None = None, + reduce_func_label: str | None = None, + cmap: str = "viridis", +) -> Path: + """ + Trace une carte de densité hexbin où la couleur encode une 3e variable. 
+ """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + reduce_func = reduce_func or np.mean + + df_xyz = df[[var_x.column, var_y.column, var_color.column]].dropna() + if df_xyz.empty: + fig, ax = plt.subplots() + ax.text( + 0.5, + 0.5, + "Pas de données valides pour cette combinaison.", + ha="center", + va="center", + ) + ax.set_axis_off() + fig.savefig(output_path, dpi=150, bbox_inches="tight") + plt.close(fig) + return output_path.resolve() + + fig, ax = plt.subplots() + hb = ax.hexbin( + df_xyz[var_x.column], + df_xyz[var_y.column], + C=df_xyz[var_color.column], + reduce_C_function=reduce_func, + gridsize=gridsize, + cmap=cmap, + mincnt=mincnt, + ) + + func_label = reduce_func_label or getattr(reduce_func, "__name__", "statistique") + colorbar_label = f"{func_label.capitalize()} de {var_color.label}" + cbar = fig.colorbar(hb, ax=ax) + cbar.set_label(colorbar_label) + + ax.set_xlabel(f"{var_x.label} ({var_x.unit})") + ax.set_ylabel(f"{var_y.label} ({var_y.unit})") + ax.set_title( + f"{var_y.label} vs {var_x.label}\nCouleur : {func_label} de {var_color.label}" + ) + ax.grid(False) + fig.tight_layout() + fig.savefig(output_path, dpi=150) + plt.close(fig) + + return output_path.resolve() + + def plot_lagged_correlation( lag_df: pd.DataFrame, var_x: Variable, @@ -270,3 +335,141 @@ def plot_correlation_heatmap( plt.close(fig) return output_path.resolve() + + +def plot_rolling_correlation_heatmap( + rolling_corr: pd.DataFrame, + output_path: str | Path, + *, + cmap: str = "coolwarm", + vmin: float = -1.0, + vmax: float = 1.0, + time_tick_count: int = 6, +) -> Path: + """ + Visualise l'évolution de corrélations glissantes pour plusieurs paires. + """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + if rolling_corr.empty: + fig, ax = plt.subplots() + ax.text(0.5, 0.5, "Aucune donnée de corrélation glissante.", ha="center", va="center") + ax.set_axis_off() + fig.savefig(output_path, dpi=150, bbox_inches="tight") + plt.close(fig) + return output_path.resolve() + + labels = list(rolling_corr.columns) + data = rolling_corr.to_numpy().T + + height = max(3.0, 0.6 * len(labels)) + fig, ax = plt.subplots(figsize=(10, height)) + im = ax.imshow(data, aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax) + + ax.set_yticks(np.arange(len(labels))) + ax.set_yticklabels(labels) + + if isinstance(rolling_corr.index, pd.DatetimeIndex): + times = rolling_corr.index + if len(times) > 1: + tick_idx = np.linspace(0, len(times) - 1, num=min(time_tick_count, len(times)), dtype=int) + else: + tick_idx = np.array([0]) + tick_labels = [times[i].strftime("%Y-%m-%d\n%H:%M") for i in tick_idx] + else: + tick_idx = np.linspace(0, len(rolling_corr.index) - 1, num=min(time_tick_count, len(rolling_corr.index)), dtype=int) + tick_labels = [str(rolling_corr.index[i]) for i in tick_idx] + + ax.set_xticks(tick_idx) + ax.set_xticklabels(tick_labels, rotation=30, ha="right") + + ax.set_xlabel("Temps (fin de fenêtre)") + ax.set_ylabel("Paire de variables") + ax.set_title("Corrélations glissantes") + + cbar = fig.colorbar(im, ax=ax) + cbar.set_label("Coefficient de corrélation") + + fig.tight_layout() + fig.savefig(output_path, dpi=150) + plt.close(fig) + + return output_path.resolve() + + +def plot_event_composite( + aligned_segments: pd.DataFrame, + variables: Sequence[Variable], + output_path: str | Path, + *, + quantiles: tuple[float, float] = (0.25, 0.75), + baseline_label: str = "Début de l'événement", +) -> Path: + """ + Trace les 
+def plot_event_composite(
+    aligned_segments: pd.DataFrame,
+    variables: Sequence[Variable],
+    output_path: str | Path,
+    *,
+    quantiles: tuple[float, float] = (0.25, 0.75),
+    baseline_label: str = "Event onset",
+) -> Path:
+    """
+    Plot mean/median composites around detected events, with an
+    inter-quantile band.
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    if aligned_segments.empty:
+        fig, ax = plt.subplots()
+        ax.text(
+            0.5,
+            0.5,
+            "No aligned events to plot.",
+            ha="center",
+            va="center",
+        )
+        ax.set_axis_off()
+        fig.savefig(output_path, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        return output_path.resolve()
+
+    if "offset_minutes" not in aligned_segments.index.names:
+        raise ValueError("aligned_segments must have an 'offset_minutes' index level.")
+
+    group = aligned_segments.groupby(level="offset_minutes")
+    mean_df = group.mean()
+    median_df = group.median()
+
+    q_low, q_high = quantiles
+    quantile_low = group.quantile(q_low)
+    quantile_high = group.quantile(q_high)
+
+    offsets = mean_df.index.to_numpy(dtype=float)
+    n_vars = len(variables)
+    fig, axes = plt.subplots(n_vars, 1, figsize=(10, 3 * n_vars), sharex=True)
+    if n_vars == 1:
+        axes = [axes]
+
+    for ax, var in zip(axes, variables):
+        col = var.column
+        ax.axvline(0, color="black", linestyle="--", linewidth=1, label=baseline_label)
+        ax.plot(offsets, mean_df[col], color="tab:blue", label="Mean")
+        ax.plot(offsets, median_df[col], color="tab:orange", linestyle="--", label="Median")
+
+        ax.fill_between(
+            offsets,
+            quantile_low[col],
+            quantile_high[col],
+            color="tab:blue",
+            alpha=0.2,
+            label=f"IQR {int(q_low * 100)}–{int(q_high * 100)}%",
+        )
+
+        ylabel = f"{var.label} ({var.unit})" if var.unit else var.label
+        ax.set_ylabel(ylabel)
+        ax.grid(True, linestyle=":", alpha=0.5)
+
+    axes[-1].set_xlabel("Minutes around event onset")
+    axes[0].legend(loc="upper right")
+    total_events = len(aligned_segments.index.get_level_values("event_id").unique())
+    fig.suptitle(f"Event composites ({total_events} events)")
+
+    fig.tight_layout(rect=[0, 0, 1, 0.97])
+    fig.savefig(output_path, dpi=150)
+    plt.close(fig)
+
+    return output_path.resolve()
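# Reviewer sketch (not part of the patch): smoke test of the composite
# plot on synthetic aligned segments, borrowing a real Variable from the
# repo so labels and units line up. The output path is hypothetical.
import numpy as np
import pandas as pd

from meteo.plots import plot_event_composite
from meteo.variables import VARIABLES_BY_KEY

var = VARIABLES_BY_KEY["temperature"]
offsets = np.arange(-30.0, 61.0)
index = pd.MultiIndex.from_product(
    [[0, 1], offsets], names=["event_id", "offset_minutes"]
)
segments = pd.DataFrame(
    {var.column: np.random.default_rng(2).normal(size=len(index))},
    index=index,
)
plot_event_composite(segments, [var], "figures/composite_smoke.png")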
+ ), + "reduce": "max", + "gridsize": 50, + "mincnt": 8, + }, + { + "x": "pressure", + "y": "rain_rate", + "color": "wind_speed", + "filename": "hexbin_pressure_rain_color_wind.png", + "description": ( + "Vérifier si des rafales accompagnent vraiment les chutes de pression. " + "On s'attend à voir beaucoup de cases vides : la corrélation est loin d'être systématique." + ), + "reduce": "median", + "gridsize": 45, + "mincnt": 5, + }, + { + "x": "illuminance", + "y": "humidity", + "color": "temperature", + "filename": "hexbin_lux_humidity_color_temp.png", + "description": ( + "Explorer le cycle jour/nuit : l'humidité monte quand l'illuminance chute, " + "mais cela n'implique pas toujours une baisse rapide de température." + ), + "reduce": "mean", + "gridsize": 55, + "mincnt": 6, + }, +] + + +def main() -> None: + if not CSV_PATH.exists(): + print(f"⚠ Fichier introuvable : {CSV_PATH}") + return + + df = load_raw_csv(CSV_PATH) + print(f"Dataset minuté chargé : {CSV_PATH}") + print(f" Lignes : {len(df)}") + print(f" Colonnes : {list(df.columns)}") + print() + + for scenario in HEXBIN_SCENARIOS: + key_x = scenario["x"] + key_y = scenario["y"] + key_color = scenario["color"] + + var_x = VARIABLES_BY_KEY[key_x] + var_y = VARIABLES_BY_KEY[key_y] + var_color = VARIABLES_BY_KEY[key_color] + + filename = scenario["filename"] + output_path = OUTPUT_DIR / filename + + reduce_name = scenario.get("reduce", "mean") + reduce_func = REDUCE_FUNCTIONS.get(reduce_name, np.mean) + reduce_label = REDUCE_LABEL_FR.get(reduce_name, reduce_name) + + gridsize = int(scenario.get("gridsize", 60)) + mincnt = int(scenario.get("mincnt", 5)) + + description = scenario["description"] + print(f"→ Hexbin {var_y.key} vs {var_x.key} (couleur = {var_color.key})") + print(f" {description}") + + plot_hexbin_with_third_variable( + df=df, + var_x=var_x, + var_y=var_y, + var_color=var_color, + output_path=output_path, + gridsize=gridsize, + mincnt=mincnt, + reduce_func=reduce_func, + reduce_func_label=reduce_label, + cmap="magma", + ) + print(f" ✔ Graphique enregistré : {output_path}") + print() + + print("✔ Tous les graphiques hexbin ont été générés.") + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_rain_event_composites.py b/scripts/plot_rain_event_composites.py new file mode 100644 index 0000000..4afbb5b --- /dev/null +++ b/scripts/plot_rain_event_composites.py @@ -0,0 +1,85 @@ +# scripts/plot_rain_event_composites.py +from __future__ import annotations + +from pathlib import Path +from typing import Sequence + +import pandas as pd + +from meteo.dataset import load_raw_csv +from meteo.variables import Variable, VARIABLES_BY_KEY +from meteo.analysis import detect_threshold_events, build_event_aligned_segments +from meteo.plots import plot_event_composite + + +CSV_PATH = Path("data/weather_minutely.csv") +OUTPUT_PATH = Path("figures/event_composites/rain_event_composites.png") + +RAIN_THRESHOLD = 0.2 # mm/h : au-dessous on considère qu'il ne pleut pas vraiment +MIN_EVENT_DURATION = 5 # minutes +MIN_EVENT_GAP = 20 # minutes nécessaires pour considérer un nouvel événement +WINDOW_BEFORE = 120 # minutes affichées avant le début de la pluie +WINDOW_AFTER = 240 # minutes après le déclenchement + +COMPOSITE_VARIABLE_KEYS: Sequence[str] = [ + "pressure", + "temperature", + "humidity", + "wind_speed", +] + + +def main() -> None: + if not CSV_PATH.exists(): + print(f"⚠ Fichier introuvable : {CSV_PATH}") + return + + df = load_raw_csv(CSV_PATH) + print(f"Dataset minuté chargé : {CSV_PATH}") + print(f" Lignes : {len(df)}") + 
print(f" Colonnes : {list(df.columns)}") + print() + + rain_series = df["rain_rate"] + events = detect_threshold_events( + rain_series, + threshold=RAIN_THRESHOLD, + min_duration=pd.Timedelta(minutes=MIN_EVENT_DURATION), + min_gap=pd.Timedelta(minutes=MIN_EVENT_GAP), + ) + + if not events: + print("⚠ Aucun événement de pluie détecté avec les paramètres actuels.") + return + + print(f"Nombre d'événements détectés : {len(events)}") + + variables: list[Variable] = [VARIABLES_BY_KEY[key] for key in COMPOSITE_VARIABLE_KEYS] + columns = [v.column for v in variables] + + aligned_segments = build_event_aligned_segments( + df=df, + events=events, + columns=columns, + window_before_minutes=WINDOW_BEFORE, + window_after_minutes=WINDOW_AFTER, + resample_minutes=1, + ) + + if aligned_segments.empty: + print("⚠ Les segments alignés sont vides (période manquante ?).") + return + + output_path = plot_event_composite( + aligned_segments=aligned_segments, + variables=variables, + output_path=OUTPUT_PATH, + quantiles=(0.2, 0.8), + baseline_label="Début de la pluie", + ) + + print(f"✔ Graphique composite pluie sauvegardé : {output_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_rolling_correlation_heatmap.py b/scripts/plot_rolling_correlation_heatmap.py new file mode 100644 index 0000000..b19b425 --- /dev/null +++ b/scripts/plot_rolling_correlation_heatmap.py @@ -0,0 +1,65 @@ +# scripts/plot_rolling_correlation_heatmap.py +from __future__ import annotations + +from pathlib import Path + +from meteo.dataset import load_raw_csv +from meteo.variables import VARIABLES_BY_KEY +from meteo.analysis import compute_rolling_correlations_for_pairs +from meteo.plots import plot_rolling_correlation_heatmap + + +CSV_PATH = Path("data/weather_minutely.csv") +OUTPUT_PATH = Path("figures/rolling_correlations/rolling_correlation_heatmap.png") + +ROLLING_PAIRS: list[tuple[str, str]] = [ + ("temperature", "humidity"), + ("pressure", "rain_rate"), + ("pressure", "wind_speed"), + ("illuminance", "temperature"), + ("humidity", "rain_rate"), +] + +WINDOW_MINUTES = 180 # 3 heures pour observer les tendances synoptiques +STEP_MINUTES = 30 # on n'échantillonne qu'un point sur 30 minutes + + +def main() -> None: + if not CSV_PATH.exists(): + print(f"⚠ Fichier introuvable : {CSV_PATH}") + return + + df = load_raw_csv(CSV_PATH) + print(f"Dataset minuté chargé : {CSV_PATH}") + print(f" Lignes : {len(df)}") + print(f" Colonnes : {list(df.columns)}") + print() + + pairs = [(VARIABLES_BY_KEY[a], VARIABLES_BY_KEY[b]) for a, b in ROLLING_PAIRS] + + rolling_df = compute_rolling_correlations_for_pairs( + df=df, + pairs=pairs, + window_minutes=WINDOW_MINUTES, + min_valid_fraction=0.7, + step_minutes=STEP_MINUTES, + method="pearson", + ) + + if rolling_df.empty: + print("⚠ Impossible de calculer les corrélations glissantes (données insuffisantes).") + return + + output_path = plot_rolling_correlation_heatmap( + rolling_corr=rolling_df, + output_path=OUTPUT_PATH, + cmap="coolwarm", + vmin=-1.0, + vmax=1.0, + ) + + print(f"✔ Heatmap de corrélations glissantes enregistrée : {output_path}") + + +if __name__ == "__main__": + main()