diff --git a/.env.example b/.env.example index fa9d71d..7e3fae7 100644 --- a/.env.example +++ b/.env.example @@ -2,3 +2,6 @@ INFLUXDB_URL=http:// INFLUXDB_TOKEN= INFLUXDB_ORG= INFLUXDB_BUCKET=weather +STATION_LATITUDE= +STATION_LONGITUDE= +STATION_ELEVATION= \ No newline at end of file diff --git a/docs/08 - Enrichissement du jeu de données.md b/docs/08 - Enrichissement du jeu de données.md new file mode 100644 index 0000000..48f0bcd --- /dev/null +++ b/docs/08 - Enrichissement du jeu de données.md @@ -0,0 +1,3 @@ +# Enrichissement du jeu de données + +- Élévation du soleil diff --git a/figures/sun/hexbin_sun_elevation_vs_illuminance.png b/figures/sun/hexbin_sun_elevation_vs_illuminance.png new file mode 100644 index 0000000..d223edb Binary files /dev/null and b/figures/sun/hexbin_sun_elevation_vs_illuminance.png differ diff --git a/figures/sun/hexbin_sun_elevation_vs_temperature.png b/figures/sun/hexbin_sun_elevation_vs_temperature.png new file mode 100644 index 0000000..bf24c5e Binary files /dev/null and b/figures/sun/hexbin_sun_elevation_vs_temperature.png differ diff --git a/figures/sun/sun_elevation_profiles.png b/figures/sun/sun_elevation_profiles.png new file mode 100644 index 0000000..77d9a8e Binary files /dev/null and b/figures/sun/sun_elevation_profiles.png differ diff --git a/meteo/analysis.py b/meteo/analysis.py index e7f5afc..d1debe3 100644 --- a/meteo/analysis.py +++ b/meteo/analysis.py @@ -134,6 +134,19 @@ class DiurnalCycleStats: quantile_high_level: float | None = None +@dataclass +class BinnedStatistics: + centers: np.ndarray + intervals: pd.IntervalIndex + counts: pd.Series + mean: pd.DataFrame + median: pd.DataFrame + quantile_low: pd.DataFrame | None + quantile_high: pd.DataFrame | None + quantile_low_level: float | None = None + quantile_high_level: float | None = None + + def compute_rolling_correlation_series( df: pd.DataFrame, var_x: Variable, @@ -456,3 +469,93 @@ def compute_daily_rainfall_totals( } ) return result + + +def compute_binned_statistics( + df: pd.DataFrame, + *, + bin_source_column: str, + target_columns: Sequence[str], + bins: Sequence[float] | np.ndarray, + min_count: int = 30, + quantiles: tuple[float, float] | None = (0.25, 0.75), +) -> BinnedStatistics: + """ + Calcule des statistiques (mean/median/quantiles) pour plusieurs colonnes + en regroupant les données selon des intervalles définis sur une colonne source. + """ + if bin_source_column not in df.columns: + raise KeyError(f"Colonne source absente : {bin_source_column}") + + missing_targets = [col for col in target_columns if col not in df.columns] + if missing_targets: + raise KeyError(f"Colonnes cibles absentes : {missing_targets!r}") + + subset_cols = [bin_source_column, *target_columns] + data = df[subset_cols].dropna(subset=[bin_source_column]) + + if data.empty: + empty_interval_index = pd.IntervalIndex([]) + empty_df = pd.DataFrame(columns=target_columns) + empty_counts = pd.Series(dtype=int) + return BinnedStatistics( + centers=np.array([]), + intervals=empty_interval_index, + counts=empty_counts, + mean=empty_df, + median=empty_df, + quantile_low=None, + quantile_high=None, + ) + + categories = pd.cut(data[bin_source_column], bins=bins, include_lowest=True) + grouped = data.groupby(categories, observed=False) + + counts = grouped.size() + valid_mask = counts >= max(1, min_count) + valid_intervals = counts.index[valid_mask] + + if len(valid_intervals) == 0: + empty_interval_index = pd.IntervalIndex([]) + empty_df = pd.DataFrame(columns=target_columns) + empty_counts = pd.Series(dtype=int) + return BinnedStatistics( + centers=np.array([]), + intervals=empty_interval_index, + counts=empty_counts, + mean=empty_df, + median=empty_df, + quantile_low=None, + quantile_high=None, + ) + + interval_index = pd.IntervalIndex(valid_intervals) + + mean_df = grouped[target_columns].mean().loc[interval_index] + median_df = grouped[target_columns].median().loc[interval_index] + + q_low = q_high = None + quantile_low_df: pd.DataFrame | None = None + quantile_high_df: pd.DataFrame | None = None + + if quantiles is not None: + q_low, q_high = quantiles + if q_low is not None: + quantile_low_df = grouped[target_columns].quantile(q_low).loc[interval_index] + if q_high is not None: + quantile_high_df = grouped[target_columns].quantile(q_high).loc[interval_index] + + centers = np.array([interval.mid for interval in interval_index]) + filtered_counts = counts.loc[interval_index] + + return BinnedStatistics( + centers=centers, + intervals=interval_index, + counts=filtered_counts, + mean=mean_df, + median=median_df, + quantile_low=quantile_low_df, + quantile_high=quantile_high_df, + quantile_low_level=q_low, + quantile_high_level=q_high, + ) diff --git a/meteo/config.py b/meteo/config.py index 58dc7a3..a406532 100644 --- a/meteo/config.py +++ b/meteo/config.py @@ -65,3 +65,58 @@ class InfluxSettings: org=org, # type: ignore[arg-type] bucket=bucket, # type: ignore[arg-type] ) + + +@dataclass(frozen=True) +class StationLocation: + """ + Décrit la position géographique de la station météo. + Utilisée pour les calculs astronomiques (ex: élévation du soleil). + """ + + latitude: float + longitude: float + elevation_m: float = 0.0 + + @classmethod + def from_env(cls, *, optional: bool = False) -> Self | None: + """ + Charge les coordonnées GPS depuis les variables d'environnement : + - STATION_LATITUDE (obligatoire) + - STATION_LONGITUDE (obligatoire) + - STATION_ELEVATION (optionnelle, en mètres) + """ + load_dotenv() + + lat = os.getenv("STATION_LATITUDE") + lon = os.getenv("STATION_LONGITUDE") + elev = os.getenv("STATION_ELEVATION") + + if not lat or not lon: + if optional: + return None + raise RuntimeError( + "Les variables STATION_LATITUDE et STATION_LONGITUDE doivent être définies " + "pour calculer l'élévation solaire." + ) + + try: + latitude = float(lat) + longitude = float(lon) + elevation = float(elev) if elev else 0.0 + except ValueError as exc: + raise RuntimeError( + "STATION_LATITUDE / STATION_LONGITUDE / STATION_ELEVATION doivent être des nombres valides." + ) from exc + + return cls(latitude=latitude, longitude=longitude, elevation_m=elevation) + + def to_astral_observer_kwargs(self) -> dict[str, float]: + """ + Prépare les arguments attendus par astral.Observer. + """ + return { + "latitude": self.latitude, + "longitude": self.longitude, + "elevation": self.elevation_m, + } diff --git a/meteo/plots.py b/meteo/plots.py index f9463c6..4b103b2 100644 --- a/meteo/plots.py +++ b/meteo/plots.py @@ -11,7 +11,7 @@ import matplotlib.dates as mdates import numpy as np import pandas as pd -from .analysis import DiurnalCycleStats +from .analysis import DiurnalCycleStats, BinnedStatistics from .variables import Variable @@ -596,6 +596,100 @@ def plot_diurnal_cycle( return output_path.resolve() +def plot_binned_profiles( + stats: BinnedStatistics, + variables: Sequence[Variable], + output_path: str | Path, + *, + xlabel: str, + title: str, + show_counts: bool = False, +) -> Path: + """ + Trace les statistiques agrégées d'une ou plusieurs variables en fonction de bins. + """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + if stats.centers.size == 0: + fig, ax = plt.subplots() + ax.text( + 0.5, + 0.5, + "Aucune donnée suffisante pour ces intervalles.", + ha="center", + va="center", + ) + ax.set_axis_off() + fig.savefig(output_path, dpi=150, bbox_inches="tight") + plt.close(fig) + return output_path.resolve() + + base_axes = len(variables) + total_axes = base_axes + (1 if show_counts else 0) + fig, axes = plt.subplots( + total_axes, + 1, + sharex=True, + figsize=(10, 3 * total_axes), + ) + + if total_axes == 1: + axes = [axes] + else: + axes = list(axes) + + x_values = stats.centers + bin_widths = np.array([interval.length for interval in stats.intervals]) + + if show_counts: + count_ax = axes.pop(0) + count_ax.bar( + x_values, + stats.counts.to_numpy(dtype=float), + width=bin_widths, + color="lightgray", + edgecolor="gray", + align="center", + ) + count_ax.set_ylabel("Nombre de points") + count_ax.grid(True, linestyle=":", alpha=0.4) + count_ax.set_title("Densité des observations par bin") + + for ax, var in zip(axes, variables): + col = var.column + ax.plot(x_values, stats.mean[col], color="tab:blue", label="Moyenne") + ax.plot(x_values, stats.median[col], color="tab:orange", linestyle="--", label="Médiane") + + if stats.quantile_low is not None and stats.quantile_high is not None: + ax.fill_between( + x_values, + stats.quantile_low[col], + stats.quantile_high[col], + color="tab:blue", + alpha=0.15, + label=( + f"Quantiles {int(stats.quantile_low_level * 100)}–{int(stats.quantile_high_level * 100)}%" + if stats.quantile_low_level is not None and stats.quantile_high_level is not None + else "Quantiles" + ), + ) + + ylabel = f"{var.label} ({var.unit})" if var.unit else var.label + ax.set_ylabel(ylabel) + ax.grid(True, linestyle=":", alpha=0.5) + + axes[-1].set_xlabel(xlabel) + axes[0].legend(loc="upper right") + axes[-1].set_xlim(stats.intervals.left.min(), stats.intervals.right.max()) + + fig.suptitle(title) + fig.tight_layout(rect=[0, 0, 1, 0.97]) + fig.savefig(output_path, dpi=150) + plt.close(fig) + return output_path.resolve() + + def plot_daily_rainfall_hyetograph( daily_rain: pd.DataFrame, output_path: str | Path, diff --git a/meteo/solar.py b/meteo/solar.py new file mode 100644 index 0000000..d9fc414 --- /dev/null +++ b/meteo/solar.py @@ -0,0 +1,66 @@ +# meteo/solar.py +from __future__ import annotations + +import pandas as pd +from astral import Observer +from astral.sun import elevation + + +def _ensure_datetime_index(index: pd.Index) -> pd.DatetimeIndex: + if not isinstance(index, pd.DatetimeIndex): + raise TypeError("Un DatetimeIndex est requis pour calculer l'élévation solaire.") + return index + + +def _prepare_index(index: pd.DatetimeIndex) -> pd.DatetimeIndex: + """ + Retourne une version timezone-aware (en UTC) du DatetimeIndex fourni. + """ + if index.tz is None: + return index.tz_localize("UTC") + return index.tz_convert("UTC") + + +def compute_solar_elevation_series( + index: pd.Index, + *, + latitude: float, + longitude: float, + elevation_m: float = 0.0, + series_name: str = "sun_elevation", +) -> pd.Series: + """ + Calcule l'élévation du soleil (en degrés) pour chaque timestamp de l'index. + """ + dt_index = _ensure_datetime_index(index) + observer = Observer(latitude=latitude, longitude=longitude, elevation=elevation_m) + utc_index = _prepare_index(dt_index) + + values = [ + float(elevation(observer, ts.to_pydatetime())) + for ts in utc_index + ] + + return pd.Series(values, index=dt_index, name=series_name) + + +def add_solar_elevation_column( + df: pd.DataFrame, + *, + latitude: float, + longitude: float, + elevation_m: float = 0.0, + column_name: str = "sun_elevation", +) -> pd.DataFrame: + """ + Ajoute une colonne `column_name` contenant l'élévation du soleil en degrés. + """ + series = compute_solar_elevation_series( + df.index, + latitude=latitude, + longitude=longitude, + elevation_m=elevation_m, + series_name=column_name, + ) + df[column_name] = series + return df diff --git a/meteo/variables.py b/meteo/variables.py index 27af65c..43744d9 100644 --- a/meteo/variables.py +++ b/meteo/variables.py @@ -65,6 +65,12 @@ VARIABLES: List[Variable] = [ label="Direction du vent", unit="°", ), + Variable( + key="sun_elevation", + column="sun_elevation", + label="Élévation solaire", + unit="°", + ), ] VARIABLES_BY_KEY: Dict[str, Variable] = {v.key: v for v in VARIABLES} diff --git a/requirements.txt b/requirements.txt index 6de0071..bebe89d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,9 @@ numpy matplotlib seaborn +# Astronomie / position du soleil +astral + # Modèles statistiques / ML scikit-learn statsmodels diff --git a/scripts/make_minutely_dataset.py b/scripts/make_minutely_dataset.py index 081378f..ed51a20 100644 --- a/scripts/make_minutely_dataset.py +++ b/scripts/make_minutely_dataset.py @@ -4,6 +4,8 @@ from __future__ import annotations from pathlib import Path from meteo.dataset import load_raw_csv, resample_to_minutes +from meteo.config import StationLocation +from meteo.solar import add_solar_elevation_column FORMATTED_CSV_PATH = Path("data/weather_filled_1s.csv") @@ -23,6 +25,29 @@ def main() -> None: df_min = resample_to_minutes(df_1s) print(f"Après resampling 60s : {len(df_min)} lignes") + try: + location = StationLocation.from_env(optional=True) + except RuntimeError as exc: + print(f"⚠ Coordonnées GPS invalides : {exc}") + location = None + + if location is not None: + print( + f"Ajout de l'élévation solaire (lat={location.latitude}, lon={location.longitude}, " + f"alt={location.elevation_m} m)..." + ) + add_solar_elevation_column( + df_min, + latitude=location.latitude, + longitude=location.longitude, + elevation_m=location.elevation_m, + ) + else: + print( + "ℹ Coordonnées GPS non définies (STATION_LATITUDE / STATION_LONGITUDE). " + "La colonne sun_elevation ne sera pas ajoutée." + ) + OUTPUT_CSV_PATH.parent.mkdir(parents=True, exist_ok=True) df_min.to_csv(OUTPUT_CSV_PATH, index_label="time") print(f"✔ Dataset minuté écrit dans : {OUTPUT_CSV_PATH.resolve()}") diff --git a/scripts/plot_sun_elevation_relationships.py b/scripts/plot_sun_elevation_relationships.py new file mode 100644 index 0000000..e827dc8 --- /dev/null +++ b/scripts/plot_sun_elevation_relationships.py @@ -0,0 +1,128 @@ +# scripts/plot_sun_elevation_relationships.py +from __future__ import annotations + +from pathlib import Path + +import numpy as np + +from meteo.dataset import load_raw_csv +from meteo.variables import VARIABLES_BY_KEY +from meteo.analysis import compute_binned_statistics +from meteo.plots import plot_binned_profiles, plot_hexbin_with_third_variable +from meteo.config import StationLocation +from meteo.solar import add_solar_elevation_column + + +CSV_PATH = Path("data/weather_minutely.csv") +OUTPUT_DIR = Path("figures/sun") + + +def ensure_sun_elevation(df): + if "sun_elevation" in df.columns: + return df + + print("ℹ La colonne 'sun_elevation' est absente, tentative de calcul à la volée.") + location = StationLocation.from_env(optional=True) + if location is None: + print( + "⚠ Impossible de calculer l'élévation solaire : définissez STATION_LATITUDE et STATION_LONGITUDE " + "puis regénérez le dataset (scripts/make_minutely_dataset)." + ) + return None + + print( + f"→ Calcul d'élévation solaire avec lat={location.latitude}, lon={location.longitude}, " + f"alt={location.elevation_m} m." + ) + add_solar_elevation_column( + df, + latitude=location.latitude, + longitude=location.longitude, + elevation_m=location.elevation_m, + ) + return df + + +def main() -> None: + if not CSV_PATH.exists(): + print(f"⚠ Fichier introuvable : {CSV_PATH}") + return + + df = load_raw_csv(CSV_PATH) + print(f"Dataset minuté chargé : {CSV_PATH}") + print(f" Lignes : {len(df)}") + print(f" Colonnes : {list(df.columns)}") + print() + + df = ensure_sun_elevation(df) + if df is None or "sun_elevation" not in df.columns: + return + + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + + profile_keys = ["temperature", "humidity", "illuminance"] + profile_vars = [VARIABLES_BY_KEY[key] for key in profile_keys] + bins = np.arange(-90, 95, 5) # bins de 5° + + stats = compute_binned_statistics( + df=df, + bin_source_column="sun_elevation", + target_columns=[v.column for v in profile_vars], + bins=bins, + min_count=100, + quantiles=(0.2, 0.8), + ) + + profile_output = OUTPUT_DIR / "sun_elevation_profiles.png" + plot_binned_profiles( + stats=stats, + variables=profile_vars, + output_path=profile_output, + xlabel="Élévation solaire (°)", + title="Profils moyens en fonction de l'élévation solaire", + show_counts=True, + ) + print(f"✔ Profils sun vs variables : {profile_output}") + + hexbin_scenarios = [ + { + "x": "sun_elevation", + "y": "illuminance", + "color": "temperature", + "filename": "hexbin_sun_elevation_vs_illuminance.png", + "description": "Illuminance en fonction de l'élévation du soleil, couleur = température.", + }, + { + "x": "sun_elevation", + "y": "temperature", + "color": "humidity", + "filename": "hexbin_sun_elevation_vs_temperature.png", + "description": "Température en fonction de l'élévation, couleur = humidité relative.", + }, + ] + + for scenario in hexbin_scenarios: + var_x = VARIABLES_BY_KEY[scenario["x"]] + var_y = VARIABLES_BY_KEY[scenario["y"]] + var_color = VARIABLES_BY_KEY[scenario["color"]] + output_path = OUTPUT_DIR / scenario["filename"] + + print(f"→ {scenario['description']}") + plot_hexbin_with_third_variable( + df=df, + var_x=var_x, + var_y=var_y, + var_color=var_color, + output_path=output_path, + gridsize=60, + mincnt=10, + reduce_func_label="moyenne", + cmap="cividis", + ) + print(f" ✔ Hexbin enregistré : {output_path}") + + print("✔ Tous les graphiques liés à l'élévation solaire ont été produits.") + + +if __name__ == "__main__": + main()