#!/usr/bin/env python3
"""Print a few basic statistics about the weather CSV files."""

from __future__ import annotations

import argparse
import csv
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

# Two directory levels above this file; the default data files are resolved
# against it and displayed paths are shown relative to it when possible.
ROOT = Path(__file__).resolve().parents[1]
DEFAULT_TARGETS = [
    ("Données brutes complètes", ROOT / "data/weather_raw_full.csv"),
    ("Données agrégées par minute", ROOT / "data/weather_minutely.csv"),
]


@dataclass
class DatasetStats:
    """Summary of one CSV file as computed by ``collect_stats``."""

    label: str  # human-readable title printed above the report
    path: Path  # file the statistics were computed from
    size_bytes: int  # on-disk size reported by ``Path.stat``
    columns: list[str]  # stripped cells of the first non-blank row
    row_count: int  # non-blank rows, excluding the header when one is assumed
    header_detected: bool  # True when the first row was treated as a header
    first_timestamp: datetime | None  # earliest parseable value of column 0
    last_timestamp: datetime | None  # latest parseable value of column 0


def human_size(num_bytes: int) -> str:
    """Return a readable size (o/Ko/Mo/Go/To) for a number of bytes.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached, and is always rendered with one decimal.
    """
    units = ["o", "Ko", "Mo", "Go", "To"]
    size = float(num_bytes)
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    # Anything that is still >= 1024 here is expressed in the largest unit.
    return f"{size:.1f} {units[-1]}"


def format_int(value: int) -> str:
    """Format an integer with spaces as thousands separators."""
    return f"{value:,}".replace(",", " ")


def parse_timestamp(value: str) -> datetime | None:
    """Parse an ISO 8601 timestamp, returning ``None`` when it is not one.

    Surrounding whitespace is ignored. A trailing ``Z`` is also accepted on
    interpreters whose ``datetime.fromisoformat`` rejects it, by retrying
    with an explicit ``+00:00`` offset.
    """
    if not value:
        return None
    text = value.strip()
    candidates = [text]
    if text.endswith("Z"):
        # Only used as a fallback, so inputs that already parse are unaffected.
        candidates.append(f"{text[:-1]}+00:00")
    for candidate in candidates:
        try:
            return datetime.fromisoformat(candidate)
        except ValueError:
            continue
    return None


def _extend_range(
    first: datetime | None,
    last: datetime | None,
    ts: datetime | None,
) -> tuple[datetime | None, datetime | None]:
    """Return ``(first, last)`` widened to include ``ts`` when possible."""
    if ts is None:
        return first, last
    if first is None or last is None:
        return ts, ts
    try:
        return min(first, ts), max(last, ts)
    except TypeError:
        # Naive and timezone-aware datetimes cannot be ordered against each
        # other; ignore the incomparable value instead of aborting the report.
        return first, last


def collect_stats(label: str, path: Path, delimiter: str, assume_header: bool) -> DatasetStats:
    """Read ``path`` as CSV and compute its ``DatasetStats``.

    Args:
        label: Title stored in the result.
        path: CSV file to read (UTF-8, with or without a byte-order mark).
        delimiter: Single-character column separator passed to ``csv.reader``.
        assume_header: When true, the first non-blank row is treated as a
            header and excluded from the row count and timestamp range.

    Returns:
        The statistics for the file. ``columns`` always holds the stripped
        cells of the first non-blank row, header or not.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    columns: list[str] = []
    header_detected = False
    row_count = 0
    first_ts: datetime | None = None
    last_ts: datetime | None = None
    size_bytes = path.stat().st_size

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise end up glued to the first column name or timestamp.
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        # csv.reader yields an empty list for blank lines; they are not records.
        rows = (row for row in csv.reader(handle, delimiter=delimiter) if row)
        first_row = next(rows, None)

        if first_row is not None:
            columns = [col.strip() for col in first_row]
            header_detected = assume_header

            if not header_detected:
                # Without a header the first row is already a data row.
                row_count = 1
                first_ts, last_ts = _extend_range(first_ts, last_ts, parse_timestamp(first_row[0]))

            for row in rows:
                row_count += 1
                first_ts, last_ts = _extend_range(first_ts, last_ts, parse_timestamp(row[0]))

    return DatasetStats(
        label=label,
        path=path,
        size_bytes=size_bytes,
        columns=columns,
        row_count=row_count,
        header_detected=header_detected,
        first_timestamp=first_ts,
        last_timestamp=last_ts,
    )


def format_path(path: Path) -> str:
    """Return ``path`` relative to ``ROOT`` when it lies under it, else as given."""
    try:
        return str(path.relative_to(ROOT))
    except ValueError:
        return str(path)


def display_stats(stats: DatasetStats) -> None:
    """Print the report for one dataset to standard output."""
    header = ", ".join(stats.columns) if stats.columns else "-"
    header_note = " (en-tête détecté)" if stats.header_detected else ""
    first_ts = stats.first_timestamp.isoformat(sep=" ") if stats.first_timestamp else "-"
    last_ts = stats.last_timestamp.isoformat(sep=" ") if stats.last_timestamp else "-"

    print(f"\n=== {stats.label}")
    print(f"- Fichier : {format_path(stats.path)}")
    print(f"- Taille : {human_size(stats.size_bytes)} ({format_int(stats.size_bytes)} octets)")
    print(f"- Colonnes : {len(stats.columns)}{header_note}")
    print(f" {header}")
    print(f"- Lignes (hors en-tête) : {format_int(stats.row_count)}")
    print(f"- Premier enregistrement : {first_ts}")
    print(f"- Dernier enregistrement : {last_ts}")


def parse_args() -> argparse.Namespace:
    """Parse the command line (files, ``--delimiter``, ``--no-header``).

    Exits through ``parser.error`` when the delimiter is not exactly one
    character, since ``csv.reader`` would otherwise fail on it later.
    """
    parser = argparse.ArgumentParser(
        description="Affiche quelques statistiques basiques sur les CSV météo (taille, colonnes, lignes).",
    )
    parser.add_argument(
        "files",
        nargs="*",
        type=Path,
        help="Fichiers CSV à inspecter (par défaut : les CSV bruts et agrégés du dossier data/).",
    )
    parser.add_argument(
        "--delimiter",
        default=",",
        help="Délimiteur des colonnes (par défaut : ',').",
    )
    parser.add_argument(
        "--no-header",
        action="store_true",
        help="Ne pas soustraire la première ligne lors du comptage des lignes.",
    )
    args = parser.parse_args()
    if len(args.delimiter) != 1:
        parser.error("le délimiteur doit être un caractère unique")
    return args


def main() -> None:
    """Report on every requested file, or on ``DEFAULT_TARGETS`` when none is given.

    A file that is missing or cannot be read is reported and skipped so the
    remaining files are still processed.
    """
    args = parse_args()
    targets = [(path.name, path) for path in args.files] if args.files else DEFAULT_TARGETS
    assume_header = not args.no_header

    for label, path in targets:
        try:
            stats = collect_stats(label, path, delimiter=args.delimiter, assume_header=assume_header)
        except FileNotFoundError:
            print(f"\n✘ Fichier introuvable : {format_path(path)}")
            continue
        except (OSError, UnicodeDecodeError, csv.Error) as exc:
            # Directory, permission problem, non-UTF-8 content or malformed CSV.
            print(f"\n✘ Lecture impossible : {format_path(path)} ({exc})")
            continue
        display_stats(stats)


if __name__ == "__main__":
    main()