#!/usr/bin/env python3
"""Print basic statistics about the weather CSV files.

For every target file the script reports its size, the cells of its first
line (the column names when a header is assumed), the number of data rows and
the earliest/latest timestamp found in the first column.
"""

from __future__ import annotations

import argparse
import csv
from dataclasses import dataclass
from datetime import datetime
from itertools import chain
from pathlib import Path

# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# (label, path) pairs inspected when no file is given on the command line.
DEFAULT_TARGETS = [
    ("Données brutes complètes", ROOT / "data/weather_raw_full.csv"),
    ("Données agrégées par minute", ROOT / "data/weather_minutely.csv"),
]


@dataclass
class DatasetStats:
    """Summary of a single CSV file, as produced by ``collect_stats``."""

    label: str  # title printed above the report for this file
    path: Path  # file that was inspected
    size_bytes: int  # size reported by ``Path.stat()``
    columns: list[str]  # stripped cells of the first line (header or first row)
    row_count: int  # rows read, excluding the header line when one is detected
    header_detected: bool  # True when the first line was treated as a header
    first_timestamp: datetime | None  # earliest parseable first-column value
    last_timestamp: datetime | None  # latest parseable first-column value


def human_size(num_bytes: int) -> str:
    """Return a human-readable size (o/Ko/Mo/Go/To) for a byte count.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached, and is always printed with one decimal.
    """
    units = ["o", "Ko", "Mo", "Go", "To"]
    size = float(num_bytes)
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    # Anything that is still >= 1024 after the last division stays in "To".
    return f"{size:.1f} {units[-1]}"


def format_int(value: int) -> str:
    """Format an integer using spaces as thousands separators."""
    return f"{value:,}".replace(",", " ")


def parse_timestamp(value: str) -> datetime | None:
    """Parse an ISO 8601 timestamp, returning ``None`` when it is not one.

    Surrounding whitespace is ignored. Empty strings and values rejected by
    ``datetime.fromisoformat`` both yield ``None`` instead of raising.
    """
    if not value:
        return None
    try:
        return datetime.fromisoformat(value.strip())
    except ValueError:
        return None


def collect_stats(label: str, path: Path, delimiter: str, assume_header: bool) -> DatasetStats:
    """Scan a CSV file once and gather its statistics.

    Args:
        label: Display name stored in the result.
        path: CSV file to read.
        delimiter: Column separator handed to ``csv.reader``.
        assume_header: When true, a non-empty first line is treated as the
            header and excluded from the row count.

    Returns:
        A ``DatasetStats`` describing the file. ``columns`` always holds the
        stripped cells of the first line, even when no header is assumed.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # stat() raises FileNotFoundError by itself, so no separate (and
    # race-prone) existence check is needed.
    size_bytes = path.stat().st_size

    columns: list[str] = []
    header_detected = False
    row_count = 0
    first_ts: datetime | None = None
    last_ts: datetime | None = None

    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM
    # that would otherwise end up glued to the first cell and corrupt the
    # first column name or the first timestamp.
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        first_line = next(reader, None)

        if first_line is not None:
            columns = [col.strip() for col in first_line]
            header_detected = bool(columns) and assume_header

            # Without a header the first line is itself a data row, so it is
            # put back in front of the remaining rows.
            rows = reader if header_detected else chain([first_line], reader)

            for row in rows:
                # Empty rows are still counted; they just carry no timestamp.
                row_count += 1
                ts = parse_timestamp(row[0]) if row else None
                if ts is None:
                    continue
                if first_ts is None or ts < first_ts:
                    first_ts = ts
                if last_ts is None or ts > last_ts:
                    last_ts = ts

    return DatasetStats(
        label=label,
        path=path,
        size_bytes=size_bytes,
        columns=columns,
        row_count=row_count,
        header_detected=header_detected,
        first_timestamp=first_ts,
        last_timestamp=last_ts,
    )


def format_path(path: Path) -> str:
    """Return ``path`` relative to ``ROOT`` when possible, else unchanged."""
    try:
        return str(path.relative_to(ROOT))
    except ValueError:
        # Raised when the path is not located under ROOT.
        return str(path)


def display_stats(stats: DatasetStats) -> None:
    """Print the report for one dataset on standard output."""
    header = ", ".join(stats.columns) if stats.columns else "-"
    header_note = " (en-tête détecté)" if stats.header_detected else ""
    first_ts = stats.first_timestamp.isoformat(sep=" ") if stats.first_timestamp else "-"
    last_ts = stats.last_timestamp.isoformat(sep=" ") if stats.last_timestamp else "-"

    print(f"\n=== {stats.label}")
    print(f"- Fichier : {format_path(stats.path)}")
    print(f"- Taille : {human_size(stats.size_bytes)} ({format_int(stats.size_bytes)} octets)")
    print(f"- Colonnes : {len(stats.columns)}{header_note}")
    print(f" {header}")
    print(f"- Lignes (hors en-tête) : {format_int(stats.row_count)}")
    print(f"- Premier enregistrement : {first_ts}")
    print(f"- Dernier enregistrement : {last_ts}")


def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments.

    Exits with an argparse usage error when ``--delimiter`` is not exactly
    one character, which is the only form ``csv.reader`` accepts.
    """
    parser = argparse.ArgumentParser(
        description="Affiche quelques statistiques basiques sur les CSV météo (taille, colonnes, lignes).",
    )
    parser.add_argument(
        "files",
        nargs="*",
        type=Path,
        help="Fichiers CSV à inspecter (par défaut : les CSV bruts et agrégés du dossier data/).",
    )
    parser.add_argument(
        "--delimiter",
        default=",",
        help="Délimiteur des colonnes (par défaut : ',').",
    )
    parser.add_argument(
        "--no-header",
        action="store_true",
        help="Ne pas soustraire la première ligne lors du comptage des lignes.",
    )
    args = parser.parse_args()
    if len(args.delimiter) != 1:
        # csv.reader would otherwise fail later with an uncaught TypeError.
        parser.error("--delimiter doit être un seul caractère.")
    return args


def main() -> None:
    """Collect and print the statistics of every requested file.

    A file that is missing or cannot be read is reported and skipped, so the
    remaining files are still processed.
    """
    args = parse_args()
    targets = [(path.name, path) for path in args.files] if args.files else DEFAULT_TARGETS
    assume_header = not args.no_header

    for label, path in targets:
        try:
            stats = collect_stats(label, path, delimiter=args.delimiter, assume_header=assume_header)
        except FileNotFoundError:
            print(f"\n✘ Fichier introuvable : {format_path(path)}")
            continue
        except (OSError, UnicodeDecodeError, csv.Error) as exc:
            # Directory, permission problem, non-UTF-8 content or malformed CSV.
            print(f"\n✘ Lecture impossible : {format_path(path)} ({exc})")
            continue
        display_stats(stats)


if __name__ == "__main__":
    main()