1
donnees_meteo/scripts/report_dataset_stats.py

167 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""Affiche quelques statistiques basiques sur les fichiers CSV météo."""
from __future__ import annotations
import argparse
import csv
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
# Directory two levels above this file (the parent of the folder holding the script).
ROOT = Path(__file__).resolve().parents[1]

# (label, path) pairs describing the CSV files inspected by default.
DEFAULT_TARGETS = [
    ("Données brutes complètes", ROOT / "data/weather_raw_full.csv"),
    ("Données agrégées par minute", ROOT / "data/weather_minutely.csv"),
]
@dataclass
class DatasetStats:
    """Summary of a single CSV file, as produced by ``collect_stats``."""

    # Display name used as the heading of the report.
    label: str
    # Location of the inspected CSV file.
    path: Path
    # Size of the file on disk, in bytes.
    size_bytes: int
    # Stripped cells of the first line of the file (empty list for an empty file).
    columns: list[str]
    # Number of data rows counted; a detected header line is not included.
    row_count: int
    # True when the first line was treated as a header.
    header_detected: bool
    # Earliest and latest parseable timestamps found in the first column, if any.
    first_timestamp: datetime | None
    last_timestamp: datetime | None
def human_size(num_bytes: int) -> str:
    """Return a human-readable size (o/Ko/Mo/Go/To) for a byte count.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached, then rendered with one decimal (e.g. ``"1.5 Ko"``).

    Args:
        num_bytes: Size in bytes.

    Returns:
        The scaled value followed by its unit.
    """
    *smaller_units, largest_unit = ("o", "Ko", "Mo", "Go", "To")
    size = float(num_bytes)
    for unit in smaller_units:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    # Reached only when no smaller unit fits: the largest unit is used
    # whatever the remaining magnitude, so no further fallback is needed.
    return f"{size:.1f} {largest_unit}"
def format_int(value: int) -> str:
    """Format an integer using spaces as thousands separators."""
    grouped = format(value, ",")
    return grouped.replace(",", " ")
def parse_timestamp(value: str) -> datetime | None:
    """Parse an ISO-formatted timestamp, returning ``None`` when it is empty or invalid.

    Surrounding whitespace is ignored; parsing is delegated to
    ``datetime.fromisoformat``.
    """
    if not value:
        return None
    cleaned = value.strip()
    try:
        parsed = datetime.fromisoformat(cleaned)
    except ValueError:
        # Not an ISO timestamp (or only whitespace): report "no timestamp".
        parsed = None
    return parsed
def collect_stats(label: str, path: Path, delimiter: str, assume_header: bool) -> DatasetStats:
    """Gather size, column and timestamp statistics for one CSV file.

    The first line provides the column list. When ``assume_header`` is true
    and that line is not empty, it is treated as a header and excluded from
    the row count; otherwise it is counted as the first data row. The first
    cell of every data row is passed to ``parse_timestamp`` and the earliest
    and latest parseable values are recorded.

    Args:
        label: Display name stored in the result.
        path: CSV file to read.
        delimiter: Column separator handed to ``csv.reader``.
        assume_header: Whether the first line should be treated as a header.

    Returns:
        A ``DatasetStats`` describing the file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    columns: list[str] = []
    row_count = 0
    first_ts: datetime | None = None
    last_ts: datetime | None = None
    header_detected = False
    size_bytes = path.stat().st_size

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise stay glued to the first cell of the file
    # (corrupting the first column name or the first timestamp).
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        first_line = next(reader, None)
        if first_line is not None:
            columns = [col.strip() for col in first_line]
            header_detected = bool(columns) and assume_header
            # Without a header, the first line is itself a data row and goes
            # through the same counting/timestamp logic as the remaining rows.
            leading_rows = [] if header_detected else [first_line]
            for rows in (leading_rows, reader):
                for row in rows:
                    row_count += 1
                    ts = parse_timestamp(row[0]) if row else None
                    if ts is None:
                        continue
                    # NOTE(review): ordering comparisons between naive and
                    # timezone-aware datetimes raise TypeError, so a file
                    # mixing both forms would fail here.
                    if first_ts is None or ts < first_ts:
                        first_ts = ts
                    if last_ts is None or ts > last_ts:
                        last_ts = ts

    return DatasetStats(
        label=label,
        path=path,
        size_bytes=size_bytes,
        columns=columns,
        row_count=row_count,
        header_detected=header_detected,
        first_timestamp=first_ts,
        last_timestamp=last_ts,
    )
def format_path(path: Path) -> str:
    """Return ``path`` as text, relative to ``ROOT`` when it lies under it."""
    try:
        shown = path.relative_to(ROOT)
    except ValueError:
        # The path is not located under ROOT: display it unchanged.
        shown = path
    return str(shown)
def display_stats(stats: DatasetStats) -> None:
    """Print a short, human-readable report for one dataset on standard output."""

    def render(moment: datetime | None) -> str:
        # A missing timestamp is displayed as a dash.
        return moment.isoformat(sep=" ") if moment else "-"

    if stats.columns:
        header = ", ".join(stats.columns)
    else:
        header = "-"
    header_note = " (en-tête détecté)" if stats.header_detected else ""
    first_ts = render(stats.first_timestamp)
    last_ts = render(stats.last_timestamp)

    report = (
        f"\n=== {stats.label}",
        f"- Fichier : {format_path(stats.path)}",
        f"- Taille : {human_size(stats.size_bytes)} ({format_int(stats.size_bytes)} octets)",
        f"- Colonnes : {len(stats.columns)}{header_note}",
        f" {header}",
        f"- Lignes (hors en-tête) : {format_int(stats.row_count)}",
        f"- Premier enregistrement : {first_ts}",
        f"- Dernier enregistrement : {last_ts}",
    )
    for line in report:
        print(line)
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Returns:
        A namespace with ``files`` (list of paths, possibly empty),
        ``delimiter`` (string, ``","`` by default) and ``no_header`` (bool).
    """
    parser = argparse.ArgumentParser(
        description="Affiche quelques statistiques basiques sur les CSV météo (taille, colonnes, lignes).",
    )
    # Each entry is (positional flags, keyword options) for ``add_argument``.
    arguments = (
        (
            ("files",),
            {
                "nargs": "*",
                "type": Path,
                "help": "Fichiers CSV à inspecter (par défaut : les CSV bruts et agrégés du dossier data/).",
            },
        ),
        (
            ("--delimiter",),
            {
                "default": ",",
                "help": "Délimiteur des colonnes (par défaut : ',').",
            },
        ),
        (
            ("--no-header",),
            {
                "action": "store_true",
                "help": "Ne pas soustraire la première ligne lors du comptage des lignes.",
            },
        ),
    )
    for flags, options in arguments:
        parser.add_argument(*flags, **options)
    return parser.parse_args()
def main() -> None:
    """Entry point: print statistics for each requested (or default) CSV file.

    Files that do not exist are reported and skipped.
    """
    args = parse_args()
    if args.files:
        # Files given explicitly are labelled with their own file name.
        targets = [(path.name, path) for path in args.files]
    else:
        targets = DEFAULT_TARGETS
    assume_header = not args.no_header

    for label, path in targets:
        try:
            stats = collect_stats(label, path, delimiter=args.delimiter, assume_header=assume_header)
        except FileNotFoundError:
            print(f"\n✘ Fichier introuvable : {format_path(path)}")
        else:
            display_stats(stats)
# Run the report only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()