167 lines
5.1 KiB
Python
167 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Affiche quelques statistiques basiques sur les fichiers CSV météo."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# (label, path) pairs inspected when no file is given on the command line.
DEFAULT_TARGETS = [
    ("Données brutes complètes", ROOT / "data" / "weather_raw_full.csv"),
    ("Données agrégées par minute", ROOT / "data" / "weather_minutely.csv"),
]
|
|
|
|
|
|
@dataclass
class DatasetStats:
    """Summary of one CSV file as produced by ``collect_stats``."""

    # Display name of the dataset.
    label: str
    # Location of the CSV file that was read.
    path: Path
    # File size reported by the filesystem, in bytes.
    size_bytes: int
    # Stripped cells of the first line of the file (empty for an empty file).
    columns: list[str]
    # Number of rows counted as data (the header row, if any, is excluded).
    row_count: int
    # True when the first line was treated as a header.
    header_detected: bool
    # Earliest parseable timestamp found in the first column, if any.
    first_timestamp: datetime | None
    # Latest parseable timestamp found in the first column, if any.
    last_timestamp: datetime | None
|
|
|
|
|
|
def human_size(num_bytes: int) -> str:
    """Return a readable size (o/Ko/Mo/Go/To) for a number of bytes.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached, and is always rendered with one decimal place.
    """
    *smaller_units, largest_unit = ("o", "Ko", "Mo", "Go", "To")
    remaining = float(num_bytes)
    for suffix in smaller_units:
        if remaining < 1024:
            return f"{remaining:.1f} {suffix}"
        remaining /= 1024
    # Anything this large is expressed in the biggest unit, however big it is.
    return f"{remaining:.1f} {largest_unit}"
|
|
|
|
|
|
def format_int(value: int) -> str:
    """Format an integer with spaces as thousands separators."""
    comma_grouped = format(value, ",")
    return " ".join(comma_grouped.split(","))
|
|
|
|
|
|
def parse_timestamp(value: str) -> datetime | None:
    """Parse an ISO-format timestamp, ignoring surrounding whitespace.

    Returns ``None`` for an empty value or for text that
    ``datetime.fromisoformat`` rejects.
    """
    parsed = None
    if value:
        try:
            parsed = datetime.fromisoformat(value.strip())
        except ValueError:
            parsed = None
    return parsed
|
|
|
|
|
|
def collect_stats(label: str, path: Path, delimiter: str, assume_header: bool) -> DatasetStats:
    """Scan one CSV file and gather its size, columns, row count and time span.

    Args:
        label: Display name stored unchanged in the result.
        path: CSV file to read.
        delimiter: Column separator handed to ``csv.reader``.
        assume_header: When true, a non-empty first line is treated as the
            header and left out of the row count and timestamp range.

    Returns:
        A ``DatasetStats`` for the file. ``columns`` holds the stripped cells
        of the first line whether or not it is a header (empty list for an
        empty file). The timestamp bounds are the earliest and latest values
        of the first column that ``parse_timestamp`` could read, or ``None``
        when no row had one.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    columns: list[str] = []
    header_detected = False
    row_count = 0
    first_ts: datetime | None = None
    last_ts: datetime | None = None
    size_bytes = path.stat().st_size

    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark, which would otherwise end up glued to the first
    # column name (or make the first timestamp unparseable without a header).
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        first_line = next(reader, None)

        if first_line is not None:
            columns = [col.strip() for col in first_line]
            header_detected = bool(columns) and assume_header

            # Without a header the first line is itself a data row, so it is
            # fed through the same loop as the rest of the file.
            leading_rows = [] if header_detected else [first_line]
            for rows in (leading_rows, reader):
                for row in rows:
                    # Every line is counted, including blank ones (empty rows).
                    row_count += 1
                    ts = parse_timestamp(row[0]) if row else None
                    if ts is None:
                        continue
                    # NOTE(review): a file mixing timezone-aware and naive
                    # timestamps would make these comparisons raise TypeError.
                    if first_ts is None or ts < first_ts:
                        first_ts = ts
                    if last_ts is None or ts > last_ts:
                        last_ts = ts

    return DatasetStats(
        label=label,
        path=path,
        size_bytes=size_bytes,
        columns=columns,
        row_count=row_count,
        header_detected=header_detected,
        first_timestamp=first_ts,
        last_timestamp=last_ts,
    )
|
|
|
|
|
|
def format_path(path: Path) -> str:
    """Return ``path`` relative to ``ROOT`` when possible, else unchanged."""
    try:
        shown = path.relative_to(ROOT)
    except ValueError:
        # The path is not located under ROOT; show it as given.
        shown = path
    return str(shown)
|
|
|
|
|
|
def display_stats(stats: DatasetStats) -> None:
    """Print a short report for one dataset to standard output."""
    column_list = "-"
    if stats.columns:
        column_list = ", ".join(stats.columns)

    header_suffix = ""
    if stats.header_detected:
        header_suffix = " (en-tête détecté)"

    # Missing timestamps are shown as a dash.
    earliest, latest = (
        moment.isoformat(sep=" ") if moment else "-"
        for moment in (stats.first_timestamp, stats.last_timestamp)
    )

    report = (
        f"\n=== {stats.label}",
        f"- Fichier : {format_path(stats.path)}",
        f"- Taille : {human_size(stats.size_bytes)} ({format_int(stats.size_bytes)} octets)",
        f"- Colonnes : {len(stats.columns)}{header_suffix}",
        f" {column_list}",
        f"- Lignes (hors en-tête) : {format_int(stats.row_count)}",
        f"- Premier enregistrement : {earliest}",
        f"- Dernier enregistrement : {latest}",
    )
    for line in report:
        print(line)
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    The result exposes ``files`` (list of paths, possibly empty),
    ``delimiter`` (defaults to a comma) and ``no_header`` (boolean flag).
    """
    cli = argparse.ArgumentParser(
        description="Affiche quelques statistiques basiques sur les CSV météo (taille, colonnes, lignes).",
    )
    argument_specs = (
        (
            "files",
            {
                "nargs": "*",
                "type": Path,
                "help": "Fichiers CSV à inspecter (par défaut : les CSV bruts et agrégés du dossier data/).",
            },
        ),
        (
            "--delimiter",
            {
                "default": ",",
                "help": "Délimiteur des colonnes (par défaut : ',').",
            },
        ),
        (
            "--no-header",
            {
                "action": "store_true",
                "help": "Ne pas soustraire la première ligne lors du comptage des lignes.",
            },
        ),
    )
    for name, settings in argument_specs:
        cli.add_argument(name, **settings)
    return cli.parse_args()
|
|
|
|
|
|
def main() -> None:
    """Print statistics for each requested file, or for the default datasets.

    A file that cannot be found is reported and skipped; the remaining files
    are still processed.
    """
    options = parse_args()
    if options.files:
        # Files given explicitly are labelled with their own file name.
        targets = [(candidate.name, candidate) for candidate in options.files]
    else:
        targets = DEFAULT_TARGETS

    for label, path in targets:
        try:
            stats = collect_stats(
                label,
                path,
                delimiter=options.delimiter,
                assume_header=not options.no_header,
            )
        except FileNotFoundError:
            print(f"\n✘ Fichier introuvable : {format_path(path)}")
        else:
            display_stats(stats)
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|