Basic statistics script
parent a4d3ce7b49
commit 8979f48c23
scripts/report_dataset_stats.py (Normal file, 166 lines added)
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""Print a few basic statistics about the weather CSV files."""

from __future__ import annotations

import argparse
import csv
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
DEFAULT_TARGETS = [
    ("Données brutes complètes", ROOT / "data/weather_raw_full.csv"),
    ("Données agrégées par minute", ROOT / "data/weather_minutely.csv"),
]


@dataclass
class DatasetStats:
    label: str
    path: Path
    size_bytes: int
    columns: list[str]
    row_count: int
    header_detected: bool
    first_timestamp: datetime | None
    last_timestamp: datetime | None


def human_size(num_bytes: int) -> str:
    """Return a human-readable size (o/Ko/Mo/Go) from a number of bytes."""
    units = ["o", "Ko", "Mo", "Go", "To"]
    size = float(num_bytes)
    for unit in units:
        if size < 1024 or unit == units[-1]:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{num_bytes} o"


def format_int(value: int) -> str:
    """Format an integer with spaces as thousands separators."""
    return f"{value:,}".replace(",", " ")


def parse_timestamp(value: str) -> datetime | None:
    if not value:
        return None
    try:
        return datetime.fromisoformat(value.strip())
    except ValueError:
        return None


def collect_stats(label: str, path: Path, delimiter: str, assume_header: bool) -> DatasetStats:
    if not path.exists():
        raise FileNotFoundError(path)

    columns: list[str] = []
    row_count = 0
    first_ts: datetime | None = None
    last_ts: datetime | None = None
    size_bytes = path.stat().st_size

    with path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        first_line = next(reader, None)

        if first_line is None:
            header_detected = False
        else:
            columns = [col.strip() for col in first_line]
            header_detected = bool(columns) and assume_header

        # Determine the first data row based on the header mode.
        data_row = next(reader, None) if header_detected else first_line
        if data_row is not None:
            row_count = 1
            ts = parse_timestamp(data_row[0]) if data_row else None
            if ts:
                first_ts = last_ts = ts

        for row in reader:
            row_count += 1
            ts = parse_timestamp(row[0]) if row else None
            if ts:
                if first_ts is None or ts < first_ts:
                    first_ts = ts
                if last_ts is None or ts > last_ts:
                    last_ts = ts

    return DatasetStats(
        label=label,
        path=path,
        size_bytes=size_bytes,
        columns=columns,
        row_count=row_count,
        header_detected=header_detected,
        first_timestamp=first_ts,
        last_timestamp=last_ts,
    )


def format_path(path: Path) -> str:
    try:
        return str(path.relative_to(ROOT))
    except ValueError:
        return str(path)


def display_stats(stats: DatasetStats) -> None:
    header = ", ".join(stats.columns) if stats.columns else "-"
    header_note = " (en-tête détecté)" if stats.header_detected else ""
    first_ts = stats.first_timestamp.isoformat(sep=" ") if stats.first_timestamp else "-"
    last_ts = stats.last_timestamp.isoformat(sep=" ") if stats.last_timestamp else "-"

    print(f"\n=== {stats.label}")
    print(f"- Fichier : {format_path(stats.path)}")
    print(f"- Taille : {human_size(stats.size_bytes)} ({format_int(stats.size_bytes)} octets)")
    print(f"- Colonnes : {len(stats.columns)}{header_note}")
    print(f" {header}")
    print(f"- Lignes (hors en-tête) : {format_int(stats.row_count)}")
    print(f"- Premier enregistrement : {first_ts}")
    print(f"- Dernier enregistrement : {last_ts}")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Affiche quelques statistiques basiques sur les CSV météo (taille, colonnes, lignes).",
    )
    parser.add_argument(
        "files",
        nargs="*",
        type=Path,
        help="Fichiers CSV à inspecter (par défaut : les CSV bruts et agrégés du dossier data/).",
    )
    parser.add_argument(
        "--delimiter",
        default=",",
        help="Délimiteur des colonnes (par défaut : ',').",
    )
    parser.add_argument(
        "--no-header",
        action="store_true",
        help="Ne pas soustraire la première ligne lors du comptage des lignes.",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    targets = [(path.name, path) for path in args.files] if args.files else DEFAULT_TARGETS
    assume_header = not args.no_header

    for label, path in targets:
        try:
            stats = collect_stats(label, path, delimiter=args.delimiter, assume_header=assume_header)
        except FileNotFoundError:
            print(f"\n✘ Fichier introuvable : {format_path(path)}")
            continue
        display_stats(stats)


if __name__ == "__main__":
    main()
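For reference, a minimal usage sketch of the functions added by this commit, kept separate from the committed file. It assumes the scripts/ directory is on PYTHONPATH so that report_dataset_stats is importable, and it fabricates a throwaway CSV whose name and columns are purely illustrative:

import csv
import tempfile
from pathlib import Path

# Assumption: scripts/ is on PYTHONPATH; otherwise load the module by its file path.
from report_dataset_stats import collect_stats, display_stats

# Build a tiny CSV whose first column is an ISO-8601 timestamp, as collect_stats expects.
with tempfile.TemporaryDirectory() as tmp:
    sample = Path(tmp) / "sample_weather.csv"
    with sample.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(["timestamp", "temperature", "humidity"])
        writer.writerow(["2024-01-01 00:00:00", "3.2", "81"])
        writer.writerow(["2024-01-01 00:01:00", "3.1", "82"])

    stats = collect_stats("Exemple", sample, delimiter=",", assume_header=True)
    display_stats(stats)  # prints size, column list, row count and first/last timestamps

The same checks are available from the command line through the argparse interface defined above, e.g. python scripts/report_dataset_stats.py data/weather_minutely.csv --delimiter ','.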