# Listing metadata: 123 lines, 5.9 KiB, Python

"""Calcul des statistiques de base sur les sets LEGO filtrés."""
import csv
from pathlib import Path
from typing import Iterable, List, Sequence, Tuple
from lib.filesystem import ensure_parent_dir
def read_rows(path: Path) -> List[dict]:
    """Load a CSV file into memory as a list of dictionaries.

    The first line of the file is used as the header; every following record
    becomes one dictionary keyed by those column names.

    Args:
        path: Location of the CSV file to read.

    Returns:
        One dictionary per data row, in file order.
    """
    # newline="" lets the csv module do its own line-ending handling, so
    # "\r\n" sequences embedded in quoted fields are returned unchanged
    # instead of being rewritten by universal-newline translation.
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm whether an explicit encoding (e.g. UTF-8) should be forced.
    with path.open(newline="") as csv_file:
        return list(csv.DictReader(csv_file))
def write_stats_csv(destination_path: Path, stats: Sequence[Tuple[str, str]]) -> None:
    """Write the statistics to a two-column CSV file.

    ``ensure_parent_dir`` is called on the destination first (project helper;
    presumably it creates the missing parent directories — confirm there).
    The file then receives a ``libelle,valeur`` header row followed by one
    row per ``(label, value)`` pair, in the order given.

    Args:
        destination_path: File to create or overwrite.
        stats: Pairs of (label, value) to write, one per row.
    """
    ensure_parent_dir(destination_path)
    header = ["libelle", "valeur"]
    with destination_path.open("w", newline="") as output:
        csv_writer = csv.writer(output)
        csv_writer.writerow(header)
        # Unpacking each entry keeps the strict "exactly two items" contract.
        csv_writer.writerows([label, value] for label, value in stats)
def compute_median(values: List[int]) -> float:
    """Return the median of a list of integers as a float.

    The input is not modified; a sorted copy is used. For an odd number of
    values the middle one is returned, otherwise the mean of the two central
    values.
    """
    ordered = sorted(values)
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return float(ordered[half])
    lower, upper = ordered[half - 1], ordered[half]
    return (lower + upper) / 2
def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or ``0.0`` when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _describe_set(row: dict, detail: str) -> str:
    """Format a set row as ``<set_num> - <name> (<detail>)``."""
    return f"{row['set_num']} - {row['name']} ({detail})"


def compute_basic_stats(
    themes: Iterable[dict],
    all_sets: Iterable[dict],
    filtered_sets: Iterable[dict],
    enriched_sets: Iterable[dict],
) -> List[Tuple[str, str]]:
    """Compute the headline statistics for the loaded catalogue and collection.

    Ratios whose denominator can legitimately be zero (no theme, no set in
    the catalogue, no owned set, no missing set, zero parts in total) are
    reported as 0 instead of raising ``ZeroDivisionError``.

    Args:
        themes: Theme rows of the full catalogue; only their number is used.
        all_sets: Set rows of the full catalogue; only their number is used.
        filtered_sets: Set rows kept after filtering. Each row must provide
            ``set_num``, ``name``, ``year``, ``num_parts`` and ``theme_id``.
        enriched_sets: Set rows providing ``num_parts`` and ``in_collection``.
            Rows whose ``in_collection`` is ``"true"`` count as owned, rows
            with ``"false"`` as missing; any other value counts as neither.

    Returns:
        ``(label, value)`` pairs, both already formatted as strings, in a
        fixed presentation order.

    Raises:
        ZeroDivisionError: If ``filtered_sets`` is empty.
        KeyError: If a row lacks one of the columns read here.
        ValueError: If ``num_parts`` or ``year`` cannot be parsed as an int.
    """
    # Materialise in argument order so one-shot iterables are consumed once.
    theme_count_total = len(list(themes))
    total_sets = len(list(all_sets))
    filtered_sets_list = list(filtered_sets)
    enriched_sets_list = list(enriched_sets)
    filtered_sets_count = len(filtered_sets_list)

    avg_sets_per_theme = _safe_ratio(total_sets, theme_count_total)
    percent_filtered = _safe_ratio(filtered_sets_count, total_sets) * 100

    # One pass per ownership state gives both the counts and the part totals.
    owned_parts = [int(row["num_parts"]) for row in enriched_sets_list if row["in_collection"] == "true"]
    missing_parts = [int(row["num_parts"]) for row in enriched_sets_list if row["in_collection"] == "false"]
    owned_sets_count = len(owned_parts)
    missing_sets_count = len(missing_parts)

    # Deliberately unguarded: with no filtered set there is no year range and
    # no extreme set to report, so the function keeps failing here as before.
    percent_owned = (owned_sets_count / filtered_sets_count) * 100

    parts_per_set = [int(row["num_parts"]) for row in filtered_sets_list]
    total_parts = sum(parts_per_set)
    avg_parts_per_set = total_parts / filtered_sets_count
    median_parts_per_set = compute_median(parts_per_set)

    years = [int(row["year"]) for row in filtered_sets_list]
    year_counts: dict = {}
    for year in years:
        year_counts[year] = year_counts.get(year, 0) + 1
    avg_sets_per_year = filtered_sets_count / len(year_counts)
    min_year = str(min(years))
    max_year = str(max(years))
    # Highest count wins; on a tie the earliest year is preferred.
    prolific_year, prolific_count = max(year_counts.items(), key=lambda item: (item[1], -item[0]))

    theme_ids_filtered = {row["theme_id"] for row in filtered_sets_list}

    # On equal part counts, max()/min() keep the first matching row.
    richest_set = max(filtered_sets_list, key=lambda row: int(row["num_parts"]))
    lightest_set = min(filtered_sets_list, key=lambda row: int(row["num_parts"]))
    # Sets released the same year are ordered by their set number string.
    oldest_set = min(filtered_sets_list, key=lambda row: (int(row["year"]), row["set_num"]))
    latest_set = max(filtered_sets_list, key=lambda row: (int(row["year"]), row["set_num"]))

    total_parts_owned = sum(owned_parts)
    # A collection may own none (or all) of the sets, or total zero parts.
    avg_parts_owned = _safe_ratio(total_parts_owned, owned_sets_count)
    avg_parts_missing = _safe_ratio(sum(missing_parts), missing_sets_count)
    percent_parts_owned = _safe_ratio(total_parts_owned, total_parts) * 100

    return [
        ("Nombre total de sets (catalogue complet)", str(total_sets)),
        ("Nombre total de thèmes (catalogue complet)", str(theme_count_total)),
        ("Nombre de sets après filtrage (thèmes ciblés)", str(filtered_sets_count)),
        ("Nombre moyen de sets par thème (catalogue complet)", f"{avg_sets_per_theme:.2f}"),
        ("Pourcentage des sets filtrés vs total", f"{percent_filtered:.2f}%"),
        ("Taux de possession (thèmes filtrés)", f"{percent_owned:.2f}%"),
        ("Sets dans la collection", str(owned_sets_count)),
        ("Sets manquants pour la collection", str(missing_sets_count)),
        ("Nombre moyen de pièces par set (thèmes filtrés)", f"{avg_parts_per_set:.2f}"),
        ("Médiane de pièces par set (thèmes filtrés)", f"{median_parts_per_set:.2f}"),
        ("Nombre moyen de sets commercialisés par an (thèmes filtrés)", f"{avg_sets_per_year:.2f}"),
        ("Total de pièces pour les thèmes filtrés", str(total_parts)),
        ("Total de pièces des sets possédés", str(total_parts_owned)),
        ("Pourcentage de pièces possédées (thèmes filtrés)", f"{percent_parts_owned:.2f}%"),
        ("Nombre de thèmes filtrés", str(len(theme_ids_filtered))),
        ("Première année de sortie (thèmes filtrés)", min_year),
        ("Dernière année de sortie (thèmes filtrés)", max_year),
        ("Année la plus prolifique (thèmes filtrés)", f"{prolific_year} ({prolific_count} sets)"),
        (
            "Set avec le plus de pièces (thèmes filtrés)",
            _describe_set(richest_set, f"{richest_set['num_parts']} pièces"),
        ),
        (
            "Set avec le moins de pièces (thèmes filtrés)",
            _describe_set(lightest_set, f"{lightest_set['num_parts']} pièces"),
        ),
        (
            "Set le plus ancien (thèmes filtrés)",
            _describe_set(oldest_set, f"{oldest_set['year']}"),
        ),
        (
            "Set le plus récent (thèmes filtrés)",
            _describe_set(latest_set, f"{latest_set['year']}"),
        ),
        (
            "Nombre moyen de pièces des sets possédés",
            f"{avg_parts_owned:.2f}",
        ),
        (
            "Nombre moyen de pièces des sets manquants",
            f"{avg_parts_missing:.2f}",
        ),
    ]