"""Calcul des statistiques de base sur les sets LEGO filtrés.""" import csv from pathlib import Path from typing import Iterable, List, Sequence, Tuple from lib.filesystem import ensure_parent_dir def read_rows(path: Path) -> List[dict]: """Charge un fichier CSV en mémoire sous forme de dictionnaires.""" with path.open() as csv_file: reader = csv.DictReader(csv_file) return list(reader) def write_stats_csv(destination_path: Path, stats: Sequence[Tuple[str, str]]) -> None: """Écrit les statistiques dans un CSV à deux colonnes.""" ensure_parent_dir(destination_path) with destination_path.open("w", newline="") as csv_file: writer = csv.writer(csv_file) writer.writerow(["libelle", "valeur"]) for label, value in stats: writer.writerow([label, value]) def compute_median(values: List[int]) -> float: """Calcule la médiane d'une liste de valeurs entières.""" sorted_values = sorted(values) middle = len(sorted_values) // 2 if len(sorted_values) % 2 == 1: return float(sorted_values[middle]) return (sorted_values[middle - 1] + sorted_values[middle]) / 2 def compute_basic_stats( themes: Iterable[dict], all_sets: Iterable[dict], filtered_sets: Iterable[dict], enriched_sets: Iterable[dict], ) -> List[Tuple[str, str]]: """Calcule les statistiques principales à partir des sets chargés.""" themes_list = list(themes) all_sets_list = list(all_sets) filtered_sets_list = list(filtered_sets) enriched_sets_list = list(enriched_sets) theme_count_total = len(themes_list) total_sets = len(all_sets_list) filtered_sets_count = len(filtered_sets_list) avg_sets_per_theme = total_sets / theme_count_total percent_filtered = (filtered_sets_count / total_sets) * 100 owned_sets_count = sum(1 for row in enriched_sets_list if row["in_collection"] == "true") missing_sets_count = sum(1 for row in enriched_sets_list if row["in_collection"] == "false") percent_owned = (owned_sets_count / filtered_sets_count) * 100 parts_per_set = [int(row["num_parts"]) for row in filtered_sets_list] avg_parts_per_set = sum(parts_per_set) / filtered_sets_count median_parts_per_set = compute_median(parts_per_set) years = [int(row["year"]) for row in filtered_sets_list] avg_sets_per_year = filtered_sets_count / len(set(years)) total_parts = sum(parts_per_set) theme_ids_filtered = {row["theme_id"] for row in filtered_sets_list} min_year = str(min(years)) max_year = str(max(years)) year_counts = {} for year in years: year_counts[year] = year_counts.get(year, 0) + 1 prolific_year, prolific_count = max(year_counts.items(), key=lambda item: (item[1], -item[0])) richest_set = max(filtered_sets_list, key=lambda row: int(row["num_parts"])) lightest_set = min(filtered_sets_list, key=lambda row: int(row["num_parts"])) oldest_set = min(filtered_sets_list, key=lambda row: (int(row["year"]), row["set_num"])) latest_set = max(filtered_sets_list, key=lambda row: (int(row["year"]), row["set_num"])) owned_parts = [int(row["num_parts"]) for row in enriched_sets_list if row["in_collection"] == "true"] missing_parts = [int(row["num_parts"]) for row in enriched_sets_list if row["in_collection"] == "false"] avg_parts_owned = sum(owned_parts) / len(owned_parts) avg_parts_missing = sum(missing_parts) / len(missing_parts) total_parts_owned = sum(owned_parts) percent_parts_owned = (total_parts_owned / total_parts) * 100 return [ ("Nombre total de sets (catalogue complet)", str(total_sets)), ("Nombre total de thèmes (catalogue complet)", str(theme_count_total)), ("Nombre de sets après filtrage (thèmes ciblés)", str(filtered_sets_count)), ("Nombre moyen de sets par thème (catalogue complet)", f"{avg_sets_per_theme:.2f}"), ("Pourcentage des sets filtrés vs total", f"{percent_filtered:.2f}%"), ("Taux de possession (thèmes filtrés)", f"{percent_owned:.2f}%"), ("Sets dans la collection", str(owned_sets_count)), ("Sets manquants pour la collection", str(missing_sets_count)), ("Nombre moyen de pièces par set (thèmes filtrés)", f"{avg_parts_per_set:.2f}"), ("Médiane de pièces par set (thèmes filtrés)", f"{median_parts_per_set:.2f}"), ("Nombre moyen de sets commercialisés par an (thèmes filtrés)", f"{avg_sets_per_year:.2f}"), ("Total de pièces pour les thèmes filtrés", str(total_parts)), ("Total de pièces des sets possédés", str(total_parts_owned)), ("Pourcentage de pièces possédées (thèmes filtrés)", f"{percent_parts_owned:.2f}%"), ("Nombre de thèmes filtrés", str(len(theme_ids_filtered))), ("Première année de sortie (thèmes filtrés)", min_year), ("Dernière année de sortie (thèmes filtrés)", max_year), ("Année la plus prolifique (thèmes filtrés)", f"{prolific_year} ({prolific_count} sets)"), ( "Set avec le plus de pièces (thèmes filtrés)", f"{richest_set['set_num']} - {richest_set['name']} ({richest_set['num_parts']} pièces)", ), ( "Set avec le moins de pièces (thèmes filtrés)", f"{lightest_set['set_num']} - {lightest_set['name']} ({lightest_set['num_parts']} pièces)", ), ( "Set le plus ancien (thèmes filtrés)", f"{oldest_set['set_num']} - {oldest_set['name']} ({oldest_set['year']})", ), ( "Set le plus récent (thèmes filtrés)", f"{latest_set['set_num']} - {latest_set['name']} ({latest_set['year']})", ), ( "Nombre moyen de pièces des sets possédés", f"{avg_parts_owned:.2f}", ), ( "Nombre moyen de pièces des sets manquants", f"{avg_parts_missing:.2f}", ), ]