You've already forked etude_lego_jurassic_world
Premiers éléments de l'étude
This commit is contained in:
122
lib/rebrickable/stats.py
Normal file
122
lib/rebrickable/stats.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""Calcul des statistiques de base sur les sets LEGO filtrés."""
|
||||
|
||||
import csv
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Sequence, Tuple
|
||||
|
||||
from lib.filesystem import ensure_parent_dir
|
||||
|
||||
def read_rows(path: Path) -> List[dict]:
    """Load a CSV file into memory as a list of dictionaries.

    The first line of the file is used as the header; every following line
    becomes one dictionary keyed by the header names.

    Args:
        path: Location of the CSV file to read.

    Returns:
        One dictionary per data row, in file order.
    """
    # newline="" lets the csv module handle line endings itself (required for
    # quoted fields containing line breaks); the explicit encoding keeps
    # accented text readable regardless of the platform's default locale.
    with path.open(newline="", encoding="utf-8") as csv_file:
        return list(csv.DictReader(csv_file))
|
||||
|
||||
|
||||
def write_stats_csv(destination_path: Path, stats: Sequence[Tuple[str, str]]) -> None:
    """Write the statistics to a two-column CSV file.

    The file starts with the header row ``libelle,valeur`` followed by one
    row per ``(label, value)`` pair, in the order given.

    Args:
        destination_path: File to create or overwrite.
        stats: ``(label, value)`` pairs to write.
    """
    # NOTE(review): ensure_parent_dir comes from lib.filesystem; presumably it
    # creates the missing parent directories - confirm against that module.
    ensure_parent_dir(destination_path)
    # Explicit UTF-8: the labels contain accented characters, and the default
    # encoding of open() depends on the platform locale.
    with destination_path.open("w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["libelle", "valeur"])
        writer.writerows([label, value] for label, value in stats)
|
||||
|
||||
|
||||
def compute_median(values: List[int]) -> float:
    """Return the median of a collection of integer values.

    Args:
        values: Values to summarise; they do not need to be sorted and the
            input is left unmodified.

    Returns:
        The middle value for an odd number of values, or the mean of the two
        middle values for an even number, always as a float.

    Raises:
        ValueError: If ``values`` is empty, since the median is undefined.
    """
    ordered = sorted(values)
    if not ordered:
        raise ValueError("compute_median() requires at least one value")
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return float(ordered[middle])
    return (ordered[middle - 1] + ordered[middle]) / 2
|
||||
|
||||
|
||||
def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or ``0.0`` when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _parts_by_collection_flag(rows: List[dict], flag: str) -> List[int]:
    """Return the part counts of the rows whose ``in_collection`` equals ``flag``."""
    return [int(row["num_parts"]) for row in rows if row["in_collection"] == flag]


def compute_basic_stats(
    themes: Iterable[dict],
    all_sets: Iterable[dict],
    filtered_sets: Iterable[dict],
    enriched_sets: Iterable[dict],
) -> List[Tuple[str, str]]:
    """Compute the headline statistics for the loaded sets.

    Args:
        themes: Rows of the full theme catalogue; only their number is used.
        all_sets: Rows of the full set catalogue; only their number is used.
        filtered_sets: Rows of the sets kept after filtering. Each row must
            provide ``set_num``, ``name``, ``year``, ``num_parts`` and
            ``theme_id``; ``year`` and ``num_parts`` must parse as integers.
        enriched_sets: Rows providing ``num_parts`` and ``in_collection``.
            Rows whose ``in_collection`` is ``"true"`` count as owned, rows
            with ``"false"`` count as missing; any other value is ignored.

    Returns:
        ``(label, value)`` pairs, already formatted as strings, in display
        order. Ratios whose denominator is zero (no themes, empty catalogue,
        no owned or no missing sets, zero total parts) are reported as 0
        instead of raising ``ZeroDivisionError``.

    Raises:
        ValueError: If ``filtered_sets`` is empty, because the year range and
            the extreme sets cannot be determined.
    """
    theme_count_total = len(list(themes))
    total_sets = len(list(all_sets))
    filtered_sets_list = list(filtered_sets)
    enriched_sets_list = list(enriched_sets)

    filtered_sets_count = len(filtered_sets_list)
    if not filtered_sets_count:
        raise ValueError("filtered_sets is empty: statistics cannot be computed")

    # Catalogue-wide figures.
    avg_sets_per_theme = _safe_ratio(total_sets, theme_count_total)
    percent_filtered = _safe_ratio(filtered_sets_count, total_sets) * 100

    # Part counts over the filtered sets.
    parts_per_set = [int(row["num_parts"]) for row in filtered_sets_list]
    total_parts = sum(parts_per_set)
    avg_parts_per_set = total_parts / filtered_sets_count
    median_parts_per_set = compute_median(parts_per_set)

    # Release years over the filtered sets.
    years = [int(row["year"]) for row in filtered_sets_list]
    avg_sets_per_year = filtered_sets_count / len(set(years))
    min_year = str(min(years))
    max_year = str(max(years))
    year_counts = {}
    for year in years:
        year_counts[year] = year_counts.get(year, 0) + 1
    # Highest count wins; on a tie the earliest year is kept (hence -year).
    prolific_year, prolific_count = max(
        year_counts.items(), key=lambda item: (item[1], -item[0])
    )

    theme_ids_filtered = {row["theme_id"] for row in filtered_sets_list}

    # Extreme sets. Year ties are broken by comparing set_num as a string.
    richest_set = max(filtered_sets_list, key=lambda row: int(row["num_parts"]))
    lightest_set = min(filtered_sets_list, key=lambda row: int(row["num_parts"]))
    oldest_set = min(filtered_sets_list, key=lambda row: (int(row["year"]), row["set_num"]))
    latest_set = max(filtered_sets_list, key=lambda row: (int(row["year"]), row["set_num"]))

    # Collection figures; the set counts are simply the lengths of the part lists.
    owned_parts = _parts_by_collection_flag(enriched_sets_list, "true")
    missing_parts = _parts_by_collection_flag(enriched_sets_list, "false")
    owned_sets_count = len(owned_parts)
    missing_sets_count = len(missing_parts)
    total_parts_owned = sum(owned_parts)
    percent_owned = (owned_sets_count / filtered_sets_count) * 100
    # Either list is empty when the collection is empty or complete.
    avg_parts_owned = _safe_ratio(total_parts_owned, owned_sets_count)
    avg_parts_missing = _safe_ratio(sum(missing_parts), missing_sets_count)
    percent_parts_owned = _safe_ratio(total_parts_owned, total_parts) * 100

    return [
        ("Nombre total de sets (catalogue complet)", str(total_sets)),
        ("Nombre total de thèmes (catalogue complet)", str(theme_count_total)),
        ("Nombre de sets après filtrage (thèmes ciblés)", str(filtered_sets_count)),
        ("Nombre moyen de sets par thème (catalogue complet)", f"{avg_sets_per_theme:.2f}"),
        ("Pourcentage des sets filtrés vs total", f"{percent_filtered:.2f}%"),
        ("Taux de possession (thèmes filtrés)", f"{percent_owned:.2f}%"),
        ("Sets dans la collection", str(owned_sets_count)),
        ("Sets manquants pour la collection", str(missing_sets_count)),
        ("Nombre moyen de pièces par set (thèmes filtrés)", f"{avg_parts_per_set:.2f}"),
        ("Médiane de pièces par set (thèmes filtrés)", f"{median_parts_per_set:.2f}"),
        ("Nombre moyen de sets commercialisés par an (thèmes filtrés)", f"{avg_sets_per_year:.2f}"),
        ("Total de pièces pour les thèmes filtrés", str(total_parts)),
        ("Total de pièces des sets possédés", str(total_parts_owned)),
        ("Pourcentage de pièces possédées (thèmes filtrés)", f"{percent_parts_owned:.2f}%"),
        ("Nombre de thèmes filtrés", str(len(theme_ids_filtered))),
        ("Première année de sortie (thèmes filtrés)", min_year),
        ("Dernière année de sortie (thèmes filtrés)", max_year),
        ("Année la plus prolifique (thèmes filtrés)", f"{prolific_year} ({prolific_count} sets)"),
        (
            "Set avec le plus de pièces (thèmes filtrés)",
            f"{richest_set['set_num']} - {richest_set['name']} ({richest_set['num_parts']} pièces)",
        ),
        (
            "Set avec le moins de pièces (thèmes filtrés)",
            f"{lightest_set['set_num']} - {lightest_set['name']} ({lightest_set['num_parts']} pièces)",
        ),
        (
            "Set le plus ancien (thèmes filtrés)",
            f"{oldest_set['set_num']} - {oldest_set['name']} ({oldest_set['year']})",
        ),
        (
            "Set le plus récent (thèmes filtrés)",
            f"{latest_set['set_num']} - {latest_set['name']} ({latest_set['year']})",
        ),
        (
            "Nombre moyen de pièces des sets possédés",
            f"{avg_parts_owned:.2f}",
        ),
        (
            "Nombre moyen de pièces des sets manquants",
            f"{avg_parts_missing:.2f}",
        ),
    ]
|
||||
Reference in New Issue
Block a user