From d067e2075f2dd22bb2e3a89f9bca3e1abdb6498d Mon Sep 17 00:00:00 2001 From: Richard Dern Date: Tue, 2 Dec 2025 16:59:59 +0100 Subject: [PATCH] Ajoute la richesse chromatique par set --- README.md | 13 ++ lib/plots/color_richness.py | 130 ++++++++++++++++++++ lib/rebrickable/color_richness.py | 150 +++++++++++++++++++++++ scripts/compute_color_richness.py | 28 +++++ scripts/plot_color_richness.py | 26 ++++ tests/test_color_richness.py | 196 ++++++++++++++++++++++++++++++ tests/test_color_richness_plot.py | 38 ++++++ tests/test_rare_parts.py | 22 ++-- 8 files changed, 592 insertions(+), 11 deletions(-) create mode 100644 lib/plots/color_richness.py create mode 100644 lib/rebrickable/color_richness.py create mode 100644 scripts/compute_color_richness.py create mode 100644 scripts/plot_color_richness.py create mode 100644 tests/test_color_richness.py create mode 100644 tests/test_color_richness_plot.py diff --git a/README.md b/README.md index c4182fa..2b1fe07 100644 --- a/README.md +++ b/README.md @@ -285,3 +285,16 @@ Le calcul lit `data/intermediate/parts_filtered.csv`, `data/intermediate/sets_en - `data/intermediate/rare_parts_by_set.csv` : agrégat par set (comptes distincts, quantités, focus minifigs). Le tracé `figures/step27/rare_parts_per_set.png` met en scène le top des sets contenant le plus de variantes exclusives, en distinguant les pièces de minifigs et l’état de possession. + +### Étape 28 : richesse chromatique par set + +1. `source .venv/bin/activate` +2. `python -m scripts.compute_color_richness` +3. `python -m scripts.plot_color_richness` + +Le calcul lit `data/intermediate/colors_by_set.csv` et `data/intermediate/sets_enriched.csv` pour mesurer la diversité des palettes (nombre de couleurs distinctes hors rechanges, part des 3 couleurs principales, part de couleurs de minifigs). Il produit : + +- `data/intermediate/color_richness_by_set.csv` : métriques détaillées par set (comptes et parts principales, possession). +- `data/intermediate/color_richness_by_year.csv` : agrégat annuel (moyenne, médiane, bornes de diversité et concentration). + +Les graphiques `figures/step28/color_richness_boxplot.png`, `figures/step28/color_richness_top_sets.png` et `figures/step28/color_concentration_scatter.png` montrent respectivement la répartition annuelle, le top des sets les plus colorés et la concentration des palettes (part des 3 couleurs dominantes vs nombre de couleurs). diff --git a/lib/plots/color_richness.py b/lib/plots/color_richness.py new file mode 100644 index 0000000..6af58c4 --- /dev/null +++ b/lib/plots/color_richness.py @@ -0,0 +1,130 @@ +"""Visualisations de la richesse chromatique par set.""" + +from pathlib import Path +from typing import Iterable, List, Tuple + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.patches import Patch + +from lib.filesystem import ensure_parent_dir +from lib.rebrickable.stats import read_rows + + +def load_richness_rows(path: Path) -> List[dict]: + """Charge les métriques de richesse chromatique.""" + return read_rows(path) + + +def build_boxplot_data(rows: Iterable[dict]) -> Tuple[List[List[int]], List[str]]: + """Prépare les valeurs de boxplot par année.""" + grouped: dict[str, List[int]] = {} + for row in rows: + year_rows = grouped.get(row["year"]) + if year_rows is None: + year_rows = [] + grouped[row["year"]] = year_rows + year_rows.append(int(row["colors_distinct"])) + years = sorted(grouped.keys(), key=int) + data = [grouped[year] for year in years] + return data, years + + +def plot_richness_boxplot(richness_path: Path, destination_path: Path) -> None: + """Trace le boxplot du nombre de couleurs distinctes par set et par année.""" + rows = load_richness_rows(richness_path) + if not rows: + return + data, years = build_boxplot_data(rows) + fig, ax = plt.subplots(figsize=(12, 7)) + box = ax.boxplot( + data, + orientation="vertical", + patch_artist=True, + tick_labels=years, + boxprops=dict(facecolor="#1f77b4", alpha=0.3), + medianprops=dict(color="#0d0d0d", linewidth=1.5), + whiskerprops=dict(color="#555555", linestyle="--"), + capprops=dict(color="#555555"), + ) + for patch in box["boxes"]: + patch.set_edgecolor("#1f77b4") + ax.set_xlabel("Année") + ax.set_ylabel("Nombre de couleurs distinctes (hors rechanges)") + ax.set_title("Richesse chromatique par set (répartition annuelle)") + ax.grid(axis="y", linestyle="--", alpha=0.3) + + ensure_parent_dir(destination_path) + fig.tight_layout() + fig.savefig(destination_path, dpi=170) + plt.close(fig) + + +def select_top_sets(rows: Iterable[dict], limit: int = 15) -> List[dict]: + """Retient les sets les plus colorés et les plus concentrés.""" + sorted_rows = sorted( + rows, + key=lambda row: (-int(row["colors_distinct"]), float(row["top3_share"]), row["set_num"]), + ) + return sorted_rows[:limit] + + +def plot_richness_top_sets(richness_path: Path, destination_path: Path) -> None: + """Trace le top des sets les plus riches en couleurs.""" + rows = load_richness_rows(richness_path) + if not rows: + return + top_rows = select_top_sets(rows) + y_positions = np.arange(len(top_rows)) + counts = [int(row["colors_distinct"]) for row in top_rows] + labels = [f"{row['set_num']} · {row['name']} ({row['year']})" for row in top_rows] + owned_mask = [row["in_collection"] == "true" for row in top_rows] + + fig, ax = plt.subplots(figsize=(11, 8)) + for y, value, owned in zip(y_positions, counts, owned_mask): + alpha = 0.92 if owned else 0.45 + ax.barh(y, value, color="#2ca02c", alpha=alpha) + ax.set_yticks(y_positions) + ax.set_yticklabels(labels) + ax.invert_yaxis() + ax.set_xlabel("Couleurs distinctes (hors rechanges)") + ax.set_title("Top des sets les plus colorés") + ax.grid(axis="x", linestyle="--", alpha=0.3) + legend = [ + Patch(facecolor="#2ca02c", edgecolor="none", alpha=0.92, label="Set possédé"), + Patch(facecolor="#2ca02c", edgecolor="none", alpha=0.45, label="Set manquant"), + ] + ax.legend(handles=legend, loc="lower right", frameon=False) + + ensure_parent_dir(destination_path) + fig.tight_layout() + fig.savefig(destination_path, dpi=170) + plt.close(fig) + + +def plot_concentration_scatter(richness_path: Path, destination_path: Path) -> None: + """Visualise la concentration de palette vs nombre de couleurs.""" + rows = load_richness_rows(richness_path) + if not rows: + return + x_values = [int(row["colors_distinct"]) for row in rows] + y_values = [float(row["top3_share"]) for row in rows] + owned_mask = [row["in_collection"] == "true" for row in rows] + colors = ["#1f77b4" if owned else "#bbbbbb" for owned in owned_mask] + + fig, ax = plt.subplots(figsize=(10, 7)) + ax.scatter(x_values, y_values, c=colors, alpha=0.7, s=32) + ax.set_xlabel("Nombre de couleurs distinctes (hors rechanges)") + ax.set_ylabel("Part des 3 couleurs principales") + ax.set_title("Concentration des palettes") + ax.grid(True, linestyle="--", alpha=0.3) + legend = [ + Patch(facecolor="#1f77b4", edgecolor="none", alpha=0.7, label="Set possédé"), + Patch(facecolor="#bbbbbb", edgecolor="none", alpha=0.7, label="Set manquant"), + ] + ax.legend(handles=legend, loc="upper right", frameon=False) + + ensure_parent_dir(destination_path) + fig.tight_layout() + fig.savefig(destination_path, dpi=170) + plt.close(fig) diff --git a/lib/rebrickable/color_richness.py b/lib/rebrickable/color_richness.py new file mode 100644 index 0000000..89d94b9 --- /dev/null +++ b/lib/rebrickable/color_richness.py @@ -0,0 +1,150 @@ +"""Métriques de richesse chromatique par set.""" + +import csv +from pathlib import Path +from typing import Dict, Iterable, List, Sequence + +from lib.filesystem import ensure_parent_dir +from lib.rebrickable.stats import compute_median, read_rows + + +def load_colors_by_set(path: Path) -> List[dict]: + """Charge colors_by_set.csv en mémoire.""" + return read_rows(path) + + +def load_sets(path: Path) -> Dict[str, dict]: + """Indexe les sets enrichis par set_num.""" + sets: Dict[str, dict] = {} + with path.open() as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + sets[row["set_num"]] = row + return sets + + +def group_by_set(rows: Iterable[dict]) -> Dict[str, List[dict]]: + """Regroupe les couleurs par set.""" + grouped: Dict[str, List[dict]] = {} + for row in rows: + set_rows = grouped.get(row["set_num"]) + if set_rows is None: + set_rows = [] + grouped[row["set_num"]] = set_rows + set_rows.append(row) + return grouped + + +def build_richness_by_set( + colors_by_set_path: Path, + sets_enriched_path: Path, +) -> List[dict]: + """Construit les métriques de richesse chromatique par set.""" + colors = load_colors_by_set(colors_by_set_path) + sets_lookup = load_sets(sets_enriched_path) + grouped = group_by_set(colors) + richness: List[dict] = [] + for set_num, set_rows in grouped.items(): + total_non_spare = sum(int(row["quantity_non_spare"]) for row in set_rows) + colors_distinct = len(set_rows) + colors_minifig = sum(1 for row in set_rows if int(row["quantity_minifig"]) > 0) + colors_non_minifig = sum(1 for row in set_rows if int(row["quantity_non_minifig"]) > 0) + sorted_by_quantity = sorted(set_rows, key=lambda row: int(row["quantity_non_spare"]), reverse=True) + top_color = sorted_by_quantity[0] + top3_total = sum(int(row["quantity_non_spare"]) for row in sorted_by_quantity[:3]) + top_share = int(top_color["quantity_non_spare"]) / total_non_spare + top3_share = top3_total / total_non_spare + set_row = sets_lookup[set_num] + richness.append( + { + "set_num": set_num, + "set_id": set_row["set_id"], + "name": set_row["name"], + "year": set_row["year"], + "in_collection": set_row["in_collection"], + "colors_distinct": str(colors_distinct), + "colors_minifig": str(colors_minifig), + "colors_non_minifig": str(colors_non_minifig), + "total_parts_non_spare": str(total_non_spare), + "top_color_name": top_color["color_name"], + "top_color_share": f"{top_share:.4f}", + "top3_share": f"{top3_share:.4f}", + } + ) + richness.sort(key=lambda row: (-int(row["colors_distinct"]), row["set_num"])) + return richness + + +def build_richness_by_year(richness_rows: Iterable[dict]) -> List[dict]: + """Agrège les métriques de richesse par année.""" + grouped: Dict[str, List[dict]] = {} + for row in richness_rows: + year_rows = grouped.get(row["year"]) + if year_rows is None: + year_rows = [] + grouped[row["year"]] = year_rows + year_rows.append(row) + yearly: List[dict] = [] + for year, rows in grouped.items(): + distinct_counts = [int(row["colors_distinct"]) for row in rows] + top3_shares = [float(row["top3_share"]) for row in rows] + average_distinct = sum(distinct_counts) / len(distinct_counts) + median_distinct = compute_median(distinct_counts) + average_top3 = sum(top3_shares) / len(top3_shares) + median_top3 = compute_median([int(share * 10000) for share in top3_shares]) / 10000 + yearly.append( + { + "year": year, + "average_colors_distinct": f"{average_distinct:.2f}", + "median_colors_distinct": f"{median_distinct:.2f}", + "max_colors_distinct": str(max(distinct_counts)), + "min_colors_distinct": str(min(distinct_counts)), + "average_top3_share": f"{average_top3:.4f}", + "median_top3_share": f"{median_top3:.4f}", + } + ) + yearly.sort(key=lambda row: int(row["year"])) + return yearly + + +def write_richness_by_set(destination_path: Path, rows: Sequence[dict]) -> None: + """Écrit le CSV des métriques par set.""" + ensure_parent_dir(destination_path) + fieldnames = [ + "set_num", + "set_id", + "name", + "year", + "in_collection", + "colors_distinct", + "colors_minifig", + "colors_non_minifig", + "total_parts_non_spare", + "top_color_name", + "top_color_share", + "top3_share", + ] + with destination_path.open("w", newline="") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=fieldnames) + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def write_richness_by_year(destination_path: Path, rows: Sequence[dict]) -> None: + """Écrit le CSV agrégé par année.""" + ensure_parent_dir(destination_path) + fieldnames = [ + "year", + "average_colors_distinct", + "median_colors_distinct", + "max_colors_distinct", + "min_colors_distinct", + "average_top3_share", + "median_top3_share", + ] + with destination_path.open("w", newline="") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=fieldnames) + writer.writeheader() + for row in rows: + writer.writerow(row) diff --git a/scripts/compute_color_richness.py b/scripts/compute_color_richness.py new file mode 100644 index 0000000..11c085c --- /dev/null +++ b/scripts/compute_color_richness.py @@ -0,0 +1,28 @@ +"""Calcule la richesse chromatique par set et par année.""" + +from pathlib import Path + +from lib.rebrickable.color_richness import ( + build_richness_by_set, + build_richness_by_year, + write_richness_by_set, + write_richness_by_year, +) + + +COLORS_BY_SET_PATH = Path("data/intermediate/colors_by_set.csv") +SETS_ENRICHED_PATH = Path("data/intermediate/sets_enriched.csv") +RICHNESS_BY_SET_PATH = Path("data/intermediate/color_richness_by_set.csv") +RICHNESS_BY_YEAR_PATH = Path("data/intermediate/color_richness_by_year.csv") + + +def main() -> None: + """Construit les CSV de richesse chromatique.""" + richness_by_set = build_richness_by_set(COLORS_BY_SET_PATH, SETS_ENRICHED_PATH) + richness_by_year = build_richness_by_year(richness_by_set) + write_richness_by_set(RICHNESS_BY_SET_PATH, richness_by_set) + write_richness_by_year(RICHNESS_BY_YEAR_PATH, richness_by_year) + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_color_richness.py b/scripts/plot_color_richness.py new file mode 100644 index 0000000..2c56e6f --- /dev/null +++ b/scripts/plot_color_richness.py @@ -0,0 +1,26 @@ +"""Trace les graphiques de richesse chromatique par set.""" + +from pathlib import Path + +from lib.plots.color_richness import ( + plot_concentration_scatter, + plot_richness_boxplot, + plot_richness_top_sets, +) + + +RICHNESS_PATH = Path("data/intermediate/color_richness_by_set.csv") +BOXPLOT_DESTINATION = Path("figures/step28/color_richness_boxplot.png") +TOP_DESTINATION = Path("figures/step28/color_richness_top_sets.png") +CONCENTRATION_DESTINATION = Path("figures/step28/color_concentration_scatter.png") + + +def main() -> None: + """Génère les visuels de richesse chromatique.""" + plot_richness_boxplot(RICHNESS_PATH, BOXPLOT_DESTINATION) + plot_richness_top_sets(RICHNESS_PATH, TOP_DESTINATION) + plot_concentration_scatter(RICHNESS_PATH, CONCENTRATION_DESTINATION) + + +if __name__ == "__main__": + main() diff --git a/tests/test_color_richness.py b/tests/test_color_richness.py new file mode 100644 index 0000000..8daeb09 --- /dev/null +++ b/tests/test_color_richness.py @@ -0,0 +1,196 @@ +"""Tests des métriques de richesse chromatique.""" + +import csv +from pathlib import Path + +from lib.rebrickable.color_richness import ( + build_richness_by_set, + build_richness_by_year, + write_richness_by_set, + write_richness_by_year, +) + + +def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None: + """Écrit un CSV simple pour les besoins de tests.""" + with path.open("w", newline="") as csv_file: + writer = csv.writer(csv_file) + writer.writerow(headers) + writer.writerows(rows) + + +def test_build_richness_by_set_computes_shares_and_counts(tmp_path: Path) -> None: + """Calcule les partages de couleurs principales et les dénombrements.""" + colors_by_set = tmp_path / "colors_by_set.csv" + write_csv( + colors_by_set, + [ + "set_num", + "set_id", + "year", + "color_rgb", + "is_translucent", + "color_name", + "quantity_total", + "quantity_non_spare", + "quantity_minifig", + "quantity_non_minifig", + ], + [ + ["1000-1", "1000", "2020", "AAAAAA", "false", "Gray", "10", "10", "0", "10"], + ["1000-1", "1000", "2020", "BBBBBB", "false", "Blue", "5", "5", "5", "0"], + ["2000-1", "2000", "2021", "CCCCCC", "true", "Trans", "3", "3", "0", "3"], + ], + ) + sets_enriched = tmp_path / "sets_enriched.csv" + write_csv( + sets_enriched, + ["set_num", "set_id", "name", "year", "in_collection"], + [ + ["1000-1", "1000", "Set A", "2020", "true"], + ["2000-1", "2000", "Set B", "2021", "false"], + ], + ) + + richness = build_richness_by_set(colors_by_set, sets_enriched) + + assert richness == [ + { + "set_num": "1000-1", + "set_id": "1000", + "name": "Set A", + "year": "2020", + "in_collection": "true", + "colors_distinct": "2", + "colors_minifig": "1", + "colors_non_minifig": "1", + "total_parts_non_spare": "15", + "top_color_name": "Gray", + "top_color_share": "0.6667", + "top3_share": "1.0000", + }, + { + "set_num": "2000-1", + "set_id": "2000", + "name": "Set B", + "year": "2021", + "in_collection": "false", + "colors_distinct": "1", + "colors_minifig": "0", + "colors_non_minifig": "1", + "total_parts_non_spare": "3", + "top_color_name": "Trans", + "top_color_share": "1.0000", + "top3_share": "1.0000", + }, + ] + + +def test_build_richness_by_year_aggregates_metrics(tmp_path: Path) -> None: + """Agrège les métriques par année.""" + richness_rows = [ + { + "set_num": "s1", + "set_id": "1", + "name": "A", + "year": "2020", + "in_collection": "true", + "colors_distinct": "4", + "colors_minifig": "1", + "colors_non_minifig": "3", + "total_parts_non_spare": "10", + "top_color_name": "Red", + "top_color_share": "0.5000", + "top3_share": "0.9000", + }, + { + "set_num": "s2", + "set_id": "2", + "name": "B", + "year": "2020", + "in_collection": "false", + "colors_distinct": "2", + "colors_minifig": "0", + "colors_non_minifig": "2", + "total_parts_non_spare": "5", + "top_color_name": "Blue", + "top_color_share": "0.6000", + "top3_share": "1.0000", + }, + { + "set_num": "s3", + "set_id": "3", + "name": "C", + "year": "2021", + "in_collection": "true", + "colors_distinct": "3", + "colors_minifig": "1", + "colors_non_minifig": "3", + "total_parts_non_spare": "7", + "top_color_name": "Green", + "top_color_share": "0.5714", + "top3_share": "1.0000", + }, + ] + + yearly = build_richness_by_year(richness_rows) + + assert yearly == [ + { + "year": "2020", + "average_colors_distinct": "3.00", + "median_colors_distinct": "3.00", + "max_colors_distinct": "4", + "min_colors_distinct": "2", + "average_top3_share": "0.9500", + "median_top3_share": "0.9500", + }, + { + "year": "2021", + "average_colors_distinct": "3.00", + "median_colors_distinct": "3.00", + "max_colors_distinct": "3", + "min_colors_distinct": "3", + "average_top3_share": "1.0000", + "median_top3_share": "1.0000", + }, + ] + + +def test_write_richness_outputs_csv(tmp_path: Path) -> None: + """Sérialise les métriques par set et par année.""" + by_set_path = tmp_path / "color_richness_by_set.csv" + by_year_path = tmp_path / "color_richness_by_year.csv" + sample_set_rows = [ + { + "set_num": "s1", + "set_id": "1", + "name": "A", + "year": "2020", + "in_collection": "true", + "colors_distinct": "1", + "colors_minifig": "1", + "colors_non_minifig": "1", + "total_parts_non_spare": "5", + "top_color_name": "Red", + "top_color_share": "1.0000", + "top3_share": "1.0000", + } + ] + sample_year_rows = [ + { + "year": "2020", + "average_colors_distinct": "1.00", + "median_colors_distinct": "1.00", + "max_colors_distinct": "1", + "min_colors_distinct": "1", + "average_top3_share": "1.0000", + "median_top3_share": "1.0000", + } + ] + + write_richness_by_set(by_set_path, sample_set_rows) + write_richness_by_year(by_year_path, sample_year_rows) + + assert by_set_path.exists() + assert by_year_path.exists() diff --git a/tests/test_color_richness_plot.py b/tests/test_color_richness_plot.py new file mode 100644 index 0000000..eccabfb --- /dev/null +++ b/tests/test_color_richness_plot.py @@ -0,0 +1,38 @@ +"""Tests des visuels de richesse chromatique.""" + +import matplotlib +from pathlib import Path + +from lib.plots.color_richness import ( + plot_concentration_scatter, + plot_richness_boxplot, + plot_richness_top_sets, +) + + +matplotlib.use("Agg") + + +def test_plot_richness_outputs_images(tmp_path: Path) -> None: + """Génère les trois graphiques principaux.""" + richness_path = tmp_path / "color_richness_by_set.csv" + richness_path.write_text( + "set_num,set_id,name,year,in_collection,colors_distinct,colors_minifig,colors_non_minifig,total_parts_non_spare,top_color_name,top_color_share,top3_share\n" + "1000-1,1000,Set A,2020,true,6,2,5,50,Red,0.4000,0.6500\n" + "2000-1,2000,Set B,2021,false,4,1,3,30,Blue,0.5000,0.7500\n" + "3000-1,3000,Set C,2021,true,5,1,4,40,Green,0.3000,0.5500\n" + ) + boxplot_dest = tmp_path / "figures" / "step28" / "color_richness_boxplot.png" + top_dest = tmp_path / "figures" / "step28" / "color_richness_top_sets.png" + scatter_dest = tmp_path / "figures" / "step28" / "color_concentration_scatter.png" + + plot_richness_boxplot(richness_path, boxplot_dest) + plot_richness_top_sets(richness_path, top_dest) + plot_concentration_scatter(richness_path, scatter_dest) + + assert boxplot_dest.exists() + assert top_dest.exists() + assert scatter_dest.exists() + assert boxplot_dest.stat().st_size > 0 + assert top_dest.stat().st_size > 0 + assert scatter_dest.stat().st_size > 0 diff --git a/tests/test_rare_parts.py b/tests/test_rare_parts.py index 9a038be..67ed813 100644 --- a/tests/test_rare_parts.py +++ b/tests/test_rare_parts.py @@ -102,17 +102,6 @@ def test_build_rare_parts_detects_exclusive_variations(tmp_path: Path) -> None: }, ] assert rare_by_set == [ - { - "set_num": "1000-1", - "set_id": "1000", - "name": "Set A", - "year": "2020", - "in_collection": "true", - "rare_parts_distinct": "1", - "rare_parts_quantity": "1", - "rare_minifig_parts_distinct": "1", - "rare_minifig_quantity": "1", - }, { "set_num": "2000-1", "set_id": "2000", @@ -124,6 +113,17 @@ def test_build_rare_parts_detects_exclusive_variations(tmp_path: Path) -> None: "rare_minifig_parts_distinct": "0", "rare_minifig_quantity": "0", }, + { + "set_num": "1000-1", + "set_id": "1000", + "name": "Set A", + "year": "2020", + "in_collection": "true", + "rare_parts_distinct": "1", + "rare_parts_quantity": "1", + "rare_minifig_parts_distinct": "1", + "rare_minifig_quantity": "1", + }, ]