Ajoute la richesse chromatique par set

2025-12-02 16:59:59 +01:00
parent f94669d82e
commit d067e2075f
8 changed files with 592 additions and 11 deletions
--- a/README.md
+++ b/README.md
@@ -285,3 +285,16 @@ Le calcul lit `data/intermediate/parts_filtered.csv`, `data/intermediate/sets_en
 - `data/intermediate/rare_parts_by_set.csv` : agrégat par set (comptes distincts, quantités, focus minifigs).
 Le tracé `figures/step27/rare_parts_per_set.png` met en scène le top des sets contenant le plus de variantes exclusives, en distinguant les pièces de minifigs et l’état de possession.
 ### Étape 28 : richesse chromatique par set
 1. `source .venv/bin/activate`
 2. `python -m scripts.compute_color_richness`
 3. `python -m scripts.plot_color_richness`
 Le calcul lit `data/intermediate/colors_by_set.csv` et `data/intermediate/sets_enriched.csv` pour mesurer la diversité des palettes (nombre de couleurs distinctes hors rechanges, part de la couleur dominante et des 3 couleurs principales, nombre de couleurs présentes sur les minifigs). Il produit :
 - `data/intermediate/color_richness_by_set.csv` : métriques détaillées par set (comptes et parts principales, possession).
 - `data/intermediate/color_richness_by_year.csv` : agrégat annuel (moyenne, médiane, bornes de diversité et concentration).
 Les graphiques `figures/step28/color_richness_boxplot.png`, `figures/step28/color_richness_top_sets.png` et `figures/step28/color_concentration_scatter.png` montrent respectivement la répartition annuelle, le top des sets les plus colorés et la concentration des palettes (part des 3 couleurs dominantes vs nombre de couleurs).
--- a/lib/plots/color_richness.py
+++ b/lib/plots/color_richness.py
@@ -0,0 +1,130 @@
 """Visualisations de la richesse chromatique par set."""
 from pathlib import Path
 from typing import Iterable, List, Tuple
 import matplotlib.pyplot as plt
 import numpy as np
 from matplotlib.patches import Patch
 from lib.filesystem import ensure_parent_dir
 from lib.rebrickable.stats import read_rows
 def load_richness_rows(path: Path) -> List[dict]:
    """Load the colour richness metrics stored at ``path``.

    Thin wrapper that returns whatever ``read_rows`` yields for the file.
    """
    return read_rows(path)
 def build_boxplot_data(rows: Iterable[dict]) -> Tuple[List[List[int]], List[str]]:
    """Group distinct-colour counts by year for a box plot.

    Returns a pair ``(data, years)``: ``years`` holds the year labels sorted
    numerically, and ``data[i]`` lists the ``colors_distinct`` values (as
    integers, in input order) of the rows whose year is ``years[i]``.
    """
    counts_by_year: dict[str, List[int]] = {}
    for entry in rows:
        counts_by_year.setdefault(entry["year"], []).append(int(entry["colors_distinct"]))
    ordered_years = sorted(counts_by_year, key=int)
    return [counts_by_year[label] for label in ordered_years], ordered_years
 def plot_richness_boxplot(richness_path: Path, destination_path: Path) -> None:
    """Draw a per-year box plot of the number of distinct colours per set.

    Loads the richness rows from ``richness_path`` and saves the figure to
    ``destination_path``. Returns without drawing or writing anything when
    no rows are loaded.
    """
    richness_rows = load_richness_rows(richness_path)
    if not richness_rows:
        return
    values_per_year, year_labels = build_boxplot_data(richness_rows)
    outline_color = "#1f77b4"
    whisker_color = "#555555"
    figure, axes = plt.subplots(figsize=(12, 7))
    artists = axes.boxplot(
        values_per_year,
        orientation="vertical",
        patch_artist=True,
        tick_labels=year_labels,
        boxprops={"facecolor": outline_color, "alpha": 0.3},
        medianprops={"color": "#0d0d0d", "linewidth": 1.5},
        whiskerprops={"color": whisker_color, "linestyle": "--"},
        capprops={"color": whisker_color},
    )
    # The translucent fill comes from boxprops; the outline is set afterwards.
    for box_patch in artists["boxes"]:
        box_patch.set_edgecolor(outline_color)
    axes.set_xlabel("Année")
    axes.set_ylabel("Nombre de couleurs distinctes (hors rechanges)")
    axes.set_title("Richesse chromatique par set (répartition annuelle)")
    axes.grid(axis="y", linestyle="--", alpha=0.3)
    ensure_parent_dir(destination_path)
    figure.tight_layout()
    figure.savefig(destination_path, dpi=170)
    plt.close(figure)
 def select_top_sets(rows: Iterable[dict], limit: int = 15) -> List[dict]:
    """Return the ``limit`` sets with the most distinct colours.

    Rows are ranked by descending ``colors_distinct``; ties are broken by
    ascending ``top3_share`` and then by ascending ``set_num``.
    """
    def ranking(entry: dict) -> tuple:
        return (-int(entry["colors_distinct"]), float(entry["top3_share"]), entry["set_num"])

    ranked = list(rows)
    ranked.sort(key=ranking)
    return ranked[:limit]
 def plot_richness_top_sets(richness_path: Path, destination_path: Path) -> None:
    """Draw a horizontal bar chart of the sets with the most distinct colours.

    Loads the richness rows from ``richness_path``, keeps the rows returned
    by ``select_top_sets`` and saves the chart to ``destination_path``. Bars
    of sets whose ``in_collection`` value is ``"true"`` are drawn more opaque.
    Returns without drawing or writing anything when no rows are loaded.
    """
    richness_rows = load_richness_rows(richness_path)
    if not richness_rows:
        return
    ranked = select_top_sets(richness_rows)
    bar_color = "#2ca02c"
    owned_alpha, missing_alpha = 0.92, 0.45
    positions = np.arange(len(ranked))
    bar_lengths = [int(entry["colors_distinct"]) for entry in ranked]
    tick_labels = [f"{entry['set_num']} · {entry['name']} ({entry['year']})" for entry in ranked]
    bar_alphas = [owned_alpha if entry["in_collection"] == "true" else missing_alpha for entry in ranked]
    figure, axes = plt.subplots(figsize=(11, 8))
    # One barh call per set so that each bar gets its own opacity.
    for position, length, bar_alpha in zip(positions, bar_lengths, bar_alphas):
        axes.barh(position, length, color=bar_color, alpha=bar_alpha)
    axes.set_yticks(positions)
    axes.set_yticklabels(tick_labels)
    # Put the first-ranked set at the top of the chart.
    axes.invert_yaxis()
    axes.set_xlabel("Couleurs distinctes (hors rechanges)")
    axes.set_title("Top des sets les plus colorés")
    axes.grid(axis="x", linestyle="--", alpha=0.3)
    legend_handles = [
        Patch(facecolor=bar_color, edgecolor="none", alpha=owned_alpha, label="Set possédé"),
        Patch(facecolor=bar_color, edgecolor="none", alpha=missing_alpha, label="Set manquant"),
    ]
    axes.legend(handles=legend_handles, loc="lower right", frameon=False)
    ensure_parent_dir(destination_path)
    figure.tight_layout()
    figure.savefig(destination_path, dpi=170)
    plt.close(figure)
 def plot_concentration_scatter(richness_path: Path, destination_path: Path) -> None:
    """Scatter the top-3 colour share against the number of distinct colours.

    Loads the richness rows from ``richness_path`` and saves the figure to
    ``destination_path``. Points of sets whose ``in_collection`` value is
    ``"true"`` are blue, the others grey. Returns without drawing or writing
    anything when no rows are loaded.
    """
    richness_rows = load_richness_rows(richness_path)
    if not richness_rows:
        return
    owned_color, missing_color = "#1f77b4", "#bbbbbb"
    distinct_counts = [int(entry["colors_distinct"]) for entry in richness_rows]
    top3_shares = [float(entry["top3_share"]) for entry in richness_rows]
    point_colors = [
        owned_color if entry["in_collection"] == "true" else missing_color
        for entry in richness_rows
    ]
    figure, axes = plt.subplots(figsize=(10, 7))
    axes.scatter(distinct_counts, top3_shares, c=point_colors, alpha=0.7, s=32)
    axes.set_xlabel("Nombre de couleurs distinctes (hors rechanges)")
    axes.set_ylabel("Part des 3 couleurs principales")
    axes.set_title("Concentration des palettes")
    axes.grid(True, linestyle="--", alpha=0.3)
    legend_handles = [
        Patch(facecolor=owned_color, edgecolor="none", alpha=0.7, label="Set possédé"),
        Patch(facecolor=missing_color, edgecolor="none", alpha=0.7, label="Set manquant"),
    ]
    axes.legend(handles=legend_handles, loc="upper right", frameon=False)
    ensure_parent_dir(destination_path)
    figure.tight_layout()
    figure.savefig(destination_path, dpi=170)
    plt.close(figure)
--- a/lib/rebrickable/color_richness.py
+++ b/lib/rebrickable/color_richness.py
@@ -0,0 +1,150 @@
 """Métriques de richesse chromatique par set."""
 import csv
 from pathlib import Path
 from typing import Dict, Iterable, List, Sequence
 from lib.filesystem import ensure_parent_dir
 from lib.rebrickable.stats import compute_median, read_rows
 def load_colors_by_set(path: Path) -> List[dict]:
    """Load the rows of colors_by_set.csv into memory.

    Thin wrapper that returns whatever ``read_rows`` yields for the file.
    """
    return read_rows(path)
 def load_sets(path: Path) -> Dict[str, dict]:
    """Index the enriched set rows of a CSV file by ``set_num``.

    Returns a dict mapping each ``set_num`` to its full row. When the same
    ``set_num`` appears several times, the last row wins.

    Raises:
        KeyError: if a row has no ``set_num`` column.
    """
    # newline="" lets the csv module handle line endings itself, so line
    # breaks embedded in quoted fields are read back unchanged.
    with path.open(newline="") as csv_file:
        return {row["set_num"]: row for row in csv.DictReader(csv_file)}
 def group_by_set(rows: Iterable[dict]) -> Dict[str, List[dict]]:
    """Bucket colour rows by their ``set_num``.

    Keys appear in order of first occurrence and each bucket keeps its rows
    in input order.
    """
    buckets: Dict[str, List[dict]] = {}
    for entry in rows:
        buckets.setdefault(entry["set_num"], []).append(entry)
    return buckets
 def build_richness_by_set(
    colors_by_set_path: Path,
    sets_enriched_path: Path,
 ) -> List[dict]:
    """Compute colour richness metrics for every set.

    Reads the per-set colour rows from ``colors_by_set_path`` and the set
    metadata from ``sets_enriched_path``, then returns one dict of strings
    per set, ordered by descending ``colors_distinct`` and then by
    ``set_num``. ``top_color_share`` and ``top3_share`` are fractions of the
    set's non-spare quantity, formatted with four decimals.

    A set whose non-spare quantities sum to zero gets both shares set to
    ``0.0000`` instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: if a set found in the colour rows is missing from the set
            metadata, or if an expected column is absent.
    """
    colors = load_colors_by_set(colors_by_set_path)
    sets_lookup = load_sets(sets_enriched_path)
    grouped = group_by_set(colors)
    richness: List[dict] = []
    for set_num, set_rows in grouped.items():
        # Stable descending sort: colours with equal quantities keep file order.
        sorted_by_quantity = sorted(set_rows, key=lambda row: int(row["quantity_non_spare"]), reverse=True)
        quantities = [int(row["quantity_non_spare"]) for row in sorted_by_quantity]
        total_non_spare = sum(quantities)
        # NOTE(review): every colour row of the set is counted, including rows
        # whose quantity_non_spare is 0 if the input contains any — confirm
        # that matches the intended "excluding spares" definition.
        colors_distinct = len(set_rows)
        colors_minifig = sum(1 for row in set_rows if int(row["quantity_minifig"]) > 0)
        colors_non_minifig = sum(1 for row in set_rows if int(row["quantity_non_minifig"]) > 0)
        top_color = sorted_by_quantity[0]
        if total_non_spare:
            top_share = quantities[0] / total_non_spare
            top3_share = sum(quantities[:3]) / total_non_spare
        else:
            # No non-spare part at all: shares are undefined, report zero.
            top_share = top3_share = 0.0
        set_row = sets_lookup[set_num]
        richness.append(
            {
                "set_num": set_num,
                "set_id": set_row["set_id"],
                "name": set_row["name"],
                "year": set_row["year"],
                "in_collection": set_row["in_collection"],
                "colors_distinct": str(colors_distinct),
                "colors_minifig": str(colors_minifig),
                "colors_non_minifig": str(colors_non_minifig),
                "total_parts_non_spare": str(total_non_spare),
                "top_color_name": top_color["color_name"],
                "top_color_share": f"{top_share:.4f}",
                "top3_share": f"{top3_share:.4f}",
            }
        )
    richness.sort(key=lambda row: (-int(row["colors_distinct"]), row["set_num"]))
    return richness
 def build_richness_by_year(richness_rows: Iterable[dict]) -> List[dict]:
    """Aggregate the per-set richness metrics by year.

    For each year found in ``richness_rows`` the result holds the average,
    median, maximum and minimum of ``colors_distinct`` and the average and
    median of ``top3_share``, all formatted as strings. Rows are returned
    sorted by numeric year.
    """
    grouped: Dict[str, List[dict]] = {}
    for row in richness_rows:
        grouped.setdefault(row["year"], []).append(row)
    yearly: List[dict] = []
    for year, rows in grouped.items():
        distinct_counts = [int(row["colors_distinct"]) for row in rows]
        top3_shares = [float(row["top3_share"]) for row in rows]
        # The median is taken over integer ten-thousandths, as before. round()
        # is used instead of int() because share * 10000 can land just below
        # the intended integer in floating point, and truncating it would
        # shift the median down by 0.0001.
        top3_scaled = [round(share * 10000) for share in top3_shares]
        average_distinct = sum(distinct_counts) / len(distinct_counts)
        median_distinct = compute_median(distinct_counts)
        average_top3 = sum(top3_shares) / len(top3_shares)
        median_top3 = compute_median(top3_scaled) / 10000
        yearly.append(
            {
                "year": year,
                "average_colors_distinct": f"{average_distinct:.2f}",
                "median_colors_distinct": f"{median_distinct:.2f}",
                "max_colors_distinct": str(max(distinct_counts)),
                "min_colors_distinct": str(min(distinct_counts)),
                "average_top3_share": f"{average_top3:.4f}",
                "median_top3_share": f"{median_top3:.4f}",
            }
        )
    yearly.sort(key=lambda row: int(row["year"]))
    return yearly
 def write_richness_by_set(destination_path: Path, rows: Sequence[dict]) -> None:
    """Write the per-set richness metrics to ``destination_path`` as CSV.

    The parent directory is prepared with ``ensure_parent_dir``; the file
    starts with a header line followed by one line per row.
    """
    columns = [
        "set_num",
        "set_id",
        "name",
        "year",
        "in_collection",
        "colors_distinct",
        "colors_minifig",
        "colors_non_minifig",
        "total_parts_non_spare",
        "top_color_name",
        "top_color_share",
        "top3_share",
    ]
    ensure_parent_dir(destination_path)
    with destination_path.open("w", newline="") as handle:
        output = csv.DictWriter(handle, fieldnames=columns)
        output.writeheader()
        output.writerows(rows)
 def write_richness_by_year(destination_path: Path, rows: Sequence[dict]) -> None:
    """Write the yearly richness aggregate to ``destination_path`` as CSV.

    The parent directory is prepared with ``ensure_parent_dir``; the file
    starts with a header line followed by one line per row.
    """
    columns = [
        "year",
        "average_colors_distinct",
        "median_colors_distinct",
        "max_colors_distinct",
        "min_colors_distinct",
        "average_top3_share",
        "median_top3_share",
    ]
    ensure_parent_dir(destination_path)
    with destination_path.open("w", newline="") as handle:
        output = csv.DictWriter(handle, fieldnames=columns)
        output.writeheader()
        output.writerows(rows)
--- a/scripts/compute_color_richness.py
+++ b/scripts/compute_color_richness.py
@@ -0,0 +1,28 @@
 """Calcule la richesse chromatique par set et par année."""
 from pathlib import Path
 from lib.rebrickable.color_richness import (
    build_richness_by_set,
    build_richness_by_year,
    write_richness_by_set,
    write_richness_by_year,
 )
 # Inputs read by build_richness_by_set (paths are relative to the working directory).
 COLORS_BY_SET_PATH = Path("data/intermediate/colors_by_set.csv")
 SETS_ENRICHED_PATH = Path("data/intermediate/sets_enriched.csv")
 # Outputs written by main(): per-set metrics and their yearly aggregate.
 RICHNESS_BY_SET_PATH = Path("data/intermediate/color_richness_by_set.csv")
 RICHNESS_BY_YEAR_PATH = Path("data/intermediate/color_richness_by_year.csv")
 def main() -> None:
    """Build the per-set and per-year richness tables, then write both CSV files."""
    per_set_rows = build_richness_by_set(COLORS_BY_SET_PATH, SETS_ENRICHED_PATH)
    per_year_rows = build_richness_by_year(per_set_rows)
    # Both tables are computed before anything is written to disk.
    write_richness_by_set(RICHNESS_BY_SET_PATH, per_set_rows)
    write_richness_by_year(RICHNESS_BY_YEAR_PATH, per_year_rows)
 # Run the computation only when the module is executed as a script.
 if __name__ == "__main__":
    main()
--- a/scripts/plot_color_richness.py
+++ b/scripts/plot_color_richness.py
@@ -0,0 +1,26 @@
 """Trace les graphiques de richesse chromatique par set."""
 from pathlib import Path
 from lib.plots.color_richness import (
    plot_concentration_scatter,
    plot_richness_boxplot,
    plot_richness_top_sets,
 )
 # Input: per-set richness metrics (paths are relative to the working directory).
 RICHNESS_PATH = Path("data/intermediate/color_richness_by_set.csv")
 # Outputs: one image per chart drawn by main().
 BOXPLOT_DESTINATION = Path("figures/step28/color_richness_boxplot.png")
 TOP_DESTINATION = Path("figures/step28/color_richness_top_sets.png")
 CONCENTRATION_DESTINATION = Path("figures/step28/color_concentration_scatter.png")
 def main() -> None:
    """Render the three colour richness charts from the per-set metrics."""
    chart_jobs = (
        (plot_richness_boxplot, BOXPLOT_DESTINATION),
        (plot_richness_top_sets, TOP_DESTINATION),
        (plot_concentration_scatter, CONCENTRATION_DESTINATION),
    )
    for draw_chart, destination in chart_jobs:
        draw_chart(RICHNESS_PATH, destination)
 # Run the rendering only when the module is executed as a script.
 if __name__ == "__main__":
    main()
--- a/tests/test_color_richness.py
+++ b/tests/test_color_richness.py
@@ -0,0 +1,196 @@
 """Tests des métriques de richesse chromatique."""
 import csv
 from pathlib import Path
 from lib.rebrickable.color_richness import (
    build_richness_by_set,
    build_richness_by_year,
    write_richness_by_set,
    write_richness_by_year,
 )
 def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
    """Write ``headers`` followed by ``rows`` to ``path`` as a plain CSV file (test helper)."""
    with path.open("w", newline="") as handle:
        csv.writer(handle).writerows([headers, *rows])
 def test_build_richness_by_set_computes_shares_and_counts(tmp_path: Path) -> None:
    """Check the dominant colour shares and colour counts computed per set."""
    # Set 1000-1 has two colours (10 + 5 non-spare parts), set 2000-1 a single one.
    colors_by_set = tmp_path / "colors_by_set.csv"
    write_csv(
        colors_by_set,
        [
            "set_num",
            "set_id",
            "year",
            "color_rgb",
            "is_translucent",
            "color_name",
            "quantity_total",
            "quantity_non_spare",
            "quantity_minifig",
            "quantity_non_minifig",
        ],
        [
            ["1000-1", "1000", "2020", "AAAAAA", "false", "Gray", "10", "10", "0", "10"],
            ["1000-1", "1000", "2020", "BBBBBB", "false", "Blue", "5", "5", "5", "0"],
            ["2000-1", "2000", "2021", "CCCCCC", "true", "Trans", "3", "3", "0", "3"],
        ],
    )
    sets_enriched = tmp_path / "sets_enriched.csv"
    write_csv(
        sets_enriched,
        ["set_num", "set_id", "name", "year", "in_collection"],
        [
            ["1000-1", "1000", "Set A", "2020", "true"],
            ["2000-1", "2000", "Set B", "2021", "false"],
        ],
    )
    richness = build_richness_by_set(colors_by_set, sets_enriched)
    # Expected order: most distinct colours first (1000-1 with 2, then 2000-1 with 1).
    assert richness == [
        {
            "set_num": "1000-1",
            "set_id": "1000",
            "name": "Set A",
            "year": "2020",
            "in_collection": "true",
            "colors_distinct": "2",
            "colors_minifig": "1",
            "colors_non_minifig": "1",
            "total_parts_non_spare": "15",
            "top_color_name": "Gray",
            # 10 / 15 for the top colour; both colours together cover the whole set.
            "top_color_share": "0.6667",
            "top3_share": "1.0000",
        },
        {
            "set_num": "2000-1",
            "set_id": "2000",
            "name": "Set B",
            "year": "2021",
            "in_collection": "false",
            "colors_distinct": "1",
            "colors_minifig": "0",
            "colors_non_minifig": "1",
            "total_parts_non_spare": "3",
            "top_color_name": "Trans",
            "top_color_share": "1.0000",
            "top3_share": "1.0000",
        },
    ]
 def test_build_richness_by_year_aggregates_metrics(tmp_path: Path) -> None:
    """Check the yearly aggregation of the per-set richness metrics."""
    # NOTE(review): tmp_path is not used by this test.
    # Two sets in 2020 (4 and 2 distinct colours) and one set in 2021.
    richness_rows = [
        {
            "set_num": "s1",
            "set_id": "1",
            "name": "A",
            "year": "2020",
            "in_collection": "true",
            "colors_distinct": "4",
            "colors_minifig": "1",
            "colors_non_minifig": "3",
            "total_parts_non_spare": "10",
            "top_color_name": "Red",
            "top_color_share": "0.5000",
            "top3_share": "0.9000",
        },
        {
            "set_num": "s2",
            "set_id": "2",
            "name": "B",
            "year": "2020",
            "in_collection": "false",
            "colors_distinct": "2",
            "colors_minifig": "0",
            "colors_non_minifig": "2",
            "total_parts_non_spare": "5",
            "top_color_name": "Blue",
            "top_color_share": "0.6000",
            "top3_share": "1.0000",
        },
        {
            "set_num": "s3",
            "set_id": "3",
            "name": "C",
            "year": "2021",
            "in_collection": "true",
            "colors_distinct": "3",
            "colors_minifig": "1",
            "colors_non_minifig": "3",
            "total_parts_non_spare": "7",
            "top_color_name": "Green",
            "top_color_share": "0.5714",
            "top3_share": "1.0000",
        },
    ]
    yearly = build_richness_by_year(richness_rows)
    # Expected rows are ordered by year.
    assert yearly == [
        {
            "year": "2020",
            "average_colors_distinct": "3.00",
            "median_colors_distinct": "3.00",
            "max_colors_distinct": "4",
            "min_colors_distinct": "2",
            "average_top3_share": "0.9500",
            "median_top3_share": "0.9500",
        },
        {
            "year": "2021",
            "average_colors_distinct": "3.00",
            "median_colors_distinct": "3.00",
            "max_colors_distinct": "3",
            "min_colors_distinct": "3",
            "average_top3_share": "1.0000",
            "median_top3_share": "1.0000",
        },
    ]
 def test_write_richness_outputs_csv(tmp_path: Path) -> None:
    """Check that both writers create CSV files whose content round-trips.

    Beyond the existence of the files, the rows are read back with
    ``csv.DictReader`` and compared with the rows that were written, so a
    wrong header or a dropped column fails the test.
    """
    by_set_path = tmp_path / "color_richness_by_set.csv"
    by_year_path = tmp_path / "color_richness_by_year.csv"
    sample_set_rows = [
        {
            "set_num": "s1",
            "set_id": "1",
            "name": "A",
            "year": "2020",
            "in_collection": "true",
            "colors_distinct": "1",
            "colors_minifig": "1",
            "colors_non_minifig": "1",
            "total_parts_non_spare": "5",
            "top_color_name": "Red",
            "top_color_share": "1.0000",
            "top3_share": "1.0000",
        }
    ]
    sample_year_rows = [
        {
            "year": "2020",
            "average_colors_distinct": "1.00",
            "median_colors_distinct": "1.00",
            "max_colors_distinct": "1",
            "min_colors_distinct": "1",
            "average_top3_share": "1.0000",
            "median_top3_share": "1.0000",
        }
    ]
    write_richness_by_set(by_set_path, sample_set_rows)
    write_richness_by_year(by_year_path, sample_year_rows)
    assert by_set_path.exists()
    assert by_year_path.exists()
    # Every sample value is a string, so the rows read back must be equal as-is.
    with by_set_path.open(newline="") as csv_file:
        assert list(csv.DictReader(csv_file)) == sample_set_rows
    with by_year_path.open(newline="") as csv_file:
        assert list(csv.DictReader(csv_file)) == sample_year_rows
--- a/tests/test_color_richness_plot.py
+++ b/tests/test_color_richness_plot.py
@@ -0,0 +1,38 @@
 """Tests des visuels de richesse chromatique."""
 import matplotlib
 from pathlib import Path
 from lib.plots.color_richness import (
    plot_concentration_scatter,
    plot_richness_boxplot,
    plot_richness_top_sets,
 )
 # Select the non-interactive Agg backend so the figures render without a display.
 # NOTE(review): lib.plots.color_richness has already imported pyplot by the time
 # this line runs — confirm the backend switch still takes effect at this point.
 matplotlib.use("Agg")
 def test_plot_richness_outputs_images(tmp_path: Path) -> None:
    """Render the three richness charts and check each one is a non-empty file."""
    richness_path = tmp_path / "color_richness_by_set.csv"
    richness_path.write_text(
        "set_num,set_id,name,year,in_collection,colors_distinct,colors_minifig,colors_non_minifig,total_parts_non_spare,top_color_name,top_color_share,top3_share\n"
        "1000-1,1000,Set A,2020,true,6,2,5,50,Red,0.4000,0.6500\n"
        "2000-1,2000,Set B,2021,false,4,1,3,30,Blue,0.5000,0.7500\n"
        "3000-1,3000,Set C,2021,true,5,1,4,40,Green,0.3000,0.5500\n"
    )
    figures_dir = tmp_path / "figures" / "step28"
    chart_jobs = [
        (plot_richness_boxplot, figures_dir / "color_richness_boxplot.png"),
        (plot_richness_top_sets, figures_dir / "color_richness_top_sets.png"),
        (plot_concentration_scatter, figures_dir / "color_concentration_scatter.png"),
    ]
    # Draw everything first, then run the checks in the same order.
    for draw_chart, destination in chart_jobs:
        draw_chart(richness_path, destination)
    for _, destination in chart_jobs:
        assert destination.exists()
    for _, destination in chart_jobs:
        assert destination.stat().st_size > 0
--- a/tests/test_rare_parts.py
+++ b/tests/test_rare_parts.py
@@ -102,17 +102,6 @@ def test_build_rare_parts_detects_exclusive_variations(tmp_path: Path) -> None:
        },
    ]
    assert rare_by_set == [
        {
            "set_num": "1000-1",
            "set_id": "1000",
            "name": "Set A",
            "year": "2020",
            "in_collection": "true",
            "rare_parts_distinct": "1",
            "rare_parts_quantity": "1",
            "rare_minifig_parts_distinct": "1",
            "rare_minifig_quantity": "1",
        },
        {
            "set_num": "2000-1",
            "set_id": "2000",
@@ -124,6 +113,17 @@ def test_build_rare_parts_detects_exclusive_variations(tmp_path: Path) -> None:
            "rare_minifig_parts_distinct": "0",
            "rare_minifig_quantity": "0",
        },
        {
            "set_num": "1000-1",
            "set_id": "1000",
            "name": "Set A",
            "year": "2020",
            "in_collection": "true",
            "rare_parts_distinct": "1",
            "rare_parts_quantity": "1",
            "rare_minifig_parts_distinct": "1",
            "rare_minifig_quantity": "1",
        },
    ]