From f94669d82e2aa9ad4586ea4f37d772ca96a5ca17 Mon Sep 17 00:00:00 2001
From: Richard Dern <gitea.local.richard@dern.ovh>
Date: Tue, 2 Dec 2025 16:52:42 +0100
Subject: [PATCH] =?UTF-8?q?Ajoute=20l'analyse=20des=20pi=C3=A8ces=20rares?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                     |  13 +++
 lib/plots/rare_parts.py       |  77 +++++++++++++
 lib/rebrickable/rare_parts.py | 204 ++++++++++++++++++++++++++++++++++
 scripts/compute_rare_parts.py |  28 +++++
 scripts/plot_rare_parts.py    |  18 +++
 tests/test_rare_parts.py      | 169 ++++++++++++++++++++++++++++
 tests/test_rare_parts_plot.py |  25 +++++
 7 files changed, 534 insertions(+)
 create mode 100644 lib/plots/rare_parts.py
 create mode 100644 lib/rebrickable/rare_parts.py
 create mode 100644 scripts/compute_rare_parts.py
 create mode 100644 scripts/plot_rare_parts.py
 create mode 100644 tests/test_rare_parts.py
 create mode 100644 tests/test_rare_parts_plot.py

diff --git a/README.md b/README.md
index 321684e..c4182fa 100644
--- a/README.md
+++ b/README.md
@@ -272,3 +272,16 @@ Le script lit `data/intermediate/minifigs_by_set.csv`, agrège le nombre de mini
 
 Le script lit `data/intermediate/minifig_counts_by_set.csv`, `data/intermediate/sets_enriched.csv`, `data/raw/sets.csv`, `data/raw/inventories.csv` et `data/raw/inventory_minifigs.csv`, produit `data/intermediate/minifig_parts_correlation.csv` (pièces vs minifigs pour le catalogue global et les thèmes filtrés), puis trace `figures/step26/minifig_parts_correlation.png` en superposant les nuages de points et leurs tendances linéaires.
 Un second export `data/intermediate/minifigs_per_set_timeline.csv` est généré pour l'évolution annuelle du nombre moyen de minifigs par set, visualisé dans `figures/step26/minifigs_per_set_timeline.png` (courbes catalogue vs thèmes filtrés).
+
+### Étape 27 : pièces rares (variantes exclusives)
+
+1. `source .venv/bin/activate`
+2. `python -m scripts.compute_rare_parts`
+3. `python -m scripts.plot_rare_parts`
+
+Le calcul lit `data/intermediate/parts_filtered.csv`, `data/intermediate/sets_enriched.csv`, `data/raw/parts.csv` et `data/raw/colors.csv` pour identifier les combinaisons pièce + couleur présentes dans un seul set (rechanges exclues). Il produit :
+
+- `data/intermediate/rare_parts.csv` : liste détaillée des pièces rares avec set, couleur, nature minifig/hors minifig et possession.
+- `data/intermediate/rare_parts_by_set.csv` : agrégat par set (comptes distincts, quantités, focus minifigs).
+
+Le tracé `figures/step27/rare_parts_per_set.png` met en scène le top des sets contenant le plus de variantes exclusives, en distinguant les pièces de minifigs et l’état de possession.
diff --git a/lib/plots/rare_parts.py b/lib/plots/rare_parts.py
new file mode 100644
index 0000000..a9269c5
--- /dev/null
+++ b/lib/plots/rare_parts.py
@@ -0,0 +1,77 @@
+"""Graphique des pièces rares par set."""
+
+from pathlib import Path
+from typing import List, Tuple
+
+import matplotlib.pyplot as plt
+from matplotlib.patches import Patch
+
+from lib.filesystem import ensure_parent_dir
+from lib.rebrickable.stats import read_rows
+
+
+def load_top_sets(path: Path, limit: int = 15) -> List[dict]:
+    """Load the per-set rows from `path`, sort them by rare-part counts and keep the first `limit` rows."""
+    rows = read_rows(path)
+    sorted_rows = sorted(
+        rows,
+        key=lambda row: (
+            -int(row["rare_parts_distinct"]),  # most distinct rare parts first
+            -int(row["rare_parts_quantity"]),  # then the largest rare quantity
+            row["set_num"],  # remaining ties: ascending set number (string comparison)
+        ),
+    )
+    return sorted_rows[:limit]
+
+
+def split_counts(rows: List[dict]) -> Tuple[List[int], List[int]]:
+    """Return two lists (non-minifig, minifig) of distinct rare-part counts, one entry per input row."""
+    non_minifig: List[int] = []
+    minifig: List[int] = []
+    for row in rows:
+        total = int(row["rare_parts_distinct"])
+        minifig_count = int(row["rare_minifig_parts_distinct"])
+        non_minifig.append(total - minifig_count)  # what is left once minifig parts are removed
+        minifig.append(minifig_count)
+    return non_minifig, minifig
+
+
+def plot_rare_parts_per_set(rare_by_set_path: Path, destination_path: Path) -> None:
+    """Draw a stacked horizontal bar chart of the top sets by rare parts and save it to `destination_path`."""
+    rows = load_top_sets(rare_by_set_path)
+    if not rows:
+        return  # nothing to plot: no figure is created or written
+    non_minifig, minifig = split_counts(rows)
+    y_positions = list(range(len(rows)))
+    labels = [f"{row['set_num']} · {row['name']} ({row['year']})" for row in rows]
+    owned_mask = [row["in_collection"] == "true" for row in rows]  # only the exact string "true" counts as owned
+
+    base_color = "#1f77b4"
+    accent_color = "#f28e2b"
+    fig, ax = plt.subplots(figsize=(11, 8))
+    for y, value, is_owned in zip(y_positions, non_minifig, owned_mask):
+        alpha = 0.92 if is_owned else 0.45  # owned sets are drawn more opaque than missing ones
+        ax.barh(y, value, color=base_color, alpha=alpha, label=None)
+    for y, value, offset, is_owned in zip(y_positions, minifig, non_minifig, owned_mask):
+        alpha = 0.92 if is_owned else 0.45
+        ax.barh(y, value, left=offset, color=accent_color, alpha=alpha, label=None)  # stacked after the non-minifig segment
+
+    ax.set_yticks(y_positions)
+    ax.set_yticklabels(labels)
+    ax.invert_yaxis()  # puts the first (highest-ranked) row at the top
+    ax.set_xlabel("Variantes de pièces exclusives (hors rechanges)")
+    ax.set_title("Pièces rares par set (top)")
+    ax.grid(axis="x", linestyle="--", alpha=0.35)
+
+    handles = [  # manual legend entries: the bars themselves are drawn with label=None
+        Patch(facecolor=base_color, edgecolor="none", label="Pièces hors minifigs"),
+        Patch(facecolor=accent_color, edgecolor="none", label="Pièces de minifigs"),
+        Patch(facecolor="#000000", edgecolor="none", alpha=0.92, label="Set possédé"),  # black swatches only illustrate the two opacity levels
+        Patch(facecolor="#000000", edgecolor="none", alpha=0.45, label="Set manquant"),
+    ]
+    ax.legend(handles=handles, loc="lower right", frameon=False)
+
+    ensure_parent_dir(destination_path)
+    fig.tight_layout()
+    fig.savefig(destination_path, dpi=170)
+    plt.close(fig)  # release the figure once it has been saved
diff --git a/lib/rebrickable/rare_parts.py b/lib/rebrickable/rare_parts.py
new file mode 100644
index 0000000..6bd54e0
--- /dev/null
+++ b/lib/rebrickable/rare_parts.py
@@ -0,0 +1,204 @@
+"""Identification des pièces rares (variantes exclusives à un set)."""
+
+import csv
+from pathlib import Path
+from typing import Dict, Iterable, List, Sequence, Set, Tuple
+
+from lib.filesystem import ensure_parent_dir
+from lib.rebrickable.stats import read_rows
+
+
+def load_parts_catalog(path: Path) -> Dict[str, dict]:
+    """Read the parts CSV at `path` and index its rows by their `part_num` column."""
+    catalog: Dict[str, dict] = {}
+    with path.open() as csv_file:  # NOTE(review): no newline="" / explicit encoding, unlike the csv module's recommendation
+        reader = csv.DictReader(csv_file)
+        for row in reader:
+            catalog[row["part_num"]] = row  # a later row with the same part_num replaces the earlier one
+    return catalog
+
+
+def load_colors_lookup(path: Path) -> Dict[Tuple[str, str], str]:
+    """Map each (rgb, lowercased is_trans) pair found in the colors file to its color name."""
+    lookup: Dict[Tuple[str, str], str] = {}
+    for row in read_rows(path):
+        lookup[(row["rgb"], row["is_trans"].lower())] = row["name"]  # NOTE(review): key is lowercased; callers must look up with the same casing
+    return lookup
+
+
+def load_sets_enriched(path: Path) -> Dict[str, dict]:
+    """Index the enriched set rows read from `path` by their `set_num` column."""
+    sets: Dict[str, dict] = {}
+    for row in read_rows(path):
+        sets[row["set_num"]] = row  # a later row with the same set_num replaces the earlier one
+    return sets
+
+
+def aggregate_non_spare_parts(rows: Iterable[dict]) -> List[dict]:
+    """Sum `quantity_in_set` over non-spare rows grouped by set, part, color and flags; return the sorted groups."""
+    aggregated: Dict[Tuple[str, str, str, str, str, str, str], int] = {}
+    for row in rows:
+        if row["is_spare"] == "true":  # spare parts are excluded; only the exact string "true" matches
+            continue
+        key = (
+            row["set_num"],
+            row["part_num"],
+            row["color_rgb"],
+            row["is_translucent"],
+            row["is_minifig_part"],
+            row["set_id"],
+            row["year"],
+        )
+        aggregated[key] = aggregated.get(key, 0) + int(row["quantity_in_set"])  # accumulate quantities of identical keys
+    result: List[dict] = []
+    for key, quantity in aggregated.items():
+        set_num, part_num, color_rgb, is_translucent, is_minifig_part, set_id, year = key
+        result.append(
+            {
+                "set_num": set_num,
+                "part_num": part_num,
+                "color_rgb": color_rgb,
+                "is_translucent": is_translucent,
+                "is_minifig_part": is_minifig_part,
+                "set_id": set_id,
+                "year": year,
+                "quantity_in_set": str(quantity),  # back to a string, like the other fields
+            }
+        )
+    result.sort(key=lambda row: (row["set_num"], row["part_num"], row["color_rgb"]))  # string ordering on the three fields
+    return result
+
+
+def compute_combo_set_counts(rows: Iterable[dict]) -> Dict[Tuple[str, str, str], Set[str]]:
+    """Map each (part_num, color_rgb, is_translucent) combination to the set of `set_num` values containing it."""
+    combos: Dict[Tuple[str, str, str], Set[str]] = {}
+    for row in rows:
+        key = (row["part_num"], row["color_rgb"], row["is_translucent"])
+        if key not in combos:
+            combos[key] = set()
+        combos[key].add(row["set_num"])  # a set, so each set number is stored once per combination
+    return combos
+
+
+def build_rare_parts(
+    parts_filtered_path: Path,
+    sets_enriched_path: Path,
+    parts_catalog_path: Path,
+    colors_path: Path,
+) -> Tuple[List[dict], List[dict]]:
+    """Build the rare-part rows (part+color combinations found in exactly one set) and their per-set aggregate."""
+    parts_rows = read_rows(parts_filtered_path)
+    aggregated = aggregate_non_spare_parts(parts_rows)
+    combo_sets = compute_combo_set_counts(aggregated)
+    parts_catalog = load_parts_catalog(parts_catalog_path)
+    color_names = load_colors_lookup(colors_path)
+    sets_lookup = load_sets_enriched(sets_enriched_path)
+
+    rare_parts: List[dict] = []
+    for row in aggregated:
+        combo_key = (row["part_num"], row["color_rgb"], row["is_translucent"])
+        if len(combo_sets[combo_key]) != 1:  # keep only combinations present in exactly one set
+            continue
+        set_row = sets_lookup[row["set_num"]]  # plain indexing: KeyError if the set is missing from the lookup
+        part = parts_catalog[row["part_num"]]  # same for a part missing from the catalog
+        color_name = color_names[(row["color_rgb"], row["is_translucent"])]  # same for an unknown (rgb, translucency) pair
+        rare_parts.append(
+            {
+                "set_num": row["set_num"],
+                "set_id": row["set_id"],
+                "set_name": set_row["name"],
+                "year": set_row["year"],  # year comes from the set row, not from the parts row
+                "part_num": row["part_num"],
+                "part_name": part["name"],
+                "part_cat_id": part["part_cat_id"],
+                "color_rgb": row["color_rgb"],
+                "color_name": color_name,
+                "is_translucent": row["is_translucent"],
+                "is_minifig_part": row["is_minifig_part"],
+                "quantity_in_set": row["quantity_in_set"],
+                "in_collection": set_row["in_collection"],
+            }
+        )
+    rare_parts.sort(key=lambda row: (row["set_num"], row["part_num"], row["color_rgb"]))
+
+    rare_by_set: Dict[str, dict] = {}
+    for row in rare_parts:
+        record = rare_by_set.get(row["set_num"])
+        if record is None:  # first rare part seen for this set: create its record with zeroed counters
+            record = {
+                "set_num": row["set_num"],
+                "set_id": row["set_id"],
+                "name": row["set_name"],
+                "year": row["year"],
+                "in_collection": row["in_collection"],
+                "rare_parts_distinct": 0,
+                "rare_parts_quantity": 0,
+                "rare_minifig_parts_distinct": 0,
+                "rare_minifig_quantity": 0,
+            }
+            rare_by_set[row["set_num"]] = record
+        record["rare_parts_distinct"] += 1
+        record["rare_parts_quantity"] += int(row["quantity_in_set"])
+        if row["is_minifig_part"] == "true":  # only the exact string "true" counts as a minifig part
+            record["rare_minifig_parts_distinct"] += 1
+            record["rare_minifig_quantity"] += int(row["quantity_in_set"])
+    rare_by_set_rows = list(rare_by_set.values())
+    rare_by_set_rows.sort(
+        key=lambda row: (
+            -row["rare_parts_distinct"],  # descending distinct count
+            -row["rare_parts_quantity"],  # then descending quantity
+            row["set_num"],  # then ascending set number
+        )
+    )
+    for row in rare_by_set_rows:  # counters are turned into strings only after the numeric sort above
+        row["rare_parts_distinct"] = str(row["rare_parts_distinct"])
+        row["rare_parts_quantity"] = str(row["rare_parts_quantity"])
+        row["rare_minifig_parts_distinct"] = str(row["rare_minifig_parts_distinct"])
+        row["rare_minifig_quantity"] = str(row["rare_minifig_quantity"])
+    return rare_parts, rare_by_set_rows
+
+
+def write_rare_parts_list(destination_path: Path, rows: Sequence[dict]) -> None:
+    """Write the detailed rare-part rows (with their set and color) to a CSV file at `destination_path`."""
+    ensure_parent_dir(destination_path)
+    fieldnames = [  # fixed column order of the output file
+        "set_num",
+        "set_id",
+        "set_name",
+        "year",
+        "part_num",
+        "part_name",
+        "part_cat_id",
+        "color_rgb",
+        "color_name",
+        "is_translucent",
+        "is_minifig_part",
+        "quantity_in_set",
+        "in_collection",
+    ]
+    with destination_path.open("w", newline="") as csv_file:  # newline="" lets the csv module control line endings
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+
+
+def write_rare_parts_by_set(destination_path: Path, rows: Sequence[dict]) -> None:
+    """Write the per-set rare-part aggregate to a CSV file at `destination_path`."""
+    ensure_parent_dir(destination_path)
+    fieldnames = [  # fixed column order of the output file
+        "set_num",
+        "set_id",
+        "name",
+        "year",
+        "in_collection",
+        "rare_parts_distinct",
+        "rare_parts_quantity",
+        "rare_minifig_parts_distinct",
+        "rare_minifig_quantity",
+    ]
+    with destination_path.open("w", newline="") as csv_file:  # newline="" lets the csv module control line endings
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
diff --git a/scripts/compute_rare_parts.py b/scripts/compute_rare_parts.py
new file mode 100644
index 0000000..c2b5847
--- /dev/null
+++ b/scripts/compute_rare_parts.py
@@ -0,0 +1,28 @@
+"""Calcule les pièces rares (variantes exclusives) et leurs agrégats."""
+
+from pathlib import Path
+
+from lib.rebrickable.rare_parts import (
+    build_rare_parts,
+    write_rare_parts_by_set,
+    write_rare_parts_list,
+)
+
+
+PARTS_PATH = Path("data/intermediate/parts_filtered.csv")  # input: filtered parts rows
+SETS_PATH = Path("data/intermediate/sets_enriched.csv")  # input: enriched sets
+PARTS_CATALOG_PATH = Path("data/raw/parts.csv")  # input: parts catalog
+COLORS_PATH = Path("data/raw/colors.csv")  # input: colors
+RARE_PARTS_PATH = Path("data/intermediate/rare_parts.csv")  # output: detailed rare parts list
+RARE_PARTS_BY_SET_PATH = Path("data/intermediate/rare_parts_by_set.csv")  # output: per-set aggregate
+
+
+def main() -> None:
+    """Build the rare parts list and its per-set breakdown, then write both CSV files."""
+    rare_parts, rare_by_set = build_rare_parts(PARTS_PATH, SETS_PATH, PARTS_CATALOG_PATH, COLORS_PATH)
+    write_rare_parts_list(RARE_PARTS_PATH, rare_parts)
+    write_rare_parts_by_set(RARE_PARTS_BY_SET_PATH, rare_by_set)
+
+
+if __name__ == "__main__":  # run only when executed as a script / with `python -m`
+    main()
diff --git a/scripts/plot_rare_parts.py b/scripts/plot_rare_parts.py
new file mode 100644
index 0000000..ef16ca7
--- /dev/null
+++ b/scripts/plot_rare_parts.py
@@ -0,0 +1,18 @@
+"""Trace le top des sets avec pièces exclusives."""
+
+from pathlib import Path
+
+from lib.plots.rare_parts import plot_rare_parts_per_set
+
+
+RARE_PARTS_BY_SET_PATH = Path("data/intermediate/rare_parts_by_set.csv")  # input: per-set aggregate CSV
+DESTINATION_PATH = Path("figures/step27/rare_parts_per_set.png")  # output: PNG figure
+
+
+def main() -> None:
+    """Generate the rare-parts-per-set figure from the per-set aggregate CSV."""
+    plot_rare_parts_per_set(RARE_PARTS_BY_SET_PATH, DESTINATION_PATH)
+
+
+if __name__ == "__main__":  # run only when executed as a script / with `python -m`
+    main()
diff --git a/tests/test_rare_parts.py b/tests/test_rare_parts.py
new file mode 100644
index 0000000..9a038be
--- /dev/null
+++ b/tests/test_rare_parts.py
@@ -0,0 +1,169 @@
+"""Tests du calcul des pièces rares."""
+
+import csv
+from pathlib import Path
+
+from lib.rebrickable.rare_parts import build_rare_parts, write_rare_parts_by_set, write_rare_parts_list
+
+
+def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
+    """Write a simple CSV file (header line followed by `rows`) for test fixtures."""
+    with path.open("w", newline="") as csv_file:  # newline="" lets the csv module control line endings
+        writer = csv.writer(csv_file)
+        writer.writerow(headers)
+        writer.writerows(rows)
+
+
+def test_build_rare_parts_detects_exclusive_variations(tmp_path: Path) -> None:
+    """Check that part+color combinations present in a single set are identified as rare."""
+    parts_filtered = tmp_path / "parts_filtered.csv"
+    write_csv(
+        parts_filtered,
+        [
+            "part_num",
+            "color_rgb",
+            "is_translucent",
+            "set_num",
+            "set_id",
+            "year",
+            "quantity_in_set",
+            "is_spare",
+            "is_minifig_part",
+        ],
+        [
+            ["p1", "AAAAAA", "false", "1000-1", "1000", "2020", "2", "false", "false"],  # p1/AAAAAA is in both sets: not rare
+            ["p1", "AAAAAA", "false", "2000-1", "2000", "2021", "3", "false", "false"],
+            ["p2", "BBBBBB", "false", "1000-1", "1000", "2020", "1", "false", "true"],  # only in 1000-1, minifig part
+            ["p3", "CCCCCC", "true", "2000-1", "2000", "2021", "4", "false", "false"],  # only in 2000-1
+        ],
+    )
+    sets_enriched = tmp_path / "sets_enriched.csv"
+    write_csv(
+        sets_enriched,
+        ["set_num", "set_id", "name", "year", "in_collection"],
+        [
+            ["1000-1", "1000", "Set A", "2020", "true"],
+            ["2000-1", "2000", "Set B", "2021", "false"],
+        ],
+    )
+    parts_catalog = tmp_path / "parts.csv"
+    write_csv(
+        parts_catalog,
+        ["part_num", "name", "part_cat_id"],
+        [
+            ["p1", "Brick 1x1", "1"],
+            ["p2", "Head Custom", "59"],
+            ["p3", "Slope 45", "2"],
+        ],
+    )
+    colors = tmp_path / "colors.csv"
+    write_csv(
+        colors,
+        ["id", "name", "rgb", "is_trans", "num_parts", "num_sets", "y1", "y2"],
+        [
+            ["1", "Gray", "AAAAAA", "false", "0", "0", "0", "0"],
+            ["2", "Blue", "BBBBBB", "false", "0", "0", "0", "0"],
+            ["3", "Trans-Clear", "CCCCCC", "true", "0", "0", "0", "0"],
+        ],
+    )
+
+    rare_parts, rare_by_set = build_rare_parts(parts_filtered, sets_enriched, parts_catalog, colors)
+
+    assert rare_parts == [
+        {
+            "set_num": "1000-1",
+            "set_id": "1000",
+            "set_name": "Set A",
+            "year": "2020",
+            "part_num": "p2",
+            "part_name": "Head Custom",
+            "part_cat_id": "59",
+            "color_rgb": "BBBBBB",
+            "color_name": "Blue",
+            "is_translucent": "false",
+            "is_minifig_part": "true",
+            "quantity_in_set": "1",
+            "in_collection": "true",
+        },
+        {
+            "set_num": "2000-1",
+            "set_id": "2000",
+            "set_name": "Set B",
+            "year": "2021",
+            "part_num": "p3",
+            "part_name": "Slope 45",
+            "part_cat_id": "2",
+            "color_rgb": "CCCCCC",
+            "color_name": "Trans-Clear",
+            "is_translucent": "true",
+            "is_minifig_part": "false",
+            "quantity_in_set": "4",
+            "in_collection": "false",
+        },
+    ]
+    assert rare_by_set == [  # both sets have one distinct rare part; the larger quantity (Set B) does not come first here
+        {
+            "set_num": "1000-1",
+            "set_id": "1000",
+            "name": "Set A",
+            "year": "2020",
+            "in_collection": "true",
+            "rare_parts_distinct": "1",
+            "rare_parts_quantity": "1",
+            "rare_minifig_parts_distinct": "1",
+            "rare_minifig_quantity": "1",
+        },
+        {
+            "set_num": "2000-1",
+            "set_id": "2000",
+            "name": "Set B",
+            "year": "2021",
+            "in_collection": "false",
+            "rare_parts_distinct": "1",
+            "rare_parts_quantity": "4",
+            "rare_minifig_parts_distinct": "0",
+            "rare_minifig_quantity": "0",
+        },
+    ]
+
+
+def test_write_rare_parts_outputs_csv(tmp_path: Path) -> None:
+    """Check that the rare parts list and the per-set aggregate are both written to disk."""
+    rare_parts_path = tmp_path / "rare_parts.csv"
+    rare_by_set_path = tmp_path / "rare_parts_by_set.csv"
+    rare_parts_sample = [
+        {
+            "set_num": "123-1",
+            "set_id": "123",
+            "set_name": "Sample",
+            "year": "2020",
+            "part_num": "p1",
+            "part_name": "Brick",
+            "part_cat_id": "1",
+            "color_rgb": "FFFFFF",
+            "color_name": "White",
+            "is_translucent": "false",
+            "is_minifig_part": "false",
+            "quantity_in_set": "2",
+            "in_collection": "true",
+        }
+    ]
+    rare_by_set_sample = [
+        {
+            "set_num": "123-1",
+            "set_id": "123",
+            "name": "Sample",
+            "year": "2020",
+            "in_collection": "true",
+            "rare_parts_distinct": "1",
+            "rare_parts_quantity": "2",
+            "rare_minifig_parts_distinct": "0",
+            "rare_minifig_quantity": "0",
+        }
+    ]
+
+    write_rare_parts_list(rare_parts_path, rare_parts_sample)
+    write_rare_parts_by_set(rare_by_set_path, rare_by_set_sample)
+
+    assert rare_parts_path.exists()  # only file creation is checked, not the CSV content
+    assert rare_by_set_path.exists()
diff --git a/tests/test_rare_parts_plot.py b/tests/test_rare_parts_plot.py
new file mode 100644
index 0000000..db5db42
--- /dev/null
+++ b/tests/test_rare_parts_plot.py
@@ -0,0 +1,25 @@
+"""Tests du graphique des pièces rares par set."""
+
+import matplotlib
+from pathlib import Path
+
+from lib.plots.rare_parts import plot_rare_parts_per_set
+
+
+matplotlib.use("Agg")  # select the non-interactive Agg backend so the test needs no display
+
+
+def test_plot_rare_parts_per_set_outputs_image(tmp_path: Path) -> None:
+    """Check that plotting the per-set aggregate produces a non-empty image file."""
+    rare_by_set_path = tmp_path / "rare_parts_by_set.csv"
+    destination_path = tmp_path / "figures" / "step27" / "rare_parts_per_set.png"  # parent directories do not exist yet
+    rare_by_set_path.write_text(
+        "set_num,set_id,name,year,in_collection,rare_parts_distinct,rare_parts_quantity,rare_minifig_parts_distinct,rare_minifig_quantity\n"
+        "1000-1,1000,Set A,2020,true,3,5,1,2\n"
+        "2000-1,2000,Set B,2021,false,2,4,0,0\n"
+    )
+
+    plot_rare_parts_per_set(rare_by_set_path, destination_path)
+
+    assert destination_path.exists()
+    assert destination_path.stat().st_size > 0  # the image content itself is not inspected