From 4f42303eac958e4b9303a085adb30a96060e9540 Mon Sep 17 00:00:00 2001
From: Richard Dern <gitea.local.richard@dern.ovh>
Date: Mon, 1 Dec 2025 22:15:48 +0100
Subject: [PATCH] =?UTF-8?q?Cr=C3=A9ation=20d'un=20fichier=20interm=C3=A9di?=
 =?UTF-8?q?aire=20pour=20les=20statistiques=20sur=20les=20couleurs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                        |   8 +++
 lib/rebrickable/colors_by_set.py |  79 ++++++++++++++++++++++++
 scripts/build_colors_by_set.py   |  27 +++++++++
 tests/test_colors_by_set.py      | 101 +++++++++++++++++++++++++++++++
 tests/test_parts_inventory.py    |  16 ++++-
 5 files changed, 228 insertions(+), 3 deletions(-)
 create mode 100644 lib/rebrickable/colors_by_set.py
 create mode 100644 scripts/build_colors_by_set.py
 create mode 100644 tests/test_colors_by_set.py

diff --git a/README.md b/README.md
index fba71a5..60b9923 100644
--- a/README.md
+++ b/README.md
@@ -130,6 +130,7 @@ Le script télécharge les fichiers compressés `inventories.csv.gz`, `inventory
 2. `python -m scripts.build_parts_inventory`
 
 Le script lit `data/intermediate/sets_enriched.csv`, `data/raw/inventories.csv`, `data/raw/inventory_parts.csv`, `data/raw/inventory_minifigs.csv`, `data/raw/minifigs.csv` et `data/raw/colors.csv`, sélectionne la version d'inventaire la plus récente pour chaque set, puis produit `data/intermediate/parts_filtered.csv` contenant : `part_num`, `color_rgb`, `is_translucent`, `set_num`, `set_id`, `quantity_in_set`, `is_spare`. Les minifigs sont éclatées en pièces en exploitant leur propre inventaire (présent dans `inventories.csv` + `inventory_parts.csv`) et leurs quantités dans `inventory_minifigs.csv`. Ce fichier sert de base aux analyses ultérieures sans relire les CSV bruts.
+Le fichier agrégé inclut également l'année du set (`year`) et un indicateur `is_minifig_part` pour distinguer les pièces issues des minifigs.
 
 ### Étape 10 : identifier les écarts d'inventaire
 
@@ -155,3 +156,10 @@ Un tableau Markdown est également généré dans `data/final/inventory_gaps.md`
 2. `python -m scripts.compute_parts_stats`
 
 Le script lit `data/intermediate/parts_filtered.csv` et `data/final/stats.csv` (pour le total catalogue filtré), puis produit `data/final/parts_stats.csv` avec : nombre de variations de pièces (hors rechanges), pièce la moins utilisée, pièce la plus commune, nombre de couleurs utilisées, total de pièces hors rechanges, écart entre le total de pièces attendu (stats catalogue) et l'inventaire agrégé, nombre de sets présentant un écart inventaire/catalogue et écart maximal observé.
+
+### Étape 13 : palette de couleurs par set
+
+1. `source .venv/bin/activate`
+2. `python -m scripts.build_colors_by_set`
+
+Le script agrège `data/intermediate/parts_filtered.csv` avec les libellés de couleurs `data/raw/colors.csv` et produit `data/intermediate/colors_by_set.csv` contenant, pour chaque set et chaque couleur, les quantités totales, hors rechanges, issues des minifigs et hors minifigs. Ce fichier sert de base aux visualisations et matrices de palette.
diff --git a/lib/rebrickable/colors_by_set.py b/lib/rebrickable/colors_by_set.py
new file mode 100644
index 0000000..28adbca
--- /dev/null
+++ b/lib/rebrickable/colors_by_set.py
@@ -0,0 +1,79 @@
+"""Agrégation des couleurs utilisées par set."""
+
+import csv
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+from lib.filesystem import ensure_parent_dir
+
+
+def load_parts(parts_path: Path) -> List[dict]:
+    """Charge le fichier parts_filtered pour agrégation."""
+    with parts_path.open() as parts_file:
+        reader = csv.DictReader(parts_file)
+        return list(reader)
+
+
+def build_colors_lookup(colors_path: Path) -> Dict[Tuple[str, str], str]:
+    """Construit un index (rgb, is_translucent) -> nom de couleur."""
+    colors: Dict[Tuple[str, str], str] = {}
+    with colors_path.open() as colors_file:
+        reader = csv.DictReader(colors_file)
+        for row in reader:
+            colors[(row["rgb"], row["is_trans"].lower())] = row["name"]
+    return colors
+
+
+def aggregate_colors_by_set(parts: Iterable[dict], colors_lookup: Dict[Tuple[str, str], str]) -> List[dict]:
+    """Agrège les quantités par set et par couleur."""
+    totals: Dict[Tuple[str, str, str, str, str], dict] = {}
+    for row in parts:
+        key = (row["set_num"], row["set_id"], row["year"], row["color_rgb"], row["is_translucent"])
+        existing = totals.get(key)
+        if existing is None:
+            totals[key] = {
+                "set_num": row["set_num"],
+                "set_id": row["set_id"],
+                "year": row["year"],
+                "color_rgb": row["color_rgb"],
+                "is_translucent": row["is_translucent"],
+                "color_name": colors_lookup[(row["color_rgb"], row["is_translucent"])],
+                "quantity_total": 0,
+                "quantity_non_spare": 0,
+                "quantity_minifig": 0,
+                "quantity_non_minifig": 0,
+            }
+            existing = totals[key]
+        quantity = int(row["quantity_in_set"])
+        existing["quantity_total"] += quantity
+        if row["is_spare"] == "false":
+            existing["quantity_non_spare"] += quantity
+        if row["is_minifig_part"] == "true":
+            existing["quantity_minifig"] += quantity
+        else:
+            existing["quantity_non_minifig"] += quantity
+    aggregated = list(totals.values())
+    aggregated.sort(key=lambda row: (row["set_num"], row["color_name"], row["is_translucent"]))
+    return aggregated
+
+
+def write_colors_by_set(destination_path: Path, rows: Iterable[dict]) -> None:
+    """Sérialise l'agrégat set × couleur dans un CSV dédié."""
+    ensure_parent_dir(destination_path)
+    with destination_path.open("w", newline="") as csv_file:
+        fieldnames = [
+            "set_num",
+            "set_id",
+            "year",
+            "color_rgb",
+            "is_translucent",
+            "color_name",
+            "quantity_total",
+            "quantity_non_spare",
+            "quantity_minifig",
+            "quantity_non_minifig",
+        ]
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
diff --git a/scripts/build_colors_by_set.py b/scripts/build_colors_by_set.py
new file mode 100644
index 0000000..afabe37
--- /dev/null
+++ b/scripts/build_colors_by_set.py
@@ -0,0 +1,27 @@
+"""Construit l'agrégat set × couleur prêt pour les visualisations."""
+
+from pathlib import Path
+
+from lib.rebrickable.colors_by_set import (
+    aggregate_colors_by_set,
+    build_colors_lookup,
+    load_parts,
+    write_colors_by_set,
+)
+
+
+PARTS_PATH = Path("data/intermediate/parts_filtered.csv")  # input: read by load_parts in main()
+COLORS_PATH = Path("data/raw/colors.csv")  # input: read by build_colors_lookup in main()
+DESTINATION_PATH = Path("data/intermediate/colors_by_set.csv")  # output: written by write_colors_by_set in main()
+
+
+def main() -> None:
+    """Génère colors_by_set.csv depuis parts_filtered.csv."""
+    parts = load_parts(PARTS_PATH)
+    colors_lookup = build_colors_lookup(COLORS_PATH)
+    aggregated = aggregate_colors_by_set(parts, colors_lookup)
+    write_colors_by_set(DESTINATION_PATH, aggregated)
+
+
+if __name__ == "__main__":  # only when run as a script, e.g. `python -m scripts.build_colors_by_set`
+    main()
diff --git a/tests/test_colors_by_set.py b/tests/test_colors_by_set.py
new file mode 100644
index 0000000..9311d9d
--- /dev/null
+++ b/tests/test_colors_by_set.py
@@ -0,0 +1,101 @@
+"""Tests d'agrégation des couleurs par set."""
+
+import csv
+from pathlib import Path
+
+from lib.rebrickable.colors_by_set import (
+    aggregate_colors_by_set,
+    build_colors_lookup,
+    load_parts,
+    write_colors_by_set,
+)
+
+
+def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
+    """Écrit un CSV simple pour les besoins des tests."""
+    with path.open("w", newline="") as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(headers)
+        writer.writerows(rows)
+
+
+def test_aggregate_colors_by_set(tmp_path: Path) -> None:
+    """Check that the load → aggregate → write pipeline groups quantities per set and per colour."""
+    parts_path = tmp_path / "parts_filtered.csv"
+    colors_path = tmp_path / "colors.csv"
+    destination_path = tmp_path / "colors_by_set.csv"
+    write_csv(
+        parts_path,
+        [
+            "part_num",
+            "color_rgb",
+            "is_translucent",
+            "set_num",
+            "set_id",
+            "year",
+            "quantity_in_set",
+            "is_spare",
+            "is_minifig_part",
+        ],
+        [
+            ["3001", "FFFFFF", "false", "1000-1", "1000", "2020", "2", "false", "false"],  # regular white part
+            ["3002", "FFFFFF", "false", "1000-1", "1000", "2020", "1", "true", "false"],  # spare: total only
+            ["3003", "000000", "true", "1000-1", "1000", "2020", "4", "false", "true"],  # translucent minifig part
+            ["4001", "FFFFFF", "false", "2000-1", "2000", "2021", "3", "false", "true"],  # minifig part, second set
+        ],
+    )
+    write_csv(
+        colors_path,
+        ["id", "name", "rgb", "is_trans", "num_parts", "num_sets", "y1", "y2"],
+        [
+            ["1", "White", "FFFFFF", "False", "0", "0", "0", "0"],  # is_trans is capitalised; the lookup lower-cases it
+            ["2", "Trans-Black", "000000", "True", "0", "0", "0", "0"],
+        ],
+    )
+
+    parts = load_parts(parts_path)
+    colors_lookup = build_colors_lookup(colors_path)
+    aggregated = aggregate_colors_by_set(parts, colors_lookup)
+    write_colors_by_set(destination_path, aggregated)
+
+    with destination_path.open() as csv_file:
+        rows = list(csv.DictReader(csv_file))
+
+    assert rows == [  # ordered by set_num, then colour name; every value is a string after the CSV round trip
+        {  # set 1000-1, Trans-Black: only the minifig part 3003 (4)
+            "set_num": "1000-1",
+            "set_id": "1000",
+            "year": "2020",
+            "color_rgb": "000000",
+            "is_translucent": "true",
+            "color_name": "Trans-Black",
+            "quantity_total": "4",
+            "quantity_non_spare": "4",
+            "quantity_minifig": "4",
+            "quantity_non_minifig": "0",
+        },
+        {  # set 1000-1, White: part 3001 (2) plus spare 3002 (1)
+            "set_num": "1000-1",
+            "set_id": "1000",
+            "year": "2020",
+            "color_rgb": "FFFFFF",
+            "is_translucent": "false",
+            "color_name": "White",
+            "quantity_total": "3",
+            "quantity_non_spare": "2",
+            "quantity_minifig": "0",
+            "quantity_non_minifig": "3",
+        },
+        {  # set 2000-1, White: only the minifig part 4001 (3)
+            "set_num": "2000-1",
+            "set_id": "2000",
+            "year": "2021",
+            "color_rgb": "FFFFFF",
+            "is_translucent": "false",
+            "color_name": "White",
+            "quantity_total": "3",
+            "quantity_non_spare": "3",
+            "quantity_minifig": "3",
+            "quantity_non_minifig": "0",
+        },
+    ]
diff --git a/tests/test_parts_inventory.py b/tests/test_parts_inventory.py
index 0c91452..368117e 100644
--- a/tests/test_parts_inventory.py
+++ b/tests/test_parts_inventory.py
@@ -26,10 +26,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
 
     write_csv(
         sets_path,
-        ["set_num", "set_id", "name", "num_parts"],
+        ["set_num", "set_id", "name", "num_parts", "year"],
         [
-            ["1234-1", "1234", "Sample Set A", "9"],
-            ["5678-1", "5678", "Sample Set B", "2"],
+            ["1234-1", "1234", "Sample Set A", "9", "2020"],
+            ["5678-1", "5678", "Sample Set B", "2", "2021"],
         ],
     )
     write_csv(
@@ -98,8 +98,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "4",
             "is_spare": "false",
+            "is_minifig_part": "false",
         },
         {
             "part_num": "3002",
@@ -107,8 +109,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "true",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "1",
             "is_spare": "true",
+            "is_minifig_part": "false",
         },
         {
             "part_num": "mf-1",
@@ -116,8 +120,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "true",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "1",
             "is_spare": "false",
+            "is_minifig_part": "true",
         },
         {
             "part_num": "mf-2",
@@ -125,8 +131,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "2",
             "is_spare": "false",
+            "is_minifig_part": "true",
         },
         {
             "part_num": "3003",
@@ -134,7 +142,9 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "5678-1",
             "set_id": "5678",
+            "year": "2021",
             "quantity_in_set": "2",
             "is_spare": "false",
+            "is_minifig_part": "false",
         },
     ]