
Creation of an intermediate file for color statistics

Richard Dern 2025-12-01 22:15:48 +01:00
parent cf83f51f89
commit 4f42303eac
5 changed files with 228 additions and 3 deletions


@@ -130,6 +130,7 @@ The script downloads the compressed files `inventories.csv.gz`, `inventory
2. `python -m scripts.build_parts_inventory`
The script reads `data/intermediate/sets_enriched.csv`, `data/raw/inventories.csv`, `data/raw/inventory_parts.csv`, `data/raw/inventory_minifigs.csv`, `data/raw/minifigs.csv` and `data/raw/colors.csv`, selects the most recent inventory version for each set, then produces `data/intermediate/parts_filtered.csv` containing: `part_num`, `color_rgb`, `is_translucent`, `set_num`, `set_id`, `quantity_in_set`, `is_spare`. Minifigs are broken down into their parts using their own inventory (present in `inventories.csv` + `inventory_parts.csv`) and their quantities in `inventory_minifigs.csv`. This file serves as the basis for later analyses without rereading the raw CSVs.
The aggregated file also includes the set's year (`year`) and an `is_minifig_part` flag to distinguish parts that come from minifigs.
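To make the "most recent inventory version" selection concrete, here is a minimal sketch of that step, assuming the usual Rebrickable `inventories.csv` columns (`id`, `version`, `set_num`); the helper name and approach are illustrative, not the script's actual code:

```python
import csv
from pathlib import Path


def latest_inventories(inventories_path: Path) -> dict:
    """Keep only the highest inventory version for each set (illustrative sketch)."""
    latest: dict = {}
    with inventories_path.open() as handle:
        for row in csv.DictReader(handle):
            current = latest.get(row["set_num"])
            # Assumes `version` is a numeric string, as in the Rebrickable dump.
            if current is None or int(row["version"]) > int(current["version"]):
                latest[row["set_num"]] = row
    return latest
```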
### Step 10: identify inventory gaps
@@ -155,3 +156,10 @@ A Markdown table is also generated in `data/final/inventory_gaps.md`
2. `python -m scripts.compute_parts_stats`
The script reads `data/intermediate/parts_filtered.csv` and `data/final/stats.csv` (for the filtered catalogue total), then produces `data/final/parts_stats.csv` with: the number of part variations (excluding spares), the least used part, the most common part, the number of colors used, the total number of parts excluding spares, the gap between the expected part total (catalogue stats) and the aggregated inventory, the number of sets showing an inventory/catalogue gap, and the largest gap observed.
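As a rough illustration of the kind of aggregation these figures require (a sketch only, not the script itself; the helper name and the string values it assumes, such as `"false"` for `is_spare`, follow the file layout described above):

```python
import csv
from collections import Counter
from pathlib import Path


def quick_parts_stats(parts_path: Path) -> dict:
    """Derive a few of the listed figures from parts_filtered.csv (illustrative only)."""
    colors: set = set()
    part_usage: Counter = Counter()
    non_spare_total = 0
    with parts_path.open() as handle:
        for row in csv.DictReader(handle):
            quantity = int(row["quantity_in_set"])
            colors.add((row["color_rgb"], row["is_translucent"]))
            if row["is_spare"] == "false":
                part_usage[row["part_num"]] += quantity
                non_spare_total += quantity
    # Most common part by non-spare quantity across the whole catalogue.
    most_common_part, _ = part_usage.most_common(1)[0]
    return {
        "colors_used": len(colors),
        "non_spare_total": non_spare_total,
        "most_common_part": most_common_part,
    }
```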
### Step 13: color palette per set
1. `source .venv/bin/activate`
2. `python -m scripts.build_colors_by_set`
The script joins `data/intermediate/parts_filtered.csv` with the color labels from `data/raw/colors.csv` and produces `data/intermediate/colors_by_set.csv` containing, for each set and each color, the total quantities, the quantities excluding spares, the quantities coming from minifigs and those outside minifigs. This file serves as the basis for the palette visualizations and matrices.
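One way the new file could feed a palette matrix, as a minimal sketch assuming only the columns listed above (the function is hypothetical, not part of this commit):

```python
import csv
from collections import defaultdict
from pathlib import Path


def palette_matrix(colors_by_set_path: Path) -> dict:
    """Map each set_num to {color_name: non-spare quantity}, ready for plotting."""
    matrix: dict = defaultdict(dict)
    with colors_by_set_path.open() as handle:
        for row in csv.DictReader(handle):
            matrix[row["set_num"]][row["color_name"]] = int(row["quantity_non_spare"])
    return dict(matrix)
```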


@@ -0,0 +1,79 @@
"""Aggregation of the colors used per set."""

import csv
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

from lib.filesystem import ensure_parent_dir


def load_parts(parts_path: Path) -> List[dict]:
    """Load the parts_filtered file for aggregation."""
    with parts_path.open() as parts_file:
        reader = csv.DictReader(parts_file)
        return list(reader)


def build_colors_lookup(colors_path: Path) -> Dict[Tuple[str, str], str]:
    """Build an (rgb, is_translucent) -> color name index."""
    colors: Dict[Tuple[str, str], str] = {}
    with colors_path.open() as colors_file:
        reader = csv.DictReader(colors_file)
        for row in reader:
            colors[(row["rgb"], row["is_trans"].lower())] = row["name"]
    return colors


def aggregate_colors_by_set(parts: Iterable[dict], colors_lookup: Dict[Tuple[str, str], str]) -> List[dict]:
    """Aggregate quantities per set and per color."""
    totals: Dict[Tuple[str, str, str, str, str], dict] = {}
    for row in parts:
        key = (row["set_num"], row["set_id"], row["year"], row["color_rgb"], row["is_translucent"])
        existing = totals.get(key)
        if existing is None:
            totals[key] = {
                "set_num": row["set_num"],
                "set_id": row["set_id"],
                "year": row["year"],
                "color_rgb": row["color_rgb"],
                "is_translucent": row["is_translucent"],
                "color_name": colors_lookup[(row["color_rgb"], row["is_translucent"])],
                "quantity_total": 0,
                "quantity_non_spare": 0,
                "quantity_minifig": 0,
                "quantity_non_minifig": 0,
            }
            existing = totals[key]
        quantity = int(row["quantity_in_set"])
        existing["quantity_total"] += quantity
        if row["is_spare"] == "false":
            existing["quantity_non_spare"] += quantity
        if row["is_minifig_part"] == "true":
            existing["quantity_minifig"] += quantity
        else:
            existing["quantity_non_minifig"] += quantity
    aggregated = list(totals.values())
    aggregated.sort(key=lambda row: (row["set_num"], row["color_name"], row["is_translucent"]))
    return aggregated


def write_colors_by_set(destination_path: Path, rows: Iterable[dict]) -> None:
    """Serialize the set × color aggregate to a dedicated CSV."""
    ensure_parent_dir(destination_path)
    with destination_path.open("w", newline="") as csv_file:
        fieldnames = [
            "set_num",
            "set_id",
            "year",
            "color_rgb",
            "is_translucent",
            "color_name",
            "quantity_total",
            "quantity_non_spare",
            "quantity_minifig",
            "quantity_non_minifig",
        ]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
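Note that the `colors_lookup[...]` access in `aggregate_colors_by_set` raises `KeyError` when a `(rgb, is_translucent)` pair from `parts_filtered.csv` has no match in `colors.csv`, so missing colors fail loudly. If a softer behaviour were ever preferred, a fallback along these lines could be used (hypothetical variant, not in the module):

```python
from typing import Dict, Tuple


def lookup_color_name(colors_lookup: Dict[Tuple[str, str], str], rgb: str, is_translucent: str) -> str:
    """Hypothetical defensive lookup: return a placeholder instead of raising KeyError."""
    return colors_lookup.get((rgb, is_translucent), "[Unknown]")
```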


@@ -0,0 +1,27 @@
"""Build the set × color aggregate ready for visualizations."""

from pathlib import Path

from lib.rebrickable.colors_by_set import (
    aggregate_colors_by_set,
    build_colors_lookup,
    load_parts,
    write_colors_by_set,
)

PARTS_PATH = Path("data/intermediate/parts_filtered.csv")
COLORS_PATH = Path("data/raw/colors.csv")
DESTINATION_PATH = Path("data/intermediate/colors_by_set.csv")


def main() -> None:
    """Generate colors_by_set.csv from parts_filtered.csv."""
    parts = load_parts(PARTS_PATH)
    colors_lookup = build_colors_lookup(COLORS_PATH)
    aggregated = aggregate_colors_by_set(parts, colors_lookup)
    write_colors_by_set(DESTINATION_PATH, aggregated)


if __name__ == "__main__":
    main()

tests/test_colors_by_set.py

@@ -0,0 +1,101 @@
"""Tests for the per-set color aggregation."""

import csv
from pathlib import Path

from lib.rebrickable.colors_by_set import (
    aggregate_colors_by_set,
    build_colors_lookup,
    load_parts,
    write_colors_by_set,
)


def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
    """Write a simple CSV for test purposes."""
    with path.open("w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        writer.writerows(rows)


def test_aggregate_colors_by_set(tmp_path: Path) -> None:
    """Group quantities per set and per color."""
    parts_path = tmp_path / "parts_filtered.csv"
    colors_path = tmp_path / "colors.csv"
    destination_path = tmp_path / "colors_by_set.csv"
    write_csv(
        parts_path,
        [
            "part_num",
            "color_rgb",
            "is_translucent",
            "set_num",
            "set_id",
            "year",
            "quantity_in_set",
            "is_spare",
            "is_minifig_part",
        ],
        [
            ["3001", "FFFFFF", "false", "1000-1", "1000", "2020", "2", "false", "false"],
            ["3002", "FFFFFF", "false", "1000-1", "1000", "2020", "1", "true", "false"],
            ["3003", "000000", "true", "1000-1", "1000", "2020", "4", "false", "true"],
            ["4001", "FFFFFF", "false", "2000-1", "2000", "2021", "3", "false", "true"],
        ],
    )
    write_csv(
        colors_path,
        ["id", "name", "rgb", "is_trans", "num_parts", "num_sets", "y1", "y2"],
        [
            ["1", "White", "FFFFFF", "False", "0", "0", "0", "0"],
            ["2", "Trans-Black", "000000", "True", "0", "0", "0", "0"],
        ],
    )
    parts = load_parts(parts_path)
    colors_lookup = build_colors_lookup(colors_path)
    aggregated = aggregate_colors_by_set(parts, colors_lookup)
    write_colors_by_set(destination_path, aggregated)
    with destination_path.open() as csv_file:
        rows = list(csv.DictReader(csv_file))
    assert rows == [
        {
            "set_num": "1000-1",
            "set_id": "1000",
            "year": "2020",
            "color_rgb": "000000",
            "is_translucent": "true",
            "color_name": "Trans-Black",
            "quantity_total": "4",
            "quantity_non_spare": "4",
            "quantity_minifig": "4",
            "quantity_non_minifig": "0",
        },
        {
            "set_num": "1000-1",
            "set_id": "1000",
            "year": "2020",
            "color_rgb": "FFFFFF",
            "is_translucent": "false",
            "color_name": "White",
            "quantity_total": "3",
            "quantity_non_spare": "2",
            "quantity_minifig": "0",
            "quantity_non_minifig": "3",
        },
        {
            "set_num": "2000-1",
            "set_id": "2000",
            "year": "2021",
            "color_rgb": "FFFFFF",
            "is_translucent": "false",
            "color_name": "White",
            "quantity_total": "3",
            "quantity_non_spare": "3",
            "quantity_minifig": "3",
            "quantity_non_minifig": "0",
        },
    ]


@@ -26,10 +26,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
     write_csv(
         sets_path,
-        ["set_num", "set_id", "name", "num_parts"],
+        ["set_num", "set_id", "name", "num_parts", "year"],
         [
-            ["1234-1", "1234", "Sample Set A", "9"],
-            ["5678-1", "5678", "Sample Set B", "2"],
+            ["1234-1", "1234", "Sample Set A", "9", "2020"],
+            ["5678-1", "5678", "Sample Set B", "2", "2021"],
         ],
     )
     write_csv(
@@ -98,8 +98,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "4",
             "is_spare": "false",
+            "is_minifig_part": "false",
         },
         {
             "part_num": "3002",
@@ -107,8 +109,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "true",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "1",
             "is_spare": "true",
+            "is_minifig_part": "false",
         },
         {
             "part_num": "mf-1",
@@ -116,8 +120,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "true",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "1",
             "is_spare": "false",
+            "is_minifig_part": "true",
         },
         {
             "part_num": "mf-2",
@@ -125,8 +131,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "2",
             "is_spare": "false",
+            "is_minifig_part": "true",
         },
         {
             "part_num": "3003",
@@ -134,7 +142,9 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "5678-1",
             "set_id": "5678",
+            "year": "2021",
             "quantity_in_set": "2",
             "is_spare": "false",
+            "is_minifig_part": "false",
         },
     ]