
Creation of an intermediate file for color statistics

Richard Dern 2025-12-01 22:15:48 +01:00
parent cf83f51f89
commit 4f42303eac
5 changed files with 228 additions and 3 deletions


@@ -130,6 +130,7 @@ The script downloads the compressed files `inventories.csv.gz`, `inventory
2. `python -m scripts.build_parts_inventory`
The script reads `data/intermediate/sets_enriched.csv`, `data/raw/inventories.csv`, `data/raw/inventory_parts.csv`, `data/raw/inventory_minifigs.csv`, `data/raw/minifigs.csv` and `data/raw/colors.csv`, selects the most recent inventory version for each set, then produces `data/intermediate/parts_filtered.csv` containing: `part_num`, `color_rgb`, `is_translucent`, `set_num`, `set_id`, `quantity_in_set`, `is_spare`. Minifigs are broken down into their parts using their own inventory (present in `inventories.csv` + `inventory_parts.csv`) and their quantities in `inventory_minifigs.csv`. This file serves as the basis for later analyses without rereading the raw CSVs.
The aggregated file also includes the set's year (`year`) and an `is_minifig_part` flag to distinguish parts that come from minifigs.
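To make the "most recent inventory version" selection concrete, here is a minimal sketch of that step, assuming the usual Rebrickable `inventories.csv` columns (`id`, `version`, `set_num`); the helper name and approach are illustrative, not the script's actual code:

```python
import csv
from pathlib import Path


def latest_inventories(inventories_path: Path) -> dict:
    """Keep only the highest inventory version for each set (illustrative sketch)."""
    latest: dict = {}
    with inventories_path.open() as handle:
        for row in csv.DictReader(handle):
            current = latest.get(row["set_num"])
            # Assumes `version` is a numeric string, as in the Rebrickable dump.
            if current is None or int(row["version"]) > int(current["version"]):
                latest[row["set_num"]] = row
    return latest
```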
### Step 10: identify inventory gaps
@@ -155,3 +156,10 @@ A Markdown table is also generated in `data/final/inventory_gaps.md`
2. `python -m scripts.compute_parts_stats`
The script reads `data/intermediate/parts_filtered.csv` and `data/final/stats.csv` (for the filtered catalogue total), then produces `data/final/parts_stats.csv` with: the number of part variations (excluding spares), the least used part, the most common part, the number of colors used, the total number of parts excluding spares, the gap between the expected part total (catalogue stats) and the aggregated inventory, the number of sets showing an inventory/catalogue gap, and the largest gap observed.
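As a rough illustration of the kind of aggregation these figures require (a sketch only, not the script itself; the helper name and the string values it assumes, such as `"false"` for `is_spare`, follow the file layout described above):

```python
import csv
from collections import Counter
from pathlib import Path


def quick_parts_stats(parts_path: Path) -> dict:
    """Derive a few of the listed figures from parts_filtered.csv (illustrative only)."""
    colors: set = set()
    part_usage: Counter = Counter()
    non_spare_total = 0
    with parts_path.open() as handle:
        for row in csv.DictReader(handle):
            quantity = int(row["quantity_in_set"])
            colors.add((row["color_rgb"], row["is_translucent"]))
            if row["is_spare"] == "false":
                part_usage[row["part_num"]] += quantity
                non_spare_total += quantity
    # Most common part by non-spare quantity across the whole catalogue.
    most_common_part, _ = part_usage.most_common(1)[0]
    return {
        "colors_used": len(colors),
        "non_spare_total": non_spare_total,
        "most_common_part": most_common_part,
    }
```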
### Step 13: color palette per set
1. `source .venv/bin/activate`
2. `python -m scripts.build_colors_by_set`
The script joins `data/intermediate/parts_filtered.csv` with the color labels from `data/raw/colors.csv` and produces `data/intermediate/colors_by_set.csv` containing, for each set and each color, the total quantities, the quantities excluding spares, the quantities coming from minifigs and those outside minifigs. This file serves as the basis for the palette visualizations and matrices.
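One way the new file could feed a palette matrix, as a minimal sketch assuming only the columns listed above (the function is hypothetical, not part of this commit):

```python
import csv
from collections import defaultdict
from pathlib import Path


def palette_matrix(colors_by_set_path: Path) -> dict:
    """Map each set_num to {color_name: non-spare quantity}, ready for plotting."""
    matrix: dict = defaultdict(dict)
    with colors_by_set_path.open() as handle:
        for row in csv.DictReader(handle):
            matrix[row["set_num"]][row["color_name"]] = int(row["quantity_non_spare"])
    return dict(matrix)
```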


@@ -0,0 +1,79 @@
"""Aggregation of the colors used per set."""

import csv
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

from lib.filesystem import ensure_parent_dir


def load_parts(parts_path: Path) -> List[dict]:
    """Load the parts_filtered file for aggregation."""
    with parts_path.open() as parts_file:
        reader = csv.DictReader(parts_file)
        return list(reader)


def build_colors_lookup(colors_path: Path) -> Dict[Tuple[str, str], str]:
    """Build an (rgb, is_translucent) -> color name index."""
    colors: Dict[Tuple[str, str], str] = {}
    with colors_path.open() as colors_file:
        reader = csv.DictReader(colors_file)
        for row in reader:
            colors[(row["rgb"], row["is_trans"].lower())] = row["name"]
    return colors


def aggregate_colors_by_set(parts: Iterable[dict], colors_lookup: Dict[Tuple[str, str], str]) -> List[dict]:
    """Aggregate quantities per set and per color."""
    totals: Dict[Tuple[str, str, str, str, str], dict] = {}
    for row in parts:
        key = (row["set_num"], row["set_id"], row["year"], row["color_rgb"], row["is_translucent"])
        existing = totals.get(key)
        if existing is None:
            totals[key] = {
                "set_num": row["set_num"],
                "set_id": row["set_id"],
                "year": row["year"],
                "color_rgb": row["color_rgb"],
                "is_translucent": row["is_translucent"],
                "color_name": colors_lookup[(row["color_rgb"], row["is_translucent"])],
                "quantity_total": 0,
                "quantity_non_spare": 0,
                "quantity_minifig": 0,
                "quantity_non_minifig": 0,
            }
            existing = totals[key]
        quantity = int(row["quantity_in_set"])
        existing["quantity_total"] += quantity
        if row["is_spare"] == "false":
            existing["quantity_non_spare"] += quantity
        if row["is_minifig_part"] == "true":
            existing["quantity_minifig"] += quantity
        else:
            existing["quantity_non_minifig"] += quantity
    aggregated = list(totals.values())
    aggregated.sort(key=lambda row: (row["set_num"], row["color_name"], row["is_translucent"]))
    return aggregated


def write_colors_by_set(destination_path: Path, rows: Iterable[dict]) -> None:
    """Serialize the set × color aggregate to a dedicated CSV."""
    ensure_parent_dir(destination_path)
    with destination_path.open("w", newline="") as csv_file:
        fieldnames = [
            "set_num",
            "set_id",
            "year",
            "color_rgb",
            "is_translucent",
            "color_name",
            "quantity_total",
            "quantity_non_spare",
            "quantity_minifig",
            "quantity_non_minifig",
        ]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
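Note that the `colors_lookup[...]` access in `aggregate_colors_by_set` raises `KeyError` when a `(rgb, is_translucent)` pair from `parts_filtered.csv` has no match in `colors.csv`, so missing colors fail loudly. If a softer behaviour were ever preferred, a fallback along these lines could be used (hypothetical variant, not in the module):

```python
from typing import Dict, Tuple


def lookup_color_name(colors_lookup: Dict[Tuple[str, str], str], rgb: str, is_translucent: str) -> str:
    """Hypothetical defensive lookup: return a placeholder instead of raising KeyError."""
    return colors_lookup.get((rgb, is_translucent), "[Unknown]")
```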


@@ -0,0 +1,27 @@
"""Build the set × color aggregate ready for visualizations."""

from pathlib import Path

from lib.rebrickable.colors_by_set import (
    aggregate_colors_by_set,
    build_colors_lookup,
    load_parts,
    write_colors_by_set,
)

PARTS_PATH = Path("data/intermediate/parts_filtered.csv")
COLORS_PATH = Path("data/raw/colors.csv")
DESTINATION_PATH = Path("data/intermediate/colors_by_set.csv")


def main() -> None:
    """Generate colors_by_set.csv from parts_filtered.csv."""
    parts = load_parts(PARTS_PATH)
    colors_lookup = build_colors_lookup(COLORS_PATH)
    aggregated = aggregate_colors_by_set(parts, colors_lookup)
    write_colors_by_set(DESTINATION_PATH, aggregated)


if __name__ == "__main__":
    main()

tests/test_colors_by_set.py

@@ -0,0 +1,101 @@
"""Tests for the per-set color aggregation."""

import csv
from pathlib import Path

from lib.rebrickable.colors_by_set import (
    aggregate_colors_by_set,
    build_colors_lookup,
    load_parts,
    write_colors_by_set,
)


def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
    """Write a simple CSV for test purposes."""
    with path.open("w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        writer.writerows(rows)


def test_aggregate_colors_by_set(tmp_path: Path) -> None:
    """Group quantities per set and per color."""
    parts_path = tmp_path / "parts_filtered.csv"
    colors_path = tmp_path / "colors.csv"
    destination_path = tmp_path / "colors_by_set.csv"
    write_csv(
        parts_path,
        [
            "part_num",
            "color_rgb",
            "is_translucent",
            "set_num",
            "set_id",
            "year",
            "quantity_in_set",
            "is_spare",
            "is_minifig_part",
        ],
        [
            ["3001", "FFFFFF", "false", "1000-1", "1000", "2020", "2", "false", "false"],
            ["3002", "FFFFFF", "false", "1000-1", "1000", "2020", "1", "true", "false"],
            ["3003", "000000", "true", "1000-1", "1000", "2020", "4", "false", "true"],
            ["4001", "FFFFFF", "false", "2000-1", "2000", "2021", "3", "false", "true"],
        ],
    )
    write_csv(
        colors_path,
        ["id", "name", "rgb", "is_trans", "num_parts", "num_sets", "y1", "y2"],
        [
            ["1", "White", "FFFFFF", "False", "0", "0", "0", "0"],
            ["2", "Trans-Black", "000000", "True", "0", "0", "0", "0"],
        ],
    )
    parts = load_parts(parts_path)
    colors_lookup = build_colors_lookup(colors_path)
    aggregated = aggregate_colors_by_set(parts, colors_lookup)
    write_colors_by_set(destination_path, aggregated)
    with destination_path.open() as csv_file:
        rows = list(csv.DictReader(csv_file))
    assert rows == [
        {
            "set_num": "1000-1",
            "set_id": "1000",
            "year": "2020",
            "color_rgb": "000000",
            "is_translucent": "true",
            "color_name": "Trans-Black",
            "quantity_total": "4",
            "quantity_non_spare": "4",
            "quantity_minifig": "4",
            "quantity_non_minifig": "0",
        },
        {
            "set_num": "1000-1",
            "set_id": "1000",
            "year": "2020",
            "color_rgb": "FFFFFF",
            "is_translucent": "false",
            "color_name": "White",
            "quantity_total": "3",
            "quantity_non_spare": "2",
            "quantity_minifig": "0",
            "quantity_non_minifig": "3",
        },
        {
            "set_num": "2000-1",
            "set_id": "2000",
            "year": "2021",
            "color_rgb": "FFFFFF",
            "is_translucent": "false",
            "color_name": "White",
            "quantity_total": "3",
            "quantity_non_spare": "3",
            "quantity_minifig": "3",
            "quantity_non_minifig": "0",
        },
    ]


@@ -26,10 +26,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
     write_csv(
         sets_path,
-        ["set_num", "set_id", "name", "num_parts"],
+        ["set_num", "set_id", "name", "num_parts", "year"],
         [
-            ["1234-1", "1234", "Sample Set A", "9"],
-            ["5678-1", "5678", "Sample Set B", "2"],
+            ["1234-1", "1234", "Sample Set A", "9", "2020"],
+            ["5678-1", "5678", "Sample Set B", "2", "2021"],
         ],
     )
     write_csv(
@@ -98,8 +98,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "4",
             "is_spare": "false",
+            "is_minifig_part": "false",
         },
         {
             "part_num": "3002",
@@ -107,8 +109,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "true",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "1",
             "is_spare": "true",
+            "is_minifig_part": "false",
         },
         {
             "part_num": "mf-1",
@@ -116,8 +120,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "true",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "1",
             "is_spare": "false",
+            "is_minifig_part": "true",
         },
         {
             "part_num": "mf-2",
@@ -125,8 +131,10 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "1234-1",
             "set_id": "1234",
+            "year": "2020",
             "quantity_in_set": "2",
             "is_spare": "false",
+            "is_minifig_part": "true",
         },
         {
             "part_num": "3003",
@@ -134,7 +142,9 @@ def test_write_parts_filtered(tmp_path: Path) -> None:
             "is_translucent": "false",
             "set_num": "5678-1",
             "set_id": "5678",
+            "year": "2021",
             "quantity_in_set": "2",
             "is_spare": "false",
+            "is_minifig_part": "false",
         },
     ]