From 6186a5be4fa7875d20bf258f04813280b8e103c0 Mon Sep 17 00:00:00 2001 From: Richard Dern Date: Tue, 2 Dec 2025 01:47:51 +0100 Subject: [PATCH] Normalise les minifigs anonymes en figurants --- README.md | 2 +- config/known_character_aliases.csv | 31 +++++ lib/rebrickable/minifigs_by_set.py | 207 ++++++++++++++++++++++++++++- scripts/compute_minifigs_by_set.py | 10 ++ tests/test_minifigs_by_set.py | 58 +++++++- 5 files changed, 298 insertions(+), 10 deletions(-) create mode 100644 config/known_character_aliases.csv diff --git a/README.md b/README.md index 4b06b9b..986a41d 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,7 @@ Cette étape se lance après le téléchargement des données d'inventaire (éta 1. `source .venv/bin/activate` 2. `python -m scripts.compute_minifigs_by_set` -Le script lit l'inventaire agrégé `data/intermediate/parts_filtered.csv` ainsi que le catalogue des pièces (`data/raw/parts.csv`). Il sélectionne les têtes de minifigs (catégorie 59), ignore les rechanges et dédoublonne par set et référence. Le CSV `data/intermediate/minifigs_by_set.csv` contient une ligne par set et par référence de tête : `set_num`, `part_num`, `part_name`. +Le script lit l'inventaire agrégé `data/intermediate/parts_filtered.csv`, les inventaires `data/raw/inventories.csv`, `data/raw/inventory_parts.csv`, `data/raw/inventory_minifigs.csv`, le catalogue des pièces (`data/raw/parts.csv`) et celui des minifigs (`data/raw/minifigs.csv`). Il sélectionne les têtes de minifigs (catégorie 59), ignore les rechanges et dédoublonne par set et référence. Si une tête est associée à une minifig précise dans l'inventaire du set, `known_character` est renseigné avec le nom de la minifig et `fig_num` est indiqué ; sinon, `known_character` reste vide après tentative de correspondance automatique. Le CSV `data/intermediate/minifigs_by_set.csv` contient : `set_num`, `part_num`, `known_character`, `fig_num`. ### Étape 21 : visualiser le nombre de minifigs par set diff --git a/config/known_character_aliases.csv b/config/known_character_aliases.csv new file mode 100644 index 0000000..77fe8fa --- /dev/null +++ b/config/known_character_aliases.csv @@ -0,0 +1,31 @@ +alias,canonical +Guard in Helmet with Trans-Brown Visor,Figurant +Guard in Dark Blue Cap,Figurant +Guard with Reddish Brown Skin,Figurant +Scientist with Dark Brown Hair and Glasses,Figurant +Guard in Dark Blue Cap and Stubble,Figurant +Guard in Helmet with Visor,Figurant +Guard in Dark Blue Cap with Headset,Figurant +Guard in Dark Bluish Gray Beanie Hat,Figurant +Dr. Henry Wu,Henry Wu +Dr. Henry Loomis,Henry Loomis +ACU Trooper,Figurant +ACU Guard,Figurant +Wildlife Guard - Neck Bracket,Figurant +Guard in Helmet,Figurant +Johnny Thunder (Desert),Figurant +Actress - Pippin Read,Figurant +Cameraman - Blue Legs,Figurant +Stuntman,Figurant +Guard - Beanie Hat,Figurant +Vet with Fedora Hat,Figurant +Vet with Bowl Cut Hair,Figurant +Tracker in Helmet with Visor,Figurant +Guard in Helmet with Goggles,Figurant +Guard in Helmet with Night Vision Goggles,Figurant +Guard,Figurant +Tracker with Mohawk,Figurant +Guard with Scarf,Figurant +Park Worker,Figurant +Park Guest in Dark Pink Vest Jacket,Figurant +Wildlife Guard,Figurant diff --git a/lib/rebrickable/minifigs_by_set.py b/lib/rebrickable/minifigs_by_set.py index 298de11..40feca3 100644 --- a/lib/rebrickable/minifigs_by_set.py +++ b/lib/rebrickable/minifigs_by_set.py @@ -6,9 +6,61 @@ from typing import Dict, Iterable, List, Sequence, Set, Tuple from lib.filesystem import ensure_parent_dir from lib.rebrickable.minifig_heads import HEAD_CATEGORIES +from lib.rebrickable.parts_inventory import ( + index_inventory_minifigs_by_inventory, + index_inventory_parts_by_inventory, + normalize_boolean, + select_latest_inventories, +) from lib.rebrickable.stats import read_rows +KNOWN_CHARACTERS = [ + "Owen Grady", + "Claire Dearing", + "Alan Grant", + "Ellie Sattler", + "Ian Malcolm", + "John Hammond", + "Dennis Nedry", + "Ray Arnold", + "Robert Muldoon", + "Lex Murphy", + "Tim Murphy", + "Donald Gennaro", + "Dr Wu", + "Henry Wu", + "Vic Hoskins", + "Simon Masrani", + "Zia Rodriguez", + "Franklin Webb", + "Rainn DeLaCourt", + "Gunnar Eversol", + "Soyona Santos", + "Kayla Watts", + "Maisie Lockwood", + "Zach Mitchell", + "Gray Mitchell", + "Zach", + "Gray", + "Kenji", + "Darius", + "Yaz", + "Sammy", + "Brooklynn", + "Sinjin Prescott", + "Danny Nedermeyer", + "ACU Trooper", + "Hudson Harper", + "Isabella Delgado", + "Reuben Delgado", + "Allison Miles", + "Henry Loomis", + "Ben", + "Barry" +] + + def load_parts_filtered(path: Path) -> List[dict]: """Charge parts_filtered.csv en mémoire.""" return read_rows(path) @@ -29,10 +81,126 @@ def select_head_parts(catalog: Dict[str, dict]) -> Set[str]: return {part_num for part_num, row in catalog.items() if row["part_cat_id"] in HEAD_CATEGORIES} +def load_minifig_catalog(path: Path) -> Dict[str, dict]: + """Construit un index des minifigs par identifiant.""" + catalog: Dict[str, dict] = {} + with path.open() as minifigs_file: + reader = csv.DictReader(minifigs_file) + for row in reader: + catalog[row["fig_num"]] = row + return catalog + + +def extract_character_name(part_name: str) -> str: + """Extrait un nom probable de personnage depuis le nom de pièce.""" + cleaned = part_name + prefix = "Minifig Head" + if cleaned.startswith(prefix): + cleaned = cleaned[len(prefix) :] + comma_index = cleaned.find(",") + if comma_index != -1: + cleaned = cleaned[:comma_index] + slash_index = cleaned.find("/") + if slash_index != -1: + cleaned = cleaned[:slash_index] + stripped = cleaned.strip() + if stripped == "": + return "Inconnu" + return stripped + + +def select_known_character(extracted_name: str) -> str: + """Associe un personnage connu si le nom extrait correspond à la liste des jalons.""" + lowered = extracted_name.lower() + for character in KNOWN_CHARACTERS: + if character.lower() == lowered: + return character + return "" + + +def load_aliases(path: Path) -> Dict[str, str]: + """Charge les correspondances alias -> nom canonique.""" + aliases: Dict[str, str] = {} + with path.open() as alias_file: + reader = csv.DictReader(alias_file) + for row in reader: + aliases[row["alias"].lower()] = row["canonical"] + return aliases + + +def normalize_known_character(raw_known: str, extracted_name: str, aliases: Dict[str, str]) -> str: + """Nettoie et mappe un nom vers une version canonique.""" + base = raw_known or extracted_name + if base == "Inconnu": + base = "" + base = base.strip() + if base == "": + return "" + if "," in base: + base = base.split(",", 1)[0] + if "/" in base: + base = base.split("/", 1)[0] + cleaned = base.strip() + lowered_cleaned = cleaned.lower() + for alias, canonical in aliases.items(): + if lowered_cleaned == alias or lowered_cleaned.startswith(alias): + return canonical + for character in KNOWN_CHARACTERS: + lowered = character.lower() + if lowered_cleaned == lowered: + return character + if lowered_cleaned.startswith(f"{lowered} "): + return character + if lowered_cleaned.startswith(f"{lowered}'"): + return character + return cleaned + + +def build_set_minifigs_lookup( + inventories: Dict[str, dict], + inventory_minifigs_path: Path, +) -> Dict[str, List[str]]: + """Associe les sets à leurs minifigs via l'inventaire.""" + minifigs_by_inventory = index_inventory_minifigs_by_inventory(inventory_minifigs_path) + lookup: Dict[str, List[str]] = {} + for set_num, inventory in inventories.items(): + lookup[set_num] = [row["fig_num"] for row in minifigs_by_inventory.get(inventory["id"], [])] + return lookup + + +def build_minifig_heads_lookup( + minifig_catalog: Dict[str, dict], + inventories: Dict[str, dict], + inventory_parts_path: Path, + head_parts: Set[str], +) -> Dict[str, Set[str]]: + """Indexe les têtes présentes dans chaque minifig (hors rechanges).""" + parts_by_inventory = index_inventory_parts_by_inventory(inventory_parts_path) + heads_by_minifig: Dict[str, Set[str]] = {} + for fig_num in minifig_catalog: + inventory = inventories.get(fig_num) + if inventory is None: + continue + heads: Set[str] = set() + for part_row in parts_by_inventory.get(inventory["id"], []): + if part_row["part_num"] not in head_parts: + continue + if normalize_boolean(part_row["is_spare"]) == "true": + continue + heads.add(part_row["part_num"]) + if heads: + heads_by_minifig[fig_num] = heads + return heads_by_minifig + + def aggregate_heads_by_set( parts_rows: Iterable[dict], catalog: Dict[str, dict], head_parts: Set[str], + set_minifigs: Dict[str, List[str]], + minifig_heads: Dict[str, Set[str]], + minifig_catalog: Dict[str, dict], + aliases: Dict[str, str], ) -> List[dict]: """Agrège les têtes de minifigs par set en éliminant les rechanges et doublons.""" seen: Set[Tuple[str, str]] = set() @@ -46,11 +214,26 @@ def aggregate_heads_by_set( if key in seen: continue part = catalog[row["part_num"]] + extracted = extract_character_name(part["name"]) + possible_figs = [ + fig_num for fig_num in set_minifigs.get(row["set_num"], []) if row["part_num"] in minifig_heads.get(fig_num, set()) + ] + known_character = "" + matched_fig = "" + if len(possible_figs) == 1: + matched_fig = possible_figs[0] + known_character = minifig_catalog.get(matched_fig, {}).get("name", "") + if known_character == "": + known_character = select_known_character(extracted) + normalized = normalize_known_character(known_character, extracted, aliases) + if matched_fig == "": + continue heads.append( { "set_num": row["set_num"], "part_num": row["part_num"], - "part_name": part["name"], + "known_character": normalized, + "fig_num": matched_fig, } ) seen.add(key) @@ -61,7 +244,7 @@ def aggregate_heads_by_set( def write_heads_by_set(destination_path: Path, rows: Sequence[dict]) -> None: """Écrit le CSV intermédiaire listant les têtes de minifigs par set.""" ensure_parent_dir(destination_path) - fieldnames = ["set_num", "part_num", "part_name"] + fieldnames = ["set_num", "part_num", "known_character", "fig_num"] with destination_path.open("w", newline="") as csv_file: writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() @@ -72,11 +255,29 @@ def write_heads_by_set(destination_path: Path, rows: Sequence[dict]) -> None: def build_minifigs_by_set( parts_filtered_path: Path, parts_catalog_path: Path, + inventories_path: Path, + inventory_parts_path: Path, + inventory_minifigs_path: Path, + minifigs_path: Path, + aliases_path: Path, destination_path: Path, ) -> None: """Construit le CSV listant les têtes de minifigs présentes par set.""" parts_rows = load_parts_filtered(parts_filtered_path) parts_catalog = load_parts_catalog(parts_catalog_path) head_parts = select_head_parts(parts_catalog) - heads = aggregate_heads_by_set(parts_rows, parts_catalog, head_parts) + latest_inventories = select_latest_inventories(inventories_path) + minifig_catalog = load_minifig_catalog(minifigs_path) + minifig_heads = build_minifig_heads_lookup(minifig_catalog, latest_inventories, inventory_parts_path, head_parts) + set_minifigs = build_set_minifigs_lookup(latest_inventories, inventory_minifigs_path) + aliases = load_aliases(aliases_path) + heads = aggregate_heads_by_set( + parts_rows, + parts_catalog, + head_parts, + set_minifigs, + minifig_heads, + minifig_catalog, + aliases, + ) write_heads_by_set(destination_path, heads) diff --git a/scripts/compute_minifigs_by_set.py b/scripts/compute_minifigs_by_set.py index 54eab5b..c454247 100644 --- a/scripts/compute_minifigs_by_set.py +++ b/scripts/compute_minifigs_by_set.py @@ -7,6 +7,11 @@ from lib.rebrickable.minifigs_by_set import build_minifigs_by_set PARTS_FILTERED_PATH = Path("data/intermediate/parts_filtered.csv") PARTS_CATALOG_PATH = Path("data/raw/parts.csv") +INVENTORIES_PATH = Path("data/raw/inventories.csv") +INVENTORY_PARTS_PATH = Path("data/raw/inventory_parts.csv") +INVENTORY_MINIFIGS_PATH = Path("data/raw/inventory_minifigs.csv") +MINIFIGS_PATH = Path("data/raw/minifigs.csv") +ALIASES_PATH = Path("config/known_character_aliases.csv") DESTINATION_PATH = Path("data/intermediate/minifigs_by_set.csv") @@ -15,6 +20,11 @@ def main() -> None: build_minifigs_by_set( PARTS_FILTERED_PATH, PARTS_CATALOG_PATH, + INVENTORIES_PATH, + INVENTORY_PARTS_PATH, + INVENTORY_MINIFIGS_PATH, + MINIFIGS_PATH, + ALIASES_PATH, DESTINATION_PATH, ) diff --git a/tests/test_minifigs_by_set.py b/tests/test_minifigs_by_set.py index b14507d..88970cd 100644 --- a/tests/test_minifigs_by_set.py +++ b/tests/test_minifigs_by_set.py @@ -20,27 +20,73 @@ def test_build_minifigs_by_set_filters_spares_and_deduplicates(tmp_path) -> None "head-b,ffffff,false,123-1,123,2020,2,false,true\n" "head-b,ffffff,false,123-1,123,2020,1,true,true\n" "head-b,ffffff,false,124-1,124,2021,1,false,true\n" + "head-c,ffffff,false,123-1,123,2020,1,false,true\n" "other,000000,false,123-1,123,2020,1,false,false\n", ) parts_catalog_path = tmp_path / "parts.csv" write_csv( parts_catalog_path, "part_num,name,part_cat_id\n" - "head-a,Head A,59\n" - "head-b,Head B,59\n" + "head-a,Minifig Head Owen Grady,59\n" + "head-b,\"Minifig Head, Eyebrows\",59\n" + "head-c,Minifig Head Unknown,59\n" "other,Other,1\n", ) + inventories_path = tmp_path / "inventories.csv" + write_csv( + inventories_path, + "id,version,set_num\n" + "1,1,123-1\n" + "2,1,124-1\n" + "10,1,fig-owen\n" + "11,1,fig-guard\n", + ) + inventory_parts_path = tmp_path / "inventory_parts.csv" + write_csv( + inventory_parts_path, + "inventory_id,part_num,color_id,quantity,is_spare\n" + "10,head-a,1,1,false\n" + "10,other,1,1,false\n" + "11,head-b,1,1,false\n", + ) + inventory_minifigs_path = tmp_path / "inventory_minifigs.csv" + write_csv( + inventory_minifigs_path, + "inventory_id,fig_num,quantity\n" + "1,fig-owen,1\n" + "1,fig-guard,1\n" + "2,fig-owen,1\n" + "2,fig-guard,1\n", + ) + minifigs_path = tmp_path / "minifigs.csv" + write_csv( + minifigs_path, + "fig_num,name\n" + "fig-owen,Owen Grady with Backpack\n" + "fig-guard,Guard in Helmet with Trans-Brown Visor, Female\n", + ) + aliases_path = tmp_path / "known_character_aliases.csv" + write_csv( + aliases_path, + "alias,canonical\n" + "Guard in Helmet with Trans-Brown Visor,Figurant\n", + ) destination_path = tmp_path / "minifigs_by_set.csv" build_minifigs_by_set( parts_filtered_path, parts_catalog_path, + inventories_path, + inventory_parts_path, + inventory_minifigs_path, + minifigs_path, + aliases_path, destination_path, ) assert destination_path.read_text() == ( - "set_num,part_num,part_name\n" - "123-1,head-a,Head A\n" - "123-1,head-b,Head B\n" - "124-1,head-b,Head B\n" + "set_num,part_num,known_character,fig_num\n" + "123-1,head-a,Owen Grady,fig-owen\n" + "123-1,head-b,Figurant,fig-guard\n" + "124-1,head-b,Figurant,fig-guard\n" )