1

Normalise les minifigs anonymes en figurants

This commit is contained in:
Richard Dern 2025-12-02 01:47:51 +01:00
parent 2cf7e063fe
commit 6186a5be4f
5 changed files with 298 additions and 10 deletions

View File

@ -222,7 +222,7 @@ Cette étape se lance après le téléchargement des données d'inventaire (éta
1. `source .venv/bin/activate` 1. `source .venv/bin/activate`
2. `python -m scripts.compute_minifigs_by_set` 2. `python -m scripts.compute_minifigs_by_set`
Le script lit l'inventaire agrégé `data/intermediate/parts_filtered.csv` ainsi que le catalogue des pièces (`data/raw/parts.csv`). Il sélectionne les têtes de minifigs (catégorie 59), ignore les rechanges et dédoublonne par set et référence. Le CSV `data/intermediate/minifigs_by_set.csv` contient une ligne par set et par référence de tête : `set_num`, `part_num`, `part_name`. Le script lit l'inventaire agrégé `data/intermediate/parts_filtered.csv`, les inventaires `data/raw/inventories.csv`, `data/raw/inventory_parts.csv`, `data/raw/inventory_minifigs.csv`, le catalogue des pièces (`data/raw/parts.csv`) et celui des minifigs (`data/raw/minifigs.csv`). Il sélectionne les têtes de minifigs (catégorie 59), ignore les rechanges et dédoublonne par set et référence. Seules les têtes rattachées sans ambiguïté à une unique minifig de l'inventaire du set sont conservées : `known_character` est alors renseigné à partir du nom de la minifig, normalisé via la table d'alias `config/known_character_aliases.csv` et la liste des personnages connus, et `fig_num` est indiqué ; les têtes sans correspondance unique sont écartées du CSV. Le CSV `data/intermediate/minifigs_by_set.csv` contient : `set_num`, `part_num`, `known_character`, `fig_num`.
### Étape 21 : visualiser le nombre de minifigs par set ### Étape 21 : visualiser le nombre de minifigs par set

View File

@ -0,0 +1,31 @@
alias,canonical
Guard in Helmet with Trans-Brown Visor,Figurant
Guard in Dark Blue Cap,Figurant
Guard with Reddish Brown Skin,Figurant
Scientist with Dark Brown Hair and Glasses,Figurant
Guard in Dark Blue Cap and Stubble,Figurant
Guard in Helmet with Visor,Figurant
Guard in Dark Blue Cap with Headset,Figurant
Guard in Dark Bluish Gray Beanie Hat,Figurant
Dr. Henry Wu,Henry Wu
Dr. Henry Loomis,Henry Loomis
ACU Trooper,Figurant
ACU Guard,Figurant
Wildlife Guard - Neck Bracket,Figurant
Guard in Helmet,Figurant
Johnny Thunder (Desert),Figurant
Actress - Pippin Read,Figurant
Cameraman - Blue Legs,Figurant
Stuntman,Figurant
Guard - Beanie Hat,Figurant
Vet with Fedora Hat,Figurant
Vet with Bowl Cut Hair,Figurant
Tracker in Helmet with Visor,Figurant
Guard in Helmet with Goggles,Figurant
Guard in Helmet with Night Vision Goggles,Figurant
Guard,Figurant
Tracker with Mohawk,Figurant
Guard with Scarf,Figurant
Park Worker,Figurant
Park Guest in Dark Pink Vest Jacket,Figurant
Wildlife Guard,Figurant
1 alias canonical
2 Guard in Helmet with Trans-Brown Visor Figurant
3 Guard in Dark Blue Cap Figurant
4 Guard with Reddish Brown Skin Figurant
5 Scientist with Dark Brown Hair and Glasses Figurant
6 Guard in Dark Blue Cap and Stubble Figurant
7 Guard in Helmet with Visor Figurant
8 Guard in Dark Blue Cap with Headset Figurant
9 Guard in Dark Bluish Gray Beanie Hat Figurant
10 Dr. Henry Wu Henry Wu
11 Dr. Henry Loomis Henry Loomis
12 ACU Trooper Figurant
13 ACU Guard Figurant
14 Wildlife Guard - Neck Bracket Figurant
15 Guard in Helmet Figurant
16 Johnny Thunder (Desert) Figurant
17 Actress - Pippin Read Figurant
18 Cameraman - Blue Legs Figurant
19 Stuntman Figurant
20 Guard - Beanie Hat Figurant
21 Vet with Fedora Hat Figurant
22 Vet with Bowl Cut Hair Figurant
23 Tracker in Helmet with Visor Figurant
24 Guard in Helmet with Goggles Figurant
25 Guard in Helmet with Night Vision Goggles Figurant
26 Guard Figurant
27 Tracker with Mohawk Figurant
28 Guard with Scarf Figurant
29 Park Worker Figurant
30 Park Guest in Dark Pink Vest Jacket Figurant
31 Wildlife Guard Figurant

View File

@ -6,9 +6,61 @@ from typing import Dict, Iterable, List, Sequence, Set, Tuple
from lib.filesystem import ensure_parent_dir from lib.filesystem import ensure_parent_dir
from lib.rebrickable.minifig_heads import HEAD_CATEGORIES from lib.rebrickable.minifig_heads import HEAD_CATEGORIES
from lib.rebrickable.parts_inventory import (
index_inventory_minifigs_by_inventory,
index_inventory_parts_by_inventory,
normalize_boolean,
select_latest_inventories,
)
from lib.rebrickable.stats import read_rows from lib.rebrickable.stats import read_rows
# Canonical character names used when normalising minifig names.
# select_known_character() compares an extracted name against these entries
# case-insensitively; normalize_known_character() additionally accepts an
# entry followed by a space or an apostrophe as a prefix of the name.
# Order matters for the prefix match: the first matching entry wins, so a
# full name ("Zach Mitchell") must stay listed before its short form ("Zach").
KNOWN_CHARACTERS = [
"Owen Grady",
"Claire Dearing",
"Alan Grant",
"Ellie Sattler",
"Ian Malcolm",
"John Hammond",
"Dennis Nedry",
"Ray Arnold",
"Robert Muldoon",
"Lex Murphy",
"Tim Murphy",
"Donald Gennaro",
"Dr Wu",
"Henry Wu",
"Vic Hoskins",
"Simon Masrani",
"Zia Rodriguez",
"Franklin Webb",
"Rainn DeLaCourt",
"Gunnar Eversol",
"Soyona Santos",
"Kayla Watts",
"Maisie Lockwood",
"Zach Mitchell",
"Gray Mitchell",
"Zach",
"Gray",
"Kenji",
"Darius",
"Yaz",
"Sammy",
"Brooklynn",
"Sinjin Prescott",
"Danny Nedermeyer",
"ACU Trooper",
"Hudson Harper",
"Isabella Delgado",
"Reuben Delgado",
"Allison Miles",
"Henry Loomis",
"Ben",
"Barry"
]
def load_parts_filtered(path: Path) -> List[dict]:
    """Load every row of parts_filtered.csv into memory.

    Args:
        path: Location of the aggregated inventory CSV.

    Returns:
        The rows produced by ``read_rows`` for that file.
    """
    rows = read_rows(path)
    return rows
@ -29,10 +81,126 @@ def select_head_parts(catalog: Dict[str, dict]) -> Set[str]:
return {part_num for part_num, row in catalog.items() if row["part_cat_id"] in HEAD_CATEGORIES} return {part_num for part_num, row in catalog.items() if row["part_cat_id"] in HEAD_CATEGORIES}
def load_minifig_catalog(path: Path) -> Dict[str, dict]:
    """Build an index of minifig rows keyed by their ``fig_num``.

    Args:
        path: CSV file whose header row includes a ``fig_num`` column.

    Returns:
        Mapping of ``fig_num`` to the full CSV row. When the same
        ``fig_num`` appears on several rows, the last one wins.

    Raises:
        KeyError: If the file has no ``fig_num`` column.
    """
    # The csv module expects files opened with newline="": otherwise the
    # universal-newline translation rewrites line breaks that are embedded
    # in quoted fields before the reader ever sees them.
    with path.open(newline="") as minifigs_file:
        return {row["fig_num"]: row for row in csv.DictReader(minifigs_file)}
def extract_character_name(part_name: str) -> str:
    """Derive a likely character name from a head part name.

    A leading "Minifig Head" prefix is dropped, then everything from the
    first comma onwards and everything from the first slash onwards is
    discarded, and surrounding whitespace is trimmed.

    Args:
        part_name: Name of the part as found in the parts catalog.

    Returns:
        The remaining text, or "Inconnu" when nothing is left.
    """
    head_prefix = "Minifig Head"
    remainder = part_name[len(head_prefix):] if part_name.startswith(head_prefix) else part_name
    # Keep only what precedes the first "," and then the first "/".
    before_comma = remainder.partition(",")[0]
    candidate = before_comma.partition("/")[0].strip()
    return candidate or "Inconnu"
def select_known_character(extracted_name: str) -> str:
    """Return the known character whose name equals the extracted one.

    The comparison ignores case and requires the whole name to match.

    Args:
        extracted_name: Name extracted from a part name.

    Returns:
        The entry of ``KNOWN_CHARACTERS`` that matches (with its original
        casing), or an empty string when none does.
    """
    wanted = extracted_name.lower()
    matches = (name for name in KNOWN_CHARACTERS if name.lower() == wanted)
    return next(matches, "")
def load_aliases(path: Path) -> Dict[str, str]:
    """Load the alias -> canonical name mapping from a CSV file.

    Args:
        path: CSV file with ``alias`` and ``canonical`` columns.

    Returns:
        Mapping whose keys are the lower-cased aliases and whose values are
        the canonical names exactly as written in the file. When an alias is
        listed several times, the last row wins.

    Raises:
        KeyError: If the ``alias`` or ``canonical`` column is missing.
    """
    # The csv module expects files opened with newline="": otherwise the
    # universal-newline translation rewrites line breaks that are embedded
    # in quoted fields before the reader ever sees them.
    with path.open(newline="") as alias_file:
        return {row["alias"].lower(): row["canonical"] for row in csv.DictReader(alias_file)}
def normalize_known_character(raw_known: str, extracted_name: str, aliases: Dict[str, str]) -> str:
    """Clean a character name and map it to its canonical form.

    Args:
        raw_known: Preferred name; when empty, ``extracted_name`` is used.
        extracted_name: Fallback name extracted from the part name.
        aliases: Lower-cased alias -> canonical name mapping.

    Returns:
        An empty string when the chosen name is "Inconnu" or blank.
        Otherwise the name is cut at the first comma, then at the first
        slash, and trimmed; the result is, in order of precedence, the
        canonical name of the first alias the name starts with, the first
        ``KNOWN_CHARACTERS`` entry that equals the name or is followed in it
        by a space or an apostrophe, or the trimmed name itself.
    """
    candidate = raw_known if raw_known else extracted_name
    if candidate == "Inconnu":
        return ""
    candidate = candidate.strip()
    if not candidate:
        return ""

    # Drop any qualifier introduced by a comma, then by a slash.
    trimmed = candidate.partition(",")[0].partition("/")[0].strip()
    needle = trimmed.lower()

    # An exact match is also a prefix match, so one startswith covers both.
    for alias_key, canonical_name in aliases.items():
        if needle.startswith(alias_key):
            return canonical_name

    for known in KNOWN_CHARACTERS:
        known_lower = known.lower()
        if needle == known_lower or needle.startswith((f"{known_lower} ", f"{known_lower}'")):
            return known
    return trimmed
def build_set_minifigs_lookup(
    inventories: Dict[str, dict],
    inventory_minifigs_path: Path,
) -> Dict[str, List[str]]:
    """Map each set to the minifigs listed in its inventory.

    Args:
        inventories: Inventory row per set number; each row must expose an
            ``id`` key.
        inventory_minifigs_path: CSV handed to
            ``index_inventory_minifigs_by_inventory``.

    Returns:
        For every key of ``inventories``, the ``fig_num`` values of the rows
        indexed under that inventory's ``id`` (an empty list when there are
        none).
    """
    figs_by_inventory = index_inventory_minifigs_by_inventory(inventory_minifigs_path)
    return {
        set_num: [entry["fig_num"] for entry in figs_by_inventory.get(inventory["id"], [])]
        for set_num, inventory in inventories.items()
    }
def build_minifig_heads_lookup(
    minifig_catalog: Dict[str, dict],
    inventories: Dict[str, dict],
    inventory_parts_path: Path,
    head_parts: Set[str],
) -> Dict[str, Set[str]]:
    """Index the head parts found in each minifig, spares excluded.

    Args:
        minifig_catalog: Minifig rows keyed by ``fig_num``; only the keys
            are used here.
        inventories: Inventory rows looked up by ``fig_num``; each row must
            expose an ``id`` key.
        inventory_parts_path: CSV handed to
            ``index_inventory_parts_by_inventory``.
        head_parts: Part numbers that count as minifig heads.

    Returns:
        Mapping of ``fig_num`` to the set of head part numbers in its
        inventory. Minifigs without an inventory, or without any non-spare
        head, are left out.
    """
    parts_index = index_inventory_parts_by_inventory(inventory_parts_path)
    result: Dict[str, Set[str]] = {}
    for fig_num in minifig_catalog:
        fig_inventory = inventories.get(fig_num)
        if fig_inventory is None:
            continue
        # The head test comes first so is_spare is only read for head parts.
        fig_heads = {
            entry["part_num"]
            for entry in parts_index.get(fig_inventory["id"], [])
            if entry["part_num"] in head_parts and normalize_boolean(entry["is_spare"]) != "true"
        }
        if fig_heads:
            result[fig_num] = fig_heads
    return result
def aggregate_heads_by_set( def aggregate_heads_by_set(
parts_rows: Iterable[dict], parts_rows: Iterable[dict],
catalog: Dict[str, dict], catalog: Dict[str, dict],
head_parts: Set[str], head_parts: Set[str],
set_minifigs: Dict[str, List[str]],
minifig_heads: Dict[str, Set[str]],
minifig_catalog: Dict[str, dict],
aliases: Dict[str, str],
) -> List[dict]: ) -> List[dict]:
"""Agrège les têtes de minifigs par set en éliminant les rechanges et doublons.""" """Agrège les têtes de minifigs par set en éliminant les rechanges et doublons."""
seen: Set[Tuple[str, str]] = set() seen: Set[Tuple[str, str]] = set()
@ -46,11 +214,26 @@ def aggregate_heads_by_set(
if key in seen: if key in seen:
continue continue
part = catalog[row["part_num"]] part = catalog[row["part_num"]]
extracted = extract_character_name(part["name"])
possible_figs = [
fig_num for fig_num in set_minifigs.get(row["set_num"], []) if row["part_num"] in minifig_heads.get(fig_num, set())
]
known_character = ""
matched_fig = ""
if len(possible_figs) == 1:
matched_fig = possible_figs[0]
known_character = minifig_catalog.get(matched_fig, {}).get("name", "")
if known_character == "":
known_character = select_known_character(extracted)
normalized = normalize_known_character(known_character, extracted, aliases)
if matched_fig == "":
continue
heads.append( heads.append(
{ {
"set_num": row["set_num"], "set_num": row["set_num"],
"part_num": row["part_num"], "part_num": row["part_num"],
"part_name": part["name"], "known_character": normalized,
"fig_num": matched_fig,
} }
) )
seen.add(key) seen.add(key)
@ -61,7 +244,7 @@ def aggregate_heads_by_set(
def write_heads_by_set(destination_path: Path, rows: Sequence[dict]) -> None: def write_heads_by_set(destination_path: Path, rows: Sequence[dict]) -> None:
"""Écrit le CSV intermédiaire listant les têtes de minifigs par set.""" """Écrit le CSV intermédiaire listant les têtes de minifigs par set."""
ensure_parent_dir(destination_path) ensure_parent_dir(destination_path)
fieldnames = ["set_num", "part_num", "part_name"] fieldnames = ["set_num", "part_num", "known_character", "fig_num"]
with destination_path.open("w", newline="") as csv_file: with destination_path.open("w", newline="") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader() writer.writeheader()
@ -72,11 +255,29 @@ def write_heads_by_set(destination_path: Path, rows: Sequence[dict]) -> None:
def build_minifigs_by_set(
    parts_filtered_path: Path,
    parts_catalog_path: Path,
    inventories_path: Path,
    inventory_parts_path: Path,
    inventory_minifigs_path: Path,
    minifigs_path: Path,
    aliases_path: Path,
    destination_path: Path,
) -> None:
    """Build the CSV listing the minifig heads present in each set.

    Args:
        parts_filtered_path: Aggregated inventory CSV (parts_filtered.csv).
        parts_catalog_path: Parts catalog CSV.
        inventories_path: Inventories CSV, passed to
            ``select_latest_inventories``.
        inventory_parts_path: Inventory parts CSV.
        inventory_minifigs_path: Inventory minifigs CSV.
        minifigs_path: Minifig catalog CSV.
        aliases_path: Alias -> canonical character name CSV.
        destination_path: Where the resulting CSV is written.
    """
    # Inputs are loaded in the same order as before so that a missing file
    # is reported at the same point.
    inventory_rows = load_parts_filtered(parts_filtered_path)
    catalog = load_parts_catalog(parts_catalog_path)
    head_part_nums = select_head_parts(catalog)
    inventories = select_latest_inventories(inventories_path)
    minifigs = load_minifig_catalog(minifigs_path)
    heads_by_fig = build_minifig_heads_lookup(minifigs, inventories, inventory_parts_path, head_part_nums)
    figs_by_set = build_set_minifigs_lookup(inventories, inventory_minifigs_path)
    alias_map = load_aliases(aliases_path)

    rows = aggregate_heads_by_set(
        parts_rows=inventory_rows,
        catalog=catalog,
        head_parts=head_part_nums,
        set_minifigs=figs_by_set,
        minifig_heads=heads_by_fig,
        minifig_catalog=minifigs,
        aliases=alias_map,
    )
    write_heads_by_set(destination_path, rows)

View File

@ -7,6 +7,11 @@ from lib.rebrickable.minifigs_by_set import build_minifigs_by_set
PARTS_FILTERED_PATH = Path("data/intermediate/parts_filtered.csv") PARTS_FILTERED_PATH = Path("data/intermediate/parts_filtered.csv")
PARTS_CATALOG_PATH = Path("data/raw/parts.csv") PARTS_CATALOG_PATH = Path("data/raw/parts.csv")
INVENTORIES_PATH = Path("data/raw/inventories.csv")
INVENTORY_PARTS_PATH = Path("data/raw/inventory_parts.csv")
INVENTORY_MINIFIGS_PATH = Path("data/raw/inventory_minifigs.csv")
MINIFIGS_PATH = Path("data/raw/minifigs.csv")
ALIASES_PATH = Path("config/known_character_aliases.csv")
DESTINATION_PATH = Path("data/intermediate/minifigs_by_set.csv") DESTINATION_PATH = Path("data/intermediate/minifigs_by_set.csv")
@ -15,6 +20,11 @@ def main() -> None:
build_minifigs_by_set( build_minifigs_by_set(
PARTS_FILTERED_PATH, PARTS_FILTERED_PATH,
PARTS_CATALOG_PATH, PARTS_CATALOG_PATH,
INVENTORIES_PATH,
INVENTORY_PARTS_PATH,
INVENTORY_MINIFIGS_PATH,
MINIFIGS_PATH,
ALIASES_PATH,
DESTINATION_PATH, DESTINATION_PATH,
) )

View File

@ -20,27 +20,73 @@ def test_build_minifigs_by_set_filters_spares_and_deduplicates(tmp_path) -> None
"head-b,ffffff,false,123-1,123,2020,2,false,true\n" "head-b,ffffff,false,123-1,123,2020,2,false,true\n"
"head-b,ffffff,false,123-1,123,2020,1,true,true\n" "head-b,ffffff,false,123-1,123,2020,1,true,true\n"
"head-b,ffffff,false,124-1,124,2021,1,false,true\n" "head-b,ffffff,false,124-1,124,2021,1,false,true\n"
"head-c,ffffff,false,123-1,123,2020,1,false,true\n"
"other,000000,false,123-1,123,2020,1,false,false\n", "other,000000,false,123-1,123,2020,1,false,false\n",
) )
parts_catalog_path = tmp_path / "parts.csv" parts_catalog_path = tmp_path / "parts.csv"
write_csv( write_csv(
parts_catalog_path, parts_catalog_path,
"part_num,name,part_cat_id\n" "part_num,name,part_cat_id\n"
"head-a,Head A,59\n" "head-a,Minifig Head Owen Grady,59\n"
"head-b,Head B,59\n" "head-b,\"Minifig Head, Eyebrows\",59\n"
"head-c,Minifig Head Unknown,59\n"
"other,Other,1\n", "other,Other,1\n",
) )
inventories_path = tmp_path / "inventories.csv"
write_csv(
inventories_path,
"id,version,set_num\n"
"1,1,123-1\n"
"2,1,124-1\n"
"10,1,fig-owen\n"
"11,1,fig-guard\n",
)
inventory_parts_path = tmp_path / "inventory_parts.csv"
write_csv(
inventory_parts_path,
"inventory_id,part_num,color_id,quantity,is_spare\n"
"10,head-a,1,1,false\n"
"10,other,1,1,false\n"
"11,head-b,1,1,false\n",
)
inventory_minifigs_path = tmp_path / "inventory_minifigs.csv"
write_csv(
inventory_minifigs_path,
"inventory_id,fig_num,quantity\n"
"1,fig-owen,1\n"
"1,fig-guard,1\n"
"2,fig-owen,1\n"
"2,fig-guard,1\n",
)
minifigs_path = tmp_path / "minifigs.csv"
write_csv(
minifigs_path,
"fig_num,name\n"
"fig-owen,Owen Grady with Backpack\n"
"fig-guard,Guard in Helmet with Trans-Brown Visor, Female\n",
)
aliases_path = tmp_path / "known_character_aliases.csv"
write_csv(
aliases_path,
"alias,canonical\n"
"Guard in Helmet with Trans-Brown Visor,Figurant\n",
)
destination_path = tmp_path / "minifigs_by_set.csv" destination_path = tmp_path / "minifigs_by_set.csv"
build_minifigs_by_set( build_minifigs_by_set(
parts_filtered_path, parts_filtered_path,
parts_catalog_path, parts_catalog_path,
inventories_path,
inventory_parts_path,
inventory_minifigs_path,
minifigs_path,
aliases_path,
destination_path, destination_path,
) )
assert destination_path.read_text() == ( assert destination_path.read_text() == (
"set_num,part_num,part_name\n" "set_num,part_num,known_character,fig_num\n"
"123-1,head-a,Head A\n" "123-1,head-a,Owen Grady,fig-owen\n"
"123-1,head-b,Head B\n" "123-1,head-b,Figurant,fig-guard\n"
"124-1,head-b,Head B\n" "124-1,head-b,Figurant,fig-guard\n"
) )