Normalise les minifigs anonymes en figurants
This commit is contained in:
parent
2cf7e063fe
commit
6186a5be4f
@ -222,7 +222,7 @@ Cette étape se lance après le téléchargement des données d'inventaire (éta
|
||||
1. `source .venv/bin/activate`
|
||||
2. `python -m scripts.compute_minifigs_by_set`
|
||||
|
||||
Le script lit l'inventaire agrégé `data/intermediate/parts_filtered.csv` ainsi que le catalogue des pièces (`data/raw/parts.csv`). Il sélectionne les têtes de minifigs (catégorie 59), ignore les rechanges et dédoublonne par set et référence. Le CSV `data/intermediate/minifigs_by_set.csv` contient une ligne par set et par référence de tête : `set_num`, `part_num`, `part_name`.
|
||||
Le script lit l'inventaire agrégé `data/intermediate/parts_filtered.csv`, les inventaires `data/raw/inventories.csv`, `data/raw/inventory_parts.csv`, `data/raw/inventory_minifigs.csv`, le catalogue des pièces (`data/raw/parts.csv`), celui des minifigs (`data/raw/minifigs.csv`) et la table d'alias `config/known_character_aliases.csv`. Il sélectionne les têtes de minifigs (catégorie 59), ignore les rechanges et dédoublonne par set et référence. Si une tête est associée à une seule minifig dans l'inventaire du set, `fig_num` est indiqué et `known_character` est renseigné avec le nom normalisé de la minifig (les alias ramènent notamment les minifigs anonymes à `Figurant`) ; sinon, la tête est ignorée et n'apparaît pas dans le CSV. Le CSV `data/intermediate/minifigs_by_set.csv` contient : `set_num`, `part_num`, `known_character`, `fig_num`.
|
||||
|
||||
### Étape 21 : visualiser le nombre de minifigs par set
|
||||
|
||||
|
||||
31
config/known_character_aliases.csv
Normal file
31
config/known_character_aliases.csv
Normal file
@ -0,0 +1,31 @@
|
||||
alias,canonical
|
||||
Guard in Helmet with Trans-Brown Visor,Figurant
|
||||
Guard in Dark Blue Cap,Figurant
|
||||
Guard with Reddish Brown Skin,Figurant
|
||||
Scientist with Dark Brown Hair and Glasses,Figurant
|
||||
Guard in Dark Blue Cap and Stubble,Figurant
|
||||
Guard in Helmet with Visor,Figurant
|
||||
Guard in Dark Blue Cap with Headset,Figurant
|
||||
Guard in Dark Bluish Gray Beanie Hat,Figurant
|
||||
Dr. Henry Wu,Henry Wu
|
||||
Dr. Henry Loomis,Henry Loomis
|
||||
ACU Trooper,Figurant
|
||||
ACU Guard,Figurant
|
||||
Wildlife Guard - Neck Bracket,Figurant
|
||||
Guard in Helmet,Figurant
|
||||
Johnny Thunder (Desert),Figurant
|
||||
Actress - Pippin Read,Figurant
|
||||
Cameraman - Blue Legs,Figurant
|
||||
Stuntman,Figurant
|
||||
Guard - Beanie Hat,Figurant
|
||||
Vet with Fedora Hat,Figurant
|
||||
Vet with Bowl Cut Hair,Figurant
|
||||
Tracker in Helmet with Visor,Figurant
|
||||
Guard in Helmet with Goggles,Figurant
|
||||
Guard in Helmet with Night Vision Goggles,Figurant
|
||||
Guard,Figurant
|
||||
Tracker with Mohawk,Figurant
|
||||
Guard with Scarf,Figurant
|
||||
Park Worker,Figurant
|
||||
Park Guest in Dark Pink Vest Jacket,Figurant
|
||||
Wildlife Guard,Figurant
|
||||
|
@ -6,9 +6,61 @@ from typing import Dict, Iterable, List, Sequence, Set, Tuple
|
||||
|
||||
from lib.filesystem import ensure_parent_dir
|
||||
from lib.rebrickable.minifig_heads import HEAD_CATEGORIES
|
||||
from lib.rebrickable.parts_inventory import (
|
||||
index_inventory_minifigs_by_inventory,
|
||||
index_inventory_parts_by_inventory,
|
||||
normalize_boolean,
|
||||
select_latest_inventories,
|
||||
)
|
||||
from lib.rebrickable.stats import read_rows
|
||||
|
||||
|
||||
# Canonical names of the named characters recognised when labelling heads.
# The lookups in this module walk the list in order and return the first
# entry that matches, so a full name must stay ahead of the shorter name it
# starts with (e.g. "Zach Mitchell" before "Zach").
KNOWN_CHARACTERS = [
    "Owen Grady",
    "Claire Dearing",
    "Alan Grant",
    "Ellie Sattler",
    "Ian Malcolm",
    "John Hammond",
    "Dennis Nedry",
    "Ray Arnold",
    "Robert Muldoon",
    "Lex Murphy",
    "Tim Murphy",
    "Donald Gennaro",
    "Dr Wu",
    "Henry Wu",
    "Vic Hoskins",
    "Simon Masrani",
    "Zia Rodriguez",
    "Franklin Webb",
    "Rainn DeLaCourt",
    "Gunnar Eversol",
    "Soyona Santos",
    "Kayla Watts",
    "Maisie Lockwood",
    "Zach Mitchell",
    "Gray Mitchell",
    "Zach",
    "Gray",
    "Kenji",
    "Darius",
    "Yaz",
    "Sammy",
    "Brooklynn",
    "Sinjin Prescott",
    "Danny Nedermeyer",
    "ACU Trooper",
    "Hudson Harper",
    "Isabella Delgado",
    "Reuben Delgado",
    "Allison Miles",
    "Henry Loomis",
    "Ben",
    "Barry"
]
|
||||
|
||||
|
||||
def load_parts_filtered(path: Path) -> List[dict]:
    """Load the rows of parts_filtered.csv into memory.

    Thin wrapper: delegates to ``read_rows`` and returns its result unchanged.
    """
    return read_rows(path)
|
||||
@ -29,10 +81,126 @@ def select_head_parts(catalog: Dict[str, dict]) -> Set[str]:
|
||||
return {part_num for part_num, row in catalog.items() if row["part_cat_id"] in HEAD_CATEGORIES}
|
||||
|
||||
|
||||
def load_minifig_catalog(path: Path) -> Dict[str, dict]:
    """Build an index of minifig rows keyed by their ``fig_num``.

    Args:
        path: CSV file with a header row that includes a ``fig_num`` column.

    Returns:
        A mapping ``fig_num -> full CSV row``. When a ``fig_num`` appears
        several times, the last row wins.

    Raises:
        KeyError: if a row has no ``fig_num`` column.
    """
    # newline="" lets the csv module handle line endings itself (required for
    # quoted fields that contain newlines); the explicit encoding keeps the
    # result independent of the platform's default locale.
    with path.open(newline="", encoding="utf-8") as minifigs_file:
        return {row["fig_num"]: row for row in csv.DictReader(minifigs_file)}
|
||||
|
||||
|
||||
def extract_character_name(part_name: str) -> str:
    """Guess a character name from a head part's catalogue name.

    A leading "Minifig Head" is removed, then everything from the first comma
    onwards and everything from the first slash onwards is discarded. The
    remainder is trimmed; when nothing is left the placeholder "Inconnu" is
    returned.
    """
    marker = "Minifig Head"
    remainder = part_name[len(marker):] if part_name.startswith(marker) else part_name
    # Cut at the comma first, then at the slash, keeping only the left side.
    for separator in (",", "/"):
        remainder = remainder.partition(separator)[0]
    candidate = remainder.strip()
    return candidate or "Inconnu"
|
||||
|
||||
|
||||
def select_known_character(extracted_name: str) -> str:
    """Return the KNOWN_CHARACTERS entry equal to the name, ignoring case.

    The comparison is an exact, case-insensitive match on the whole string.
    An empty string is returned when no entry matches.
    """
    wanted = extracted_name.lower()
    return next((name for name in KNOWN_CHARACTERS if name.lower() == wanted), "")
|
||||
|
||||
|
||||
def load_aliases(path: Path) -> Dict[str, str]:
    """Load the ``alias -> canonical name`` table from a CSV file.

    Args:
        path: CSV file with ``alias`` and ``canonical`` header columns.

    Returns:
        A mapping whose keys are the aliases, trimmed and lower-cased, and
        whose values are the trimmed canonical names. Rows with an empty
        alias are skipped.

    Raises:
        KeyError: if the ``alias`` or ``canonical`` column is missing.
    """
    aliases: Dict[str, str] = {}
    # newline="" and an explicit encoding: see the csv module documentation;
    # this keeps parsing independent of platform newline/locale defaults.
    with path.open(newline="", encoding="utf-8") as alias_file:
        for row in csv.DictReader(alias_file):
            alias = (row["alias"] or "").strip().lower()
            if not alias:
                # An empty key would be a prefix of every name and silently
                # remap all characters to this row's canonical value.
                continue
            aliases[alias] = (row["canonical"] or "").strip()
    return aliases
|
||||
|
||||
|
||||
def normalize_known_character(raw_known: str, extracted_name: str, aliases: Dict[str, str]) -> str:
    """Clean a character name and map it to its canonical form.

    Args:
        raw_known: Name already attributed to the head; may be empty.
        extracted_name: Fallback name used when ``raw_known`` is empty.
        aliases: Mapping of lower-cased alias to canonical name.

    Returns:
        ``""`` when there is no usable name (empty, or the "Inconnu"
        placeholder). Otherwise, in order of priority: the canonical name of
        a matching alias, the matching ``KNOWN_CHARACTERS`` entry, or the
        cleaned name itself (text before the first comma and slash, trimmed).
    """
    base = (raw_known or extracted_name or "").strip()
    if base == "" or base == "Inconnu":
        return ""
    # Keep only what precedes the first comma, then the first slash.
    for separator in (",", "/"):
        base = base.split(separator, 1)[0]
    cleaned = base.strip()
    if cleaned == "":
        return ""
    lowered_cleaned = cleaned.lower()

    # An exact alias always wins over any prefix match.
    exact = aliases.get(lowered_cleaned)
    if exact is not None:
        return exact
    # Prefix matches: try the longest alias first so a short alias cannot
    # shadow a more specific one, and require a non-alphanumeric character
    # after the alias so that "guard" does not capture "guardian".
    for alias in sorted(aliases, key=len, reverse=True):
        if not alias or len(lowered_cleaned) <= len(alias):
            continue
        if lowered_cleaned.startswith(alias) and not lowered_cleaned[len(alias)].isalnum():
            return aliases[alias]

    # Known characters: exact name, or the name followed by a space or an
    # apostrophe. The first matching entry in list order is returned.
    for character in KNOWN_CHARACTERS:
        lowered = character.lower()
        if lowered_cleaned == lowered or lowered_cleaned.startswith((f"{lowered} ", f"{lowered}'")):
            return character
    return cleaned
|
||||
|
||||
|
||||
def build_set_minifigs_lookup(
    inventories: Dict[str, dict],
    inventory_minifigs_path: Path,
) -> Dict[str, List[str]]:
    """Map each set number to the ``fig_num`` values listed in its inventory.

    Every key of ``inventories`` appears in the result; a set whose inventory
    id has no entry in the minifig index maps to an empty list.
    """
    figs_index = index_inventory_minifigs_by_inventory(inventory_minifigs_path)
    return {
        set_num: [entry["fig_num"] for entry in figs_index.get(inventory["id"], [])]
        for set_num, inventory in inventories.items()
    }
|
||||
|
||||
|
||||
def build_minifig_heads_lookup(
    minifig_catalog: Dict[str, dict],
    inventories: Dict[str, dict],
    inventory_parts_path: Path,
    head_parts: Set[str],
) -> Dict[str, Set[str]]:
    """Index the head parts found in each minifig's inventory, spares excluded.

    Minifigs with no entry in ``inventories`` and minifigs that end up with
    no head part are left out of the result.
    """
    parts_index = index_inventory_parts_by_inventory(inventory_parts_path)
    result: Dict[str, Set[str]] = {}
    for fig_num in minifig_catalog:
        fig_inventory = inventories.get(fig_num)
        if fig_inventory is None:
            continue
        # Keep parts that are heads and are not flagged as spare.
        found = {
            entry["part_num"]
            for entry in parts_index.get(fig_inventory["id"], [])
            if entry["part_num"] in head_parts and normalize_boolean(entry["is_spare"]) != "true"
        }
        if found:
            result[fig_num] = found
    return result
|
||||
|
||||
|
||||
def aggregate_heads_by_set(
|
||||
parts_rows: Iterable[dict],
|
||||
catalog: Dict[str, dict],
|
||||
head_parts: Set[str],
|
||||
set_minifigs: Dict[str, List[str]],
|
||||
minifig_heads: Dict[str, Set[str]],
|
||||
minifig_catalog: Dict[str, dict],
|
||||
aliases: Dict[str, str],
|
||||
) -> List[dict]:
|
||||
"""Agrège les têtes de minifigs par set en éliminant les rechanges et doublons."""
|
||||
seen: Set[Tuple[str, str]] = set()
|
||||
@ -46,11 +214,26 @@ def aggregate_heads_by_set(
|
||||
if key in seen:
|
||||
continue
|
||||
part = catalog[row["part_num"]]
|
||||
extracted = extract_character_name(part["name"])
|
||||
possible_figs = [
|
||||
fig_num for fig_num in set_minifigs.get(row["set_num"], []) if row["part_num"] in minifig_heads.get(fig_num, set())
|
||||
]
|
||||
known_character = ""
|
||||
matched_fig = ""
|
||||
if len(possible_figs) == 1:
|
||||
matched_fig = possible_figs[0]
|
||||
known_character = minifig_catalog.get(matched_fig, {}).get("name", "")
|
||||
if known_character == "":
|
||||
known_character = select_known_character(extracted)
|
||||
normalized = normalize_known_character(known_character, extracted, aliases)
|
||||
if matched_fig == "":
|
||||
continue
|
||||
heads.append(
|
||||
{
|
||||
"set_num": row["set_num"],
|
||||
"part_num": row["part_num"],
|
||||
"part_name": part["name"],
|
||||
"known_character": normalized,
|
||||
"fig_num": matched_fig,
|
||||
}
|
||||
)
|
||||
seen.add(key)
|
||||
@ -61,7 +244,7 @@ def aggregate_heads_by_set(
|
||||
def write_heads_by_set(destination_path: Path, rows: Sequence[dict]) -> None:
|
||||
"""Écrit le CSV intermédiaire listant les têtes de minifigs par set."""
|
||||
ensure_parent_dir(destination_path)
|
||||
fieldnames = ["set_num", "part_num", "part_name"]
|
||||
fieldnames = ["set_num", "part_num", "known_character", "fig_num"]
|
||||
with destination_path.open("w", newline="") as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
@ -72,11 +255,29 @@ def write_heads_by_set(destination_path: Path, rows: Sequence[dict]) -> None:
|
||||
def build_minifigs_by_set(
    parts_filtered_path: Path,
    parts_catalog_path: Path,
    inventories_path: Path,
    inventory_parts_path: Path,
    inventory_minifigs_path: Path,
    minifigs_path: Path,
    aliases_path: Path,
    destination_path: Path,
) -> None:
    """Build the CSV listing the minifig heads present in each set.

    Loads the filtered parts, the parts and minifig catalogues, the latest
    inventories and the alias table, aggregates the heads per set, and writes
    the result to ``destination_path``.
    """
    parts_rows = load_parts_filtered(parts_filtered_path)
    parts_catalog = load_parts_catalog(parts_catalog_path)
    head_parts = select_head_parts(parts_catalog)
    # The earlier three-argument call to aggregate_heads_by_set was removed:
    # the function now needs the minifig lookups and aliases built below.
    latest_inventories = select_latest_inventories(inventories_path)
    minifig_catalog = load_minifig_catalog(minifigs_path)
    minifig_heads = build_minifig_heads_lookup(minifig_catalog, latest_inventories, inventory_parts_path, head_parts)
    set_minifigs = build_set_minifigs_lookup(latest_inventories, inventory_minifigs_path)
    aliases = load_aliases(aliases_path)
    heads = aggregate_heads_by_set(
        parts_rows,
        parts_catalog,
        head_parts,
        set_minifigs,
        minifig_heads,
        minifig_catalog,
        aliases,
    )
    write_heads_by_set(destination_path, heads)
|
||||
|
||||
@ -7,6 +7,11 @@ from lib.rebrickable.minifigs_by_set import build_minifigs_by_set
|
||||
|
||||
PARTS_FILTERED_PATH = Path("data/intermediate/parts_filtered.csv")
|
||||
PARTS_CATALOG_PATH = Path("data/raw/parts.csv")
|
||||
INVENTORIES_PATH = Path("data/raw/inventories.csv")
|
||||
INVENTORY_PARTS_PATH = Path("data/raw/inventory_parts.csv")
|
||||
INVENTORY_MINIFIGS_PATH = Path("data/raw/inventory_minifigs.csv")
|
||||
MINIFIGS_PATH = Path("data/raw/minifigs.csv")
|
||||
ALIASES_PATH = Path("config/known_character_aliases.csv")
|
||||
DESTINATION_PATH = Path("data/intermediate/minifigs_by_set.csv")
|
||||
|
||||
|
||||
@ -15,6 +20,11 @@ def main() -> None:
|
||||
build_minifigs_by_set(
|
||||
PARTS_FILTERED_PATH,
|
||||
PARTS_CATALOG_PATH,
|
||||
INVENTORIES_PATH,
|
||||
INVENTORY_PARTS_PATH,
|
||||
INVENTORY_MINIFIGS_PATH,
|
||||
MINIFIGS_PATH,
|
||||
ALIASES_PATH,
|
||||
DESTINATION_PATH,
|
||||
)
|
||||
|
||||
|
||||
@ -20,27 +20,73 @@ def test_build_minifigs_by_set_filters_spares_and_deduplicates(tmp_path) -> None
|
||||
"head-b,ffffff,false,123-1,123,2020,2,false,true\n"
|
||||
"head-b,ffffff,false,123-1,123,2020,1,true,true\n"
|
||||
"head-b,ffffff,false,124-1,124,2021,1,false,true\n"
|
||||
"head-c,ffffff,false,123-1,123,2020,1,false,true\n"
|
||||
"other,000000,false,123-1,123,2020,1,false,false\n",
|
||||
)
|
||||
parts_catalog_path = tmp_path / "parts.csv"
|
||||
write_csv(
|
||||
parts_catalog_path,
|
||||
"part_num,name,part_cat_id\n"
|
||||
"head-a,Head A,59\n"
|
||||
"head-b,Head B,59\n"
|
||||
"head-a,Minifig Head Owen Grady,59\n"
|
||||
"head-b,\"Minifig Head, Eyebrows\",59\n"
|
||||
"head-c,Minifig Head Unknown,59\n"
|
||||
"other,Other,1\n",
|
||||
)
|
||||
inventories_path = tmp_path / "inventories.csv"
|
||||
write_csv(
|
||||
inventories_path,
|
||||
"id,version,set_num\n"
|
||||
"1,1,123-1\n"
|
||||
"2,1,124-1\n"
|
||||
"10,1,fig-owen\n"
|
||||
"11,1,fig-guard\n",
|
||||
)
|
||||
inventory_parts_path = tmp_path / "inventory_parts.csv"
|
||||
write_csv(
|
||||
inventory_parts_path,
|
||||
"inventory_id,part_num,color_id,quantity,is_spare\n"
|
||||
"10,head-a,1,1,false\n"
|
||||
"10,other,1,1,false\n"
|
||||
"11,head-b,1,1,false\n",
|
||||
)
|
||||
inventory_minifigs_path = tmp_path / "inventory_minifigs.csv"
|
||||
write_csv(
|
||||
inventory_minifigs_path,
|
||||
"inventory_id,fig_num,quantity\n"
|
||||
"1,fig-owen,1\n"
|
||||
"1,fig-guard,1\n"
|
||||
"2,fig-owen,1\n"
|
||||
"2,fig-guard,1\n",
|
||||
)
|
||||
minifigs_path = tmp_path / "minifigs.csv"
|
||||
write_csv(
|
||||
minifigs_path,
|
||||
"fig_num,name\n"
|
||||
"fig-owen,Owen Grady with Backpack\n"
|
||||
"fig-guard,Guard in Helmet with Trans-Brown Visor, Female\n",
|
||||
)
|
||||
aliases_path = tmp_path / "known_character_aliases.csv"
|
||||
write_csv(
|
||||
aliases_path,
|
||||
"alias,canonical\n"
|
||||
"Guard in Helmet with Trans-Brown Visor,Figurant\n",
|
||||
)
|
||||
destination_path = tmp_path / "minifigs_by_set.csv"
|
||||
|
||||
build_minifigs_by_set(
|
||||
parts_filtered_path,
|
||||
parts_catalog_path,
|
||||
inventories_path,
|
||||
inventory_parts_path,
|
||||
inventory_minifigs_path,
|
||||
minifigs_path,
|
||||
aliases_path,
|
||||
destination_path,
|
||||
)
|
||||
|
||||
assert destination_path.read_text() == (
|
||||
"set_num,part_num,part_name\n"
|
||||
"123-1,head-a,Head A\n"
|
||||
"123-1,head-b,Head B\n"
|
||||
"124-1,head-b,Head B\n"
|
||||
"set_num,part_num,known_character,fig_num\n"
|
||||
"123-1,head-a,Owen Grady,fig-owen\n"
|
||||
"123-1,head-b,Figurant,fig-guard\n"
|
||||
"124-1,head-b,Figurant,fig-guard\n"
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user