1

Exclure les pièces de minifigs et intégrer les visuels de rareté

This commit is contained in:
Richard Dern 2025-12-03 11:34:05 +01:00
parent 6dc1f1cac5
commit 39af0d3a8b
8 changed files with 606 additions and 0 deletions

View File

@ -365,3 +365,19 @@ Le calcul lit `data/intermediate/minifigs_by_set.csv`, `data/raw/parts.csv`, `da
- `data/intermediate/head_reuse.csv` : pour chaque tête observée dans les sets filtrés, le nombre de sets filtrés qui la contiennent, le nombre de sets du reste du catalogue et le total. - `data/intermediate/head_reuse.csv` : pour chaque tête observée dans les sets filtrés, le nombre de sets filtrés qui la contiennent, le nombre de sets du reste du catalogue et le total.
- `figures/step33/head_reuse.png` : bar chart horizontal montrant, par tête, la part filtrée vs le reste du catalogue (têtes exclusives en haut). - `figures/step33/head_reuse.png` : bar chart horizontal montrant, par tête, la part filtrée vs le reste du catalogue (têtes exclusives en haut).
- Les étiquettes affichent aussi l'identifiant de la minifig (`fig-*`) et un astérisque à côté du set (`set_num*`) lorsqu'il est présent dans la collection. - Les étiquettes affichent aussi l'identifiant de la minifig (`fig-*`) et un astérisque à côté du set (`set_num*`) lorsqu'il est présent dans la collection.
### Étape 34 : rareté des pièces (occurrences catalogue)
1. `source .venv/bin/activate`
2. `python -m scripts.compute_part_rarity`
3. `python -m scripts.download_part_rarity_resources`
4. `python -m scripts.plot_part_rarity`
Le calcul lit `data/intermediate/parts_filtered.csv`, `data/raw/parts.csv`, `data/raw/part_categories.csv`, `data/raw/inventories.csv`, `data/raw/inventory_parts.csv` et `data/intermediate/sets_enriched.csv`. Il additionne `quantity_in_set` pour chaque `part_num` des sets filtrés (rechanges incluses), ignore les catégories animales, stickers et pièces de minifigs (`28`, `58`, `74`, `75`, `13`, `27`, `59`, `60`, `61`, `65`, `70`, `71`, `72`, `73` de `part_categories.csv`, ainsi que `is_minifig_part=true`) pour écarter les moules de dinosaures, les planches d'autocollants et les pièces de figurines, puis compte les occurrences restantes dans le reste du catalogue. Les sorties sont :
- `data/intermediate/part_rarity.csv` : classement complet des pièces avec leurs occurrences filtrées, catalogue et part filtrée.
- `data/intermediate/part_rarity_exclusive.csv` : toutes les pièces exclusives aux sets filtrés, suivies de la première pièce réutilisée ailleurs (pour visualiser la bascule entre exclusivité et réemploi).
Le téléchargement s'appuie sur `REBRICKABLE_TOKEN` et place les visuels des pièces dans `figures/rebrickable/{set_id}/rare_parts/{part_num}.jpg`, en journalisant les manques dans `data/intermediate/part_rarity_download_log.csv`.
Le tracé `figures/step34/part_rarity.png` juxtapose, pour chaque pièce de `part_rarity_exclusive.csv`, les occurrences dans les sets filtrés vs le reste du catalogue avec les images incrustées.

86
lib/plots/part_rarity.py Normal file
View File

@ -0,0 +1,86 @@
"""Visualisation des pièces les plus rares observées dans les sets filtrés."""
import csv
from pathlib import Path
from typing import List
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from PIL import Image
from lib.filesystem import ensure_parent_dir
def load_part_rarity(path: Path) -> List[dict]:
    """Read the rare-parts CSV and return its rows as dictionaries."""
    with path.open() as csv_file:
        return list(csv.DictReader(csv_file))
def format_label(row: dict) -> str:
    """Build the y-axis tick label for a part: reference, dash, catalog name.

    Without a separator the reference and the name run together
    (e.g. "3001Brick 2x4"); the em-dash keeps chart labels readable.
    """
    return f"{row['part_num']} — {row['part_name']}"
def load_part_image(row: dict, resources_dir: Path) -> Image.Image | None:
    """Return the downloaded image for a part, or None when it is missing."""
    image_path = (
        resources_dir / row["sample_set_id"] / "rare_parts" / f"{row['part_num']}.jpg"
    )
    return Image.open(image_path) if image_path.exists() else None
def plot_part_rarity(
    path: Path,
    destination_path: Path,
    resources_dir: Path = Path("figures/rebrickable"),
    show_images: bool = True,
) -> None:
    """Render a horizontal stacked bar chart of the rarest parts.

    Args:
        path: rarity CSV (one row per part, as written by the computation step).
        destination_path: PNG file to create; parent directories are ensured.
        resources_dir: root of the downloaded part images, laid out as
            ``{sample_set_id}/rare_parts/{part_num}.jpg``.
        show_images: when True, inlay each part's picture left of its bar.
    """
    rows = load_part_rarity(path)
    # The whole file is plotted; alias kept for readability below.
    selected = rows
    labels = [format_label(row) for row in selected]
    filtered_counts = [int(row["filtered_quantity"]) for row in selected]
    other_counts = [int(row["other_sets_quantity"]) for row in selected]
    positions = list(range(len(selected)))
    # Figure height scales with the number of bars so labels stay legible.
    fig, ax = plt.subplots(figsize=(13, 0.55 * len(selected) + 1.4))
    ax.barh(positions, filtered_counts, color="#1f78b4", label="Sets filtrés")
    # Stacked segment: catalog occurrences start where the filtered bar ends.
    ax.barh(positions, other_counts, left=filtered_counts, color="#b2df8a", label="Autres sets")
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel("Occurrences de la pièce (rechanges incluses)")
    ax.grid(axis="x", linestyle="--", alpha=0.4)
    ax.legend()
    if show_images:
        # Reserve negative x-space to the left of the bars to host thumbnails.
        max_count = max((f + o) for f, o in zip(filtered_counts, other_counts)) if selected else 0
        pad = max_count * 0.15 if max_count > 0 else 1.0
        ax.set_xlim(left=-pad, right=max_count + pad * 0.3)
        for row, pos in zip(selected, positions):
            image = load_part_image(row, resources_dir)
            if image is None:
                # Download missing for this part: keep the bar, skip the image.
                continue
            target_height = 28  # thumbnail height in pixels
            ratio = target_height / image.height
            resized = image.resize((int(image.width * ratio), target_height))
            imagebox = OffsetImage(resized)
            ab = AnnotationBbox(
                imagebox,
                (-pad * 0.45, pos),  # centered within the reserved left margin
                xycoords=("data", "data"),
                box_alignment=(0.5, 0.5),
                frameon=False,
            )
            ax.add_artist(ab)
    # NOTE(review): tight_layout() recomputes subplot margins and likely
    # overrides the subplots_adjust(left=0.42) above — confirm which should win.
    fig.subplots_adjust(left=0.42)
    fig.tight_layout()
    ensure_parent_dir(destination_path)
    fig.savefig(destination_path, dpi=150)
    plt.close(fig)

View File

@ -0,0 +1,176 @@
"""Mesure la rareté des pièces présentes dans les sets filtrés."""
import csv
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Set
from lib.filesystem import ensure_parent_dir
from lib.rebrickable.parts_inventory import index_inventory_parts_by_inventory, select_latest_inventories
from lib.rebrickable.stats import read_rows
# Categories excluded from rarity stats: animals/creatures and sticker sheets
# (ids from data/raw/part_categories.csv, e.g. 28 = Animals / Creatures,
# 58 = Stickers, 75 = Animal / Creature Body Parts).
IGNORED_PART_CATEGORY_IDS = {"28", "58", "74", "75"}
# Categories holding minifig components, also excluded so figure pieces
# do not skew the part-rarity ranking.
MINIFIG_PART_CATEGORY_IDS = {"13", "27", "59", "60", "61", "65", "70", "71", "72", "73"}
def load_parts_catalog(path: Path) -> Dict[str, dict]:
    """Index catalog rows by part reference (``part_num``)."""
    with path.open() as csv_file:
        return {row["part_num"]: row for row in csv.DictReader(csv_file)}
def load_part_categories(path: Path) -> Dict[str, str]:
    """Map each category identifier to its display name."""
    with path.open() as csv_file:
        reader = csv.DictReader(csv_file)
        return {row["id"]: row["name"] for row in reader}
def load_filtered_sets(path: Path) -> Dict[str, dict]:
    """Map each filtered set number to its metadata row."""
    return {row["set_num"]: row for row in read_rows(path)}
def aggregate_filtered_parts(
    rows: Iterable[dict],
    parts_catalog: Dict[str, dict],
    ignored_categories: Set[str] = IGNORED_PART_CATEGORY_IDS,
    ignored_minifig_categories: Set[str] = MINIFIG_PART_CATEGORY_IDS,
) -> Dict[str, dict]:
    """Sum per-part quantities across the filtered sets, spares included.

    Rows flagged as minifig parts, and parts whose category belongs to either
    exclusion set, are skipped. Returns a mapping from ``part_num`` to a dict
    with the cumulated ``quantity`` and the ``set_numbers`` that contain it.
    """
    totals: Dict[str, dict] = {}
    for row in rows:
        # Minifig components are tracked elsewhere and excluded here.
        if row["is_minifig_part"] == "true":
            continue
        part_num = row["part_num"]
        category_id = parts_catalog[part_num]["part_cat_id"]
        if category_id in ignored_categories or category_id in ignored_minifig_categories:
            continue
        bucket = totals.setdefault(part_num, {"quantity": 0, "set_numbers": set()})
        bucket["quantity"] += int(row["quantity_in_set"])
        bucket["set_numbers"].add(row["set_num"])
    return totals
def compute_other_set_usage(
    inventories_path: Path,
    inventory_parts_path: Path,
    parts_catalog: Dict[str, dict],
    filtered_set_numbers: Set[str],
    ignored_categories: Set[str] = IGNORED_PART_CATEGORY_IDS,
    ignored_minifig_categories: Set[str] = MINIFIG_PART_CATEGORY_IDS,
) -> Dict[str, int]:
    """Count part occurrences in the rest of the catalog, spares included.

    Only the latest inventory of each set is considered, and sets belonging
    to ``filtered_set_numbers`` are skipped (they are counted separately).
    """
    latest_inventories = select_latest_inventories(inventories_path)
    parts_by_inventory = index_inventory_parts_by_inventory(inventory_parts_path)
    totals: Dict[str, int] = {}
    for set_num, inventory in latest_inventories.items():
        if set_num in filtered_set_numbers:
            continue
        for part_row in parts_by_inventory.get(inventory["id"], []):
            part_num = part_row["part_num"]
            category_id = parts_catalog[part_num]["part_cat_id"]
            if category_id in ignored_categories:
                continue
            if category_id in ignored_minifig_categories:
                continue
            totals[part_num] = totals.get(part_num, 0) + int(part_row["quantity"])
    return totals
def build_part_rarity(
    parts_filtered_path: Path,
    inventories_path: Path,
    inventory_parts_path: Path,
    parts_catalog_path: Path,
    part_categories_path: Path,
    filtered_sets_path: Path,
) -> List[dict]:
    """Rank the filtered parts by rarity against the rest of the catalog.

    Returns one row per part, sorted so that parts never seen outside the
    filtered sets come first (ties broken by catalog total, then part_num).
    All numeric fields are serialized as strings, ready for CSV output.
    """
    parts_catalog = load_parts_catalog(parts_catalog_path)
    category_names = load_part_categories(part_categories_path)
    filtered_sets = load_filtered_sets(filtered_sets_path)
    filtered_usage = aggregate_filtered_parts(read_rows(parts_filtered_path), parts_catalog)
    other_usage = compute_other_set_usage(
        inventories_path,
        inventory_parts_path,
        parts_catalog,
        set(filtered_sets),
    )
    ranking: List[dict] = []
    for part_num, entry in filtered_usage.items():
        part = parts_catalog[part_num]
        other_quantity = other_usage.get(part_num, 0)
        total_quantity = entry["quantity"] + other_quantity
        # Deterministic sample: the lowest set number containing the part.
        sample_set_num = min(entry["set_numbers"])
        ranking.append(
            {
                "part_num": part_num,
                "part_name": part["name"],
                "part_cat_id": part["part_cat_id"],
                "part_category": category_names[part["part_cat_id"]],
                "sample_set_num": sample_set_num,
                "sample_set_id": filtered_sets[sample_set_num]["set_id"],
                "filtered_quantity": str(entry["quantity"]),
                "filtered_set_count": str(len(entry["set_numbers"])),
                "other_sets_quantity": str(other_quantity),
                "catalog_total_quantity": str(total_quantity),
                "filtered_share": f"{entry['quantity'] / total_quantity:.4f}",
            }
        )
    ranking.sort(
        key=lambda item: (
            int(item["other_sets_quantity"]),
            int(item["catalog_total_quantity"]),
            item["part_num"],
        )
    )
    return ranking
def write_part_rarity(destination_path: Path, rows: Sequence[dict]) -> None:
    """Serialize the rarity ranking to CSV at *destination_path*."""
    ensure_parent_dir(destination_path)
    fieldnames = [
        "part_num",
        "part_name",
        "part_cat_id",
        "part_category",
        "sample_set_num",
        "sample_set_id",
        "filtered_quantity",
        "filtered_set_count",
        "other_sets_quantity",
        "catalog_total_quantity",
        "filtered_share",
    ]
    with destination_path.open("w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
def select_until_reused(rows: Sequence[dict]) -> List[dict]:
    """Keep the exclusive parts plus the first part reused in other sets.

    Assumes *rows* is sorted by rarity: everything up to and including the
    first row with a non-zero catalog usage is retained.
    """
    for index, row in enumerate(rows):
        if int(row["other_sets_quantity"]) > 0:
            return list(rows[: index + 1])
    return list(rows)
def load_part_rarity(path: Path) -> List[dict]:
    """Load the part-rarity CSV (thin wrapper around ``read_rows``)."""
    return read_rows(path)

View File

@ -0,0 +1,34 @@
"""Calcule les pièces rares en comparant les sets filtrés au reste du catalogue."""
from pathlib import Path
from lib.rebrickable.part_rarity import build_part_rarity, select_until_reused, write_part_rarity
PARTS_FILTERED_PATH = Path("data/intermediate/parts_filtered.csv")
INVENTORIES_PATH = Path("data/raw/inventories.csv")
INVENTORY_PARTS_PATH = Path("data/raw/inventory_parts.csv")
PARTS_CATALOG_PATH = Path("data/raw/parts.csv")
PART_CATEGORIES_PATH = Path("data/raw/part_categories.csv")
FILTERED_SETS_PATH = Path("data/intermediate/sets_enriched.csv")
# Full ranking of every filtered part.
DESTINATION_PATH = Path("data/intermediate/part_rarity.csv")
# Exclusive parts plus the first reused one (see select_until_reused).
TOP_DESTINATION_PATH = Path("data/intermediate/part_rarity_exclusive.csv")


def main() -> None:
    """Build the full rarity CSV and the exclusive-parts CSV.

    The second output is not a fixed top-10: it keeps every part exclusive
    to the filtered sets, followed by the first part reused in other
    catalog sets (select_until_reused).
    """
    rows = build_part_rarity(
        PARTS_FILTERED_PATH,
        INVENTORIES_PATH,
        INVENTORY_PARTS_PATH,
        PARTS_CATALOG_PATH,
        PART_CATEGORIES_PATH,
        FILTERED_SETS_PATH,
    )
    write_part_rarity(DESTINATION_PATH, rows)
    top_rows = select_until_reused(rows)
    write_part_rarity(TOP_DESTINATION_PATH, top_rows)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,63 @@
"""Télécharge les visuels des pièces les plus rares identifiées à l'étape 34."""
import os
from pathlib import Path
import requests
from dotenv import load_dotenv
from lib.rebrickable.part_rarity import load_part_rarity
from lib.rebrickable.resources import (
build_part_img_lookup,
download_binary,
download_resources,
fetch_part_img_url,
load_part_img_cache,
persist_part_img_cache,
)
# Exclusive rare parts produced by scripts.compute_part_rarity (step 34).
PART_RARITY_TOP_PATH = Path("data/intermediate/part_rarity_exclusive.csv")
RESOURCES_DIR = Path("figures/rebrickable")
# part_num -> image URL cache, persisted between runs to limit API calls.
PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
# Failed/missing downloads are journaled here.
DOWNLOAD_LOG_PATH = Path("data/intermediate/part_rarity_download_log.csv")
# Throttling delays (seconds) between Rebrickable requests.
REQUEST_DELAY_SECONDS_IMAGES = 0.35
REQUEST_DELAY_SECONDS_LOOKUP = 0.6


def main() -> None:
    """Resolve the rare parts' image URLs and download the files.

    Requires REBRICKABLE_TOKEN in the environment (loaded from .env when
    present; raises KeyError otherwise). Images are written under
    figures/rebrickable/{set_id}/rare_parts/{part_num}.jpg.
    """
    load_dotenv()
    token = os.environ["REBRICKABLE_TOKEN"]
    session = requests.Session()
    rows = load_part_rarity(PART_RARITY_TOP_PATH)
    cache = load_part_img_cache(PART_IMG_CACHE_PATH)
    part_img_lookup = build_part_img_lookup(
        {row["part_num"] for row in rows},
        fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
        cache_path=PART_IMG_CACHE_PATH,
        existing_cache=cache,
        delay_seconds=REQUEST_DELAY_SECONDS_LOOKUP,
    )
    # NOTE(review): re-applying the pre-existing cache after the lookup lets
    # cached URLs take precedence over freshly fetched ones, even though the
    # lookup already received existing_cache — confirm this is intended.
    if cache:
        part_img_lookup.update(cache)
    persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)
    # One download entry per rare part, stored under its sample set's folder.
    plan = [
        {
            "url": part_img_lookup[row["part_num"]],
            "path": RESOURCES_DIR / row["sample_set_id"] / "rare_parts" / f"{row['part_num']}.jpg",
        }
        for row in rows
    ]
    download_resources(
        plan,
        downloader=lambda url, path: download_binary(url, path, session),
        delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
        log_path=DOWNLOAD_LOG_PATH,
    )


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,19 @@
"""Trace le diagramme des pièces rares pour l'étape 34."""
from pathlib import Path
from lib.plots.part_rarity import plot_part_rarity
# Chart input: exclusive rare parts computed at step 34.
PART_RARITY_TOP_PATH = Path("data/intermediate/part_rarity_exclusive.csv")
DESTINATION_PATH = Path("figures/step34/part_rarity.png")
# Root of the downloaded part images.
RESOURCES_DIR = Path("figures/rebrickable")


def main() -> None:
    """Load the exclusive rare parts and render the illustrated bar chart."""
    plot_part_rarity(PART_RARITY_TOP_PATH, DESTINATION_PATH, resources_dir=RESOURCES_DIR)


if __name__ == "__main__":
    main()

181
tests/test_part_rarity.py Normal file
View File

@ -0,0 +1,181 @@
"""Tests du calcul de rareté des pièces."""
import csv
from pathlib import Path
from lib.rebrickable.part_rarity import build_part_rarity, select_until_reused, write_part_rarity
def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
    """Write a small CSV fixture (header line plus data rows) for the tests."""
    with path.open("w", newline="") as csv_file:
        csv.writer(csv_file).writerows([headers, *rows])
def test_build_part_rarity_counts_spares_and_ignores_categories(tmp_path: Path) -> None:
    """Count part quantities (spares included) and skip excluded categories.

    Fixture roles:
    - p1: regular brick, present in both filtered sets (2 regular + 1 spare)
      and in catalog set 1000-1's inventory, which must be skipped since
      1000-1 is itself a filtered set.
    - p2: category 28 (animals) -> excluded on the filtered side.
    - p4: category 41 (buildable figures) -> counted, reused in 4000-1.
    - p5: category 58 (stickers) -> excluded on the catalog side.
    - p6: only in a filtered set -> exclusive, ranked first.
    """
    parts_filtered = tmp_path / "parts_filtered.csv"
    write_csv(
        parts_filtered,
        [
            "part_num",
            "color_rgb",
            "is_translucent",
            "set_num",
            "set_id",
            "year",
            "quantity_in_set",
            "is_spare",
            "is_minifig_part",
        ],
        [
            # p1 twice: 2 regular + 1 spare -> filtered_quantity of 3.
            ["p1", "AAAAAA", "false", "1000-1", "1000", "2020", "2", "false", "false"],
            ["p1", "AAAAAA", "false", "2000-1", "2000", "2021", "1", "true", "false"],
            ["p2", "BBBBBB", "false", "1000-1", "1000", "2020", "5", "false", "false"],
            ["p4", "CCCCCC", "false", "2000-1", "2000", "2021", "2", "false", "false"],
            ["p6", "DDDDDD", "false", "2000-1", "2000", "2021", "1", "false", "false"],
        ],
    )
    sets_enriched = tmp_path / "sets_enriched.csv"
    write_csv(
        sets_enriched,
        ["set_num", "name", "year", "theme_id", "num_parts", "img_url", "set_id", "rebrickable_url", "in_collection"],
        [
            ["1000-1", "Set A", "2020", "1", "10", "http://example.com", "1000", "http://example.com", "false"],
            ["2000-1", "Set B", "2021", "1", "10", "http://example.com", "2000", "http://example.com", "false"],
        ],
    )
    parts_catalog = tmp_path / "parts.csv"
    write_csv(
        parts_catalog,
        ["part_num", "name", "part_cat_id", "part_material"],
        [
            ["p1", "Brick 1x1", "1", "Plastic"],
            ["p2", "Baby Dino", "28", "Plastic"],
            ["p3", "Raptor Body", "75", "Plastic"],
            ["p4", "Figure Limb", "41", "Plastic"],
            ["p5", "Sticker Sheet", "58", "Plastic"],
            ["p6", "Exclusive Tile", "1", "Plastic"],
        ],
    )
    part_categories = tmp_path / "part_categories.csv"
    write_csv(
        part_categories,
        ["id", "name"],
        [
            ["1", "Bricks"],
            ["28", "Animals / Creatures"],
            ["41", "Large Buildable Figures"],
            ["75", "Animal / Creature Body Parts"],
            ["58", "Stickers"],
        ],
    )
    inventories = tmp_path / "inventories.csv"
    write_csv(
        inventories,
        ["id", "version", "set_num"],
        [
            # Two versions for 3000-1: only the latest (id 2) may be counted.
            ["1", "1", "3000-1"],
            ["2", "2", "3000-1"],
            ["3", "1", "4000-1"],
            # Inventory of a filtered set: must be excluded from catalog usage.
            ["4", "1", "1000-1"],
            ["5", "1", "5000-1"],
        ],
    )
    inventory_parts = tmp_path / "inventory_parts.csv"
    write_csv(
        inventory_parts,
        ["inventory_id", "part_num", "color_id", "quantity", "is_spare", "img_url"],
        [
            ["1", "p1", "1", "1", "False", ""],
            ["2", "p1", "1", "3", "False", ""],
            ["2", "p2", "1", "2", "False", ""],
            ["3", "p4", "1", "4", "True", ""],
            ["4", "p1", "1", "8", "False", ""],
            ["5", "p5", "1", "9", "False", ""],
        ],
    )
    rows = build_part_rarity(
        parts_filtered,
        inventories,
        inventory_parts,
        parts_catalog,
        part_categories,
        sets_enriched,
    )
    # Sorted by (other_sets_quantity, catalog_total_quantity, part_num):
    # exclusive p6 first, then p1 and p4.
    assert rows == [
        {
            "part_num": "p6",
            "part_name": "Exclusive Tile",
            "part_cat_id": "1",
            "part_category": "Bricks",
            "sample_set_num": "2000-1",
            "sample_set_id": "2000",
            "filtered_quantity": "1",
            "filtered_set_count": "1",
            "other_sets_quantity": "0",
            "catalog_total_quantity": "1",
            "filtered_share": "1.0000",
        },
        {
            "part_num": "p1",
            "part_name": "Brick 1x1",
            "part_cat_id": "1",
            "part_category": "Bricks",
            "sample_set_num": "1000-1",
            "sample_set_id": "1000",
            "filtered_quantity": "3",
            "filtered_set_count": "2",
            "other_sets_quantity": "3",
            "catalog_total_quantity": "6",
            "filtered_share": "0.5000",
        },
        {
            "part_num": "p4",
            "part_name": "Figure Limb",
            "part_cat_id": "41",
            "part_category": "Large Buildable Figures",
            "sample_set_num": "2000-1",
            "sample_set_id": "2000",
            "filtered_quantity": "2",
            "filtered_set_count": "1",
            "other_sets_quantity": "4",
            "catalog_total_quantity": "6",
            "filtered_share": "0.3333",
        },
    ]
    # The exclusive part plus the first part reused elsewhere.
    assert select_until_reused(rows) == [rows[0], rows[1]]
def test_write_part_rarity_outputs_csv(tmp_path: Path) -> None:
    """Serialize one rarity row and check the CSV header and data line."""
    destination = tmp_path / "part_rarity.csv"
    rows = [
        {
            "part_num": "p1",
            "part_name": "Brick 1x1",
            "part_cat_id": "1",
            "part_category": "Bricks",
            "sample_set_num": "123-1",
            "sample_set_id": "123",
            "filtered_quantity": "3",
            "filtered_set_count": "2",
            "other_sets_quantity": "3",
            "catalog_total_quantity": "6",
            "filtered_share": "0.5000",
        }
    ]
    write_part_rarity(destination, rows)
    assert destination.exists()
    content = destination.read_text().strip().splitlines()
    # Header order must match the fieldnames declared in write_part_rarity.
    assert content[0] == (
        "part_num,part_name,part_cat_id,part_category,sample_set_num,sample_set_id,filtered_quantity,filtered_set_count,"
        "other_sets_quantity,catalog_total_quantity,filtered_share"
    )
    assert content[1] == "p1,Brick 1x1,1,Bricks,123-1,123,3,2,3,6,0.5000"

View File

@ -0,0 +1,31 @@
"""Tests des visualisations de rareté des pièces."""
import matplotlib
from pathlib import Path
from PIL import Image
from lib.plots.part_rarity import plot_part_rarity
# Select a headless backend so the test renders without a display.
matplotlib.use("Agg")


def test_plot_part_rarity_with_images(tmp_path: Path) -> None:
    """Render the rare-parts chart with inlaid part images into a tmp dir."""
    data_path = tmp_path / "part_rarity_exclusive.csv"
    destination = tmp_path / "figures" / "step34" / "part_rarity.png"
    resources_dir = tmp_path / "figures" / "rebrickable"
    # One image per sample set, mirroring the {set_id}/rare_parts/ layout.
    (resources_dir / "1000" / "rare_parts").mkdir(parents=True)
    (resources_dir / "2000" / "rare_parts").mkdir(parents=True)
    Image.new("RGB", (50, 50), color=(255, 0, 0)).save(resources_dir / "1000" / "rare_parts" / "p1.jpg")
    Image.new("RGB", (50, 50), color=(0, 255, 0)).save(resources_dir / "2000" / "rare_parts" / "p2.jpg")
    data_path.write_text(
        "part_num,part_name,part_cat_id,part_category,sample_set_num,sample_set_id,filtered_quantity,filtered_set_count,other_sets_quantity,catalog_total_quantity,filtered_share\n"
        "p1,Brick 1x1,1,Bricks,1000-1,1000,3,2,0,3,1.0000\n"
        "p2,Plate 1x2,1,Bricks,2000-1,2000,2,1,1,3,0.6667\n"
    )
    plot_part_rarity(data_path, destination, resources_dir=resources_dir, show_images=True)
    # The chart file must exist and be non-empty.
    assert destination.exists()
    assert destination.stat().st_size > 0