From fd5db106930e822d6b009a98a1bf36f2d79464c7 Mon Sep 17 00:00:00 2001
From: Richard Dern
Date: Tue, 2 Dec 2025 21:19:30 +0100
Subject: [PATCH] Make resource downloads more patient and cached
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                     |  14 +++
 lib/rebrickable/resources.py  | 201 ++++++++++++++++++++++++++++++++++
 scripts/download_resources.py |  72 ++++++++++++
 tests/test_resources.py       | 108 ++++++++++++++++++
 4 files changed, 395 insertions(+)
 create mode 100644 lib/rebrickable/resources.py
 create mode 100644 scripts/download_resources.py
 create mode 100644 tests/test_resources.py

diff --git a/README.md b/README.md
index bae399b..c9d663b 100644
--- a/README.md
+++ b/README.md
@@ -327,3 +327,17 @@ Le calcul lit `data/intermediate/minifigs_by_set.csv`, `data/raw/parts.csv` et `
 - `data/intermediate/minifig_head_faces_by_character.csv` : volume et part par personnage.
 
 Les visuels `figures/step30/minifig_head_faces_timeline.png`, `figures/step30/minifig_head_faces_top_sets.png` et `figures/step30/minifig_head_faces_characters.png` montrent respectivement la trajectoire annuelle, le top des sets concernés et les personnages dotés de têtes dual-face.
+
+### Step 31: download the visual resources (sets, minifigs, heads)
+
+1. Set `REBRICKABLE_TOKEN` in `.env` (Rebrickable API key).
+2. `source .venv/bin/activate`
+3. `python -m scripts.download_resources`
+
+The script reads `data/intermediate/sets_enriched.csv`, `data/intermediate/minifigs_by_set.csv` and `data/raw/minifigs.csv`. It fetches the head image URLs from the Rebrickable API endpoint `/api/v3/lego/parts/{part_num}/`, adds a `part_img_url` column to `data/intermediate/minifigs_by_set.csv`, then downloads, under `figures/rebrickable/`:
+
+- `{set_id}/set.jpg`: the set visual (`img_url`).
+- `{set_id}/{known_character}/minifig.jpg`: the full minifig visual.
+- `{set_id}/{known_character}/head.jpg`: the matching head visual.
+
+API requests are de-duplicated, spaced out (fair use) and cached in `data/intermediate/part_img_cache.csv` so an interrupted run can resume. Images that have already been downloaded are reused locally to avoid repeated requests.
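
For reference, here is a minimal sketch of the API call and fair-use back-off described above. It only uses the endpoint, the `Authorization: key <token>` header and the `part_img_url` response field that the new module relies on; the part number, retry count and delay values are illustrative:

    import time

    import requests

    def head_image_url(part_num: str, token: str) -> str:
        """Illustrative sketch: fetch part_img_url for one part, backing off on HTTP 429."""
        delay = 2.0
        for _ in range(5):  # a handful of retries, then give up
            response = requests.get(
                f"https://rebrickable.com/api/v3/lego/parts/{part_num}/",
                headers={"Authorization": f"key {token}"},
            )
            if response.status_code == 429:  # rate limited: wait, then try again
                time.sleep(delay)
                delay = min(delay * 1.5, 10.0)
                continue
            response.raise_for_status()
            return response.json()["part_img_url"]
        raise RuntimeError("rate limited too many times")

    # head_image_url("3626cpb01", "YOUR_TOKEN")  # hypothetical part number and token
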
diff --git a/lib/rebrickable/resources.py b/lib/rebrickable/resources.py
new file mode 100644
index 0000000..5de4721
--- /dev/null
+++ b/lib/rebrickable/resources.py
@@ -0,0 +1,201 @@
+"""Download the visual resources (sets, minifigs, heads) and enrich the image URLs."""
+
+import os
+import time
+import re
+import csv
+import shutil
+from pathlib import Path
+from typing import Callable, Dict, Iterable, List, Sequence
+
+import requests
+
+from lib.filesystem import ensure_parent_dir
+from lib.rebrickable.stats import read_rows
+
+
+def load_sets_enriched(path: Path) -> List[dict]:
+    """Load the enriched sets to access the set image URLs."""
+    return read_rows(path)
+
+
+def load_minifigs_by_set(path: Path) -> List[dict]:
+    """Load minifigs_by_set.csv into memory."""
+    return read_rows(path)
+
+
+def load_minifigs_catalog(path: Path) -> Dict[str, dict]:
+    """Index the minifigs by identifier."""
+    catalog: Dict[str, dict] = {}
+    with path.open() as csv_file:
+        reader = csv.DictReader(csv_file)
+        for row in reader:
+            catalog[row["fig_num"]] = row
+    return catalog
+
+
+def fetch_part_img_url(part_num: str, token: str, session: requests.Session) -> str:
+    """Fetch the image URL of a part from the Rebrickable API."""
+    retries = 0
+    backoff = 2.0
+    while True:
+        response = session.get(f"https://rebrickable.com/api/v3/lego/parts/{part_num}/", headers={"Authorization": f"key {token}"})
+        if response.status_code == 429:
+            time.sleep(backoff)
+            retries += 1
+            backoff = min(backoff * 1.5, 10.0)
+            if retries > 8:
+                response.raise_for_status()
+            continue
+        response.raise_for_status()
+        payload = response.json()
+        return payload["part_img_url"]
+
+
+def load_part_img_cache(cache_path: Path) -> Dict[str, str]:
+    """Load the head image URL cache if it exists."""
+    if not cache_path.exists():
+        return {}
+    cache: Dict[str, str] = {}
+    with cache_path.open() as cache_file:
+        reader = csv.DictReader(cache_file)
+        for row in reader:
+            cache[row["part_num"]] = row["part_img_url"]
+    return cache
+
+
+def persist_part_img_cache(cache_path: Path, cache: Dict[str, str]) -> None:
+    """Persist the URL cache so an interrupted run can resume."""
+    ensure_parent_dir(cache_path)
+    with cache_path.open("w", newline="") as cache_file:
+        writer = csv.DictWriter(cache_file, fieldnames=["part_num", "part_img_url"])
+        writer.writeheader()
+        for part_num, url in sorted(cache.items()):
+            writer.writerow({"part_num": part_num, "part_img_url": url})
+
+
+def build_part_img_lookup(
+    part_numbers: Iterable[str],
+    fetcher: Callable[[str], str],
+    cache_path: Path | None = None,
+    existing_cache: Dict[str, str] | None = None,
+    delay_seconds: float = 1.6,
+) -> Dict[str, str]:
+    """Build a part_num -> image URL index while spacing out the requests."""
+    cache = dict(existing_cache or {})
+    unique_parts = sorted(set(part_numbers))
+    for part_num in unique_parts:
+        if part_num in cache:
+            continue
+        cache[part_num] = fetcher(part_num)
+        if cache_path is not None:
+            persist_part_img_cache(cache_path, cache)
+        time.sleep(delay_seconds)
+    return cache
+
+
+def add_part_img_urls(minifigs_rows: Iterable[dict], part_img_lookup: Dict[str, str]) -> List[dict]:
+    """Add part_img_url to the minifigs_by_set rows."""
+    enriched: List[dict] = []
+    for row in minifigs_rows:
+        existing = row.get("part_img_url", "").strip()
+        part_img_url = existing if existing != "" else part_img_lookup[row["part_num"]]
+        enriched.append(
+            {
+                "set_num": row["set_num"],
+                "part_num": row["part_num"],
+                "known_character": row["known_character"],
+                "fig_num": row["fig_num"],
+                "gender": row["gender"],
+                "part_img_url": part_img_url,
+            }
+        )
+    return enriched
+
+
+def write_minifigs_by_set_with_images(destination_path: Path, rows: Sequence[dict]) -> None:
+    """Rewrite the minifigs_by_set CSV with the part_img_url column."""
+    ensure_parent_dir(destination_path)
+    fieldnames = ["set_num", "part_num", "known_character", "fig_num", "gender", "part_img_url"]
+    with destination_path.open("w", newline="") as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+
+
+def sanitize_name(raw_name: str) -> str:
+    """Clean a name so it can be used in a safe file path."""
+    cleaned = re.sub(r"[^A-Za-z0-9]+", "_", raw_name).strip("_")
+    if cleaned == "":
+        return "Unknown"
+    return cleaned
+
+
+def build_download_plan(
+    sets_rows: Iterable[dict],
+    minifigs_rows: Iterable[dict],
+    minifigs_catalog: Dict[str, dict],
+    base_dir: Path,
+) -> List[dict]:
+    """Build the list of files to download (sets, minifigs, heads)."""
+    plan: List[dict] = []
+    sets_list = list(sets_rows)
+    set_ids: Dict[str, str] = {row["set_num"]: row["set_id"] for row in sets_list}
+    for set_row in sets_list:
+        set_dir = base_dir / set_row["set_id"]
+        plan.append({"url": set_row["img_url"], "path": set_dir / "set.jpg"})
+    for row in minifigs_rows:
+        if (row.get("known_character") or "").strip().lower() == "figurant":
+            continue
+        set_dir = base_dir / set_ids[row["set_num"]]
+        character_dir = set_dir / sanitize_name(row["known_character"] or "Unknown")
+        minifig = minifigs_catalog[row["fig_num"]]
+        plan.append({"url": minifig["img_url"], "path": character_dir / "minifig.jpg"})
+        plan.append({"url": row["part_img_url"], "path": character_dir / "head.jpg"})
+    return plan
+
+
+def download_binary(url: str, destination_path: Path, session: requests.Session) -> bool:
+    """Download a binary to the given path. Return False on a 404."""
+    ensure_parent_dir(destination_path)
+    response = session.get(url, stream=True)
+    if response.status_code == 404:
+        return False
+    response.raise_for_status()
+    with destination_path.open("wb") as target_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            target_file.write(chunk)
+    return True
+
+
+def download_resources(
+    plan: Iterable[dict],
+    downloader: Callable[[str, Path], bool],
+    delay_seconds: float = 0.35,
+    log_path: Path | None = None,
+) -> None:
+    """Run the downloads while spacing out the requests, and log the statuses."""
+    cache: Dict[str, Path] = {}
+    log_rows: List[dict] = []
+    for item in plan:
+        if item["path"].exists():
+            time.sleep(delay_seconds)
+            continue
+        if item["url"] in cache and cache[item["url"]].exists():
+            ensure_parent_dir(item["path"])
+            shutil.copy2(cache[item["url"]], item["path"])
+        else:
+            success = downloader(item["url"], item["path"])
+            if success:
+                cache[item["url"]] = item["path"]
+            else:
+                log_rows.append({"url": item["url"], "path": str(item["path"]), "status": "missing"})
+        time.sleep(delay_seconds)
+    if log_path is not None:
+        ensure_parent_dir(log_path)
+        with log_path.open("w", newline="") as csv_file:
+            writer = csv.DictWriter(csv_file, fieldnames=["url", "path", "status"])
+            writer.writeheader()
+            for row in log_rows:
+                writer.writerow(row)
diff --git a/scripts/download_resources.py b/scripts/download_resources.py
new file mode 100644
index 0000000..fec8925
--- /dev/null
+++ b/scripts/download_resources.py
@@ -0,0 +1,72 @@
+"""Download the resources (sets, minifigs, heads) for the filtered sets."""
+
+import os
+from pathlib import Path
+
+import requests
+from dotenv import load_dotenv
+
+from lib.rebrickable.resources import (
+    add_part_img_urls,
+    build_download_plan,
+    build_part_img_lookup,
+    download_binary,
+    download_resources,
+    fetch_part_img_url,
+    load_minifigs_by_set,
+    load_minifigs_catalog,
+    load_sets_enriched,
+    load_part_img_cache,
+    persist_part_img_cache,
+    write_minifigs_by_set_with_images,
+)
+
+
+SETS_PATH = Path("data/intermediate/sets_enriched.csv")
+MINIFIGS_BY_SET_PATH = Path("data/intermediate/minifigs_by_set.csv")
+MINIFIGS_CATALOG_PATH = Path("data/raw/minifigs.csv")
+RESOURCES_DIR = Path("figures/rebrickable")
+REQUEST_DELAY_SECONDS_IMAGES = 0.35
+PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
+DOWNLOAD_LOG_PATH = Path("data/intermediate/resources_download_log.csv")
+
+
+def main() -> None:
+    """Build the missing URLs and download the associated images."""
+    load_dotenv()
+    token = os.environ["REBRICKABLE_TOKEN"]
+    session = requests.Session()
+
+    sets = load_sets_enriched(SETS_PATH)
+    minifigs_by_set = load_minifigs_by_set(MINIFIGS_BY_SET_PATH)
+    minifigs_catalog = load_minifigs_catalog(MINIFIGS_CATALOG_PATH)
+    cache = load_part_img_cache(PART_IMG_CACHE_PATH)
+
+    missing_part_numbers = {
+        row["part_num"]
+        for row in minifigs_by_set
+        if row.get("part_img_url", "").strip() == "" and row["part_num"] not in cache
+    }
+    part_img_lookup = build_part_img_lookup(
+        missing_part_numbers,
+        fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
+        cache_path=PART_IMG_CACHE_PATH,
+        existing_cache=cache,
+    )
+    if cache:
+        part_img_lookup.update(cache)
+    persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)
+    minifigs_with_imgs = add_part_img_urls(minifigs_by_set, part_img_lookup)
+    write_minifigs_by_set_with_images(MINIFIGS_BY_SET_PATH, minifigs_with_imgs)
+
+    plan = build_download_plan(sets, minifigs_with_imgs, minifigs_catalog, RESOURCES_DIR)
+    download_resources(
+        plan,
+        downloader=lambda url, path: download_binary(url, path, session),
+        delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
+        log_path=DOWNLOAD_LOG_PATH,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_resources.py b/tests/test_resources.py
new file mode 100644
index 0000000..a6cda29
--- /dev/null
+++ b/tests/test_resources.py
@@ -0,0 +1,108 @@
+"""Tests for the resource download helpers."""
+
+from pathlib import Path
+
+from lib.rebrickable.resources import (
+    add_part_img_urls,
+    build_download_plan,
+    build_part_img_lookup,
+    download_resources,
+    sanitize_name,
+    write_minifigs_by_set_with_images,
+)
+from lib.filesystem import ensure_parent_dir
+from lib.rebrickable.stats import read_rows
+
+
+def test_build_part_img_lookup_calls_fetcher_once_per_part() -> None:
+    """Build the index, calling the fetcher once per unique part reference."""
+    calls: list[str] = []
+
+    def fetcher(part_num: str) -> str:
+        calls.append(part_num)
+        return f"url-{part_num}"
+
+    lookup = build_part_img_lookup(["p1", "p2", "p1"], fetcher, delay_seconds=0)
+
+    assert lookup == {"p1": "url-p1", "p2": "url-p2"}
+    assert calls == ["p1", "p2"]
+
+
+def test_add_part_img_urls_and_write(tmp_path: Path) -> None:
+    """Add the head URLs and rewrite minifigs_by_set."""
+    rows = [
+        {"set_num": "123-1", "part_num": "p1", "known_character": "Alice", "fig_num": "f1", "gender": "female"},
+    ]
+    lookup = {"p1": "http://img/p1.jpg"}
+
+    enriched = add_part_img_urls(rows, lookup)
+    destination = tmp_path / "minifigs_by_set.csv"
+    write_minifigs_by_set_with_images(destination, enriched)
+
+    assert read_rows(destination) == [
+        {
+            "set_num": "123-1",
+            "part_num": "p1",
+            "known_character": "Alice",
+            "fig_num": "f1",
+            "gender": "female",
+            "part_img_url": "http://img/p1.jpg",
+        }
+    ]
+
+
+def test_build_download_plan_and_download(tmp_path: Path) -> None:
+    """Build the plan and download the binaries through a stub downloader."""
+    sets_rows = [
+        {"set_num": "123-1", "set_id": "123", "img_url": "http://set.img", "name": "A", "year": "2020"},
+    ]
+    minifigs_rows = [
+        {"set_num": "123-1", "part_num": "p1", "known_character": "Bob", "fig_num": "fig-1", "gender": "male", "part_img_url": "http://head.img"}
+    ]
+    minifigs_catalog = {"fig-1": {"img_url": "http://fig.img"}}
+    base_dir = tmp_path / "resources"
+
+    plan = build_download_plan(sets_rows, minifigs_rows, minifigs_catalog, base_dir)
+    downloaded: list[tuple[str, Path]] = []
+
+    def downloader(url: str, path: Path) -> bool:
+        downloaded.append((url, path))
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_bytes(b"data")
+        return True
+
+    download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
+
+    assert downloaded == [
+        ("http://set.img", base_dir / "123" / "set.jpg"),
+        ("http://fig.img", base_dir / "123" / "Bob" / "minifig.jpg"),
+        ("http://head.img", base_dir / "123" / "Bob" / "head.jpg"),
+    ]
+    assert (base_dir / "123" / "Bob" / "head.jpg").exists()
+
+
+def test_download_resources_duplicates_from_cache(tmp_path: Path) -> None:
+    """Reuse files already downloaded for other sets by copying them locally."""
+    plan = [
+        {"url": "http://same.img", "path": tmp_path / "resources" / "111" / "set.jpg"},
+        {"url": "http://same.img", "path": tmp_path / "resources" / "222" / "set.jpg"},
+    ]
+    downloads: list[tuple[str, Path]] = []
+
+    def downloader(url: str, path: Path) -> bool:
+        downloads.append((url, path))
+        ensure_parent_dir(path)
+        path.write_bytes(b"img")
+        return True
+
+    download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
+
+    assert downloads == [("http://same.img", tmp_path / "resources" / "111" / "set.jpg")]
+    assert (tmp_path / "resources" / "222" / "set.jpg").exists()
+
+
+def test_sanitize_name_handles_special_chars() -> None:
+    """Strip special characters when cleaning names."""
+    assert sanitize_name("Owen Grady") == "Owen_Grady"
+    assert sanitize_name("Kayla-Watts!") == "Kayla_Watts"
+    assert sanitize_name("") == "Unknown"
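
As a usage note, a minimal resume sketch built only on the helpers added above (`load_part_img_cache`, `build_part_img_lookup`); the scratch cache path, the fake fetcher and the part numbers are made up for illustration, and a real run would keep the default fair-use delay and the real `fetch_part_img_url`:

    from pathlib import Path

    from lib.rebrickable.resources import build_part_img_lookup, load_part_img_cache

    cache_path = Path("/tmp/part_img_cache_demo.csv")  # scratch path, not the real cache file
    cache = load_part_img_cache(cache_path)            # {} on a first run, previous URLs afterwards

    def fake_fetcher(part_num: str) -> str:
        # stand-in for fetch_part_img_url(part_num, token, session); the URL is made up
        return f"https://example.invalid/parts/{part_num}.jpg"

    lookup = build_part_img_lookup(
        ["3626cpb01", "3626cpb01", "970c00"],  # duplicates collapse to a single fetch each
        fetcher=fake_fetcher,
        cache_path=cache_path,    # the cache is rewritten after every fetch, so an interrupted run resumes
        existing_cache=cache,     # parts already cached are never fetched again
        delay_seconds=0,          # keep the default fair-use delay for real API calls
    )
    print(lookup)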