
Make resource downloads more patient and add caching

Richard Dern 2025-12-02 21:19:30 +01:00
parent 41798ea3df
commit fd5db10693
4 changed files with 395 additions and 0 deletions


@@ -327,3 +327,17 @@ The computation reads `data/intermediate/minifigs_by_set.csv`, `data/raw/parts.csv` and `
- `data/intermediate/minifig_head_faces_by_character.csv`: volume and share per character.
The visuals `figures/step30/minifig_head_faces_timeline.png`, `figures/step30/minifig_head_faces_top_sets.png` and `figures/step30/minifig_head_faces_characters.png` show, respectively, the yearly trajectory, the top sets involved, and the characters with dual-face heads.
### Step 31: downloading the visual resources (sets, minifigs, heads)
1. Set `REBRICKABLE_TOKEN` in `.env` (Rebrickable API key).
2. `source .venv/bin/activate`
3. `python -m scripts.download_resources`
The script reads `data/intermediate/sets_enriched.csv`, `data/intermediate/minifigs_by_set.csv` and `data/raw/minifigs.csv`. It fetches head image URLs from the Rebrickable API `/api/v3/lego/parts/{part_num}/`, adds the `part_img_url` column to `data/intermediate/minifigs_by_set.csv`, then downloads:
- `resources/{set_id}/set.jpg`: the set image (`img_url`).
- `resources/{set_id}/{known_character}/minifig.jpg`: the full minifig image.
- `resources/{set_id}/{known_character}/head.jpg`: the matching head image.
API requests are deduplicated, spaced out (fair use) and cached in `data/intermediate/part_img_cache.csv` so the run can resume after an interruption. Images already downloaded are reused locally to avoid repeated requests.
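
As a quick reference, the snippet below is a minimal sketch of the per-part API call the script performs for each missing head image URL. Only the endpoint, the `key` authorization scheme and the `part_img_url` response field are taken from the script; the part number shown is a placeholder:

```python
import os

import requests

# Minimal sketch of the request made for each missing head image URL.
# REBRICKABLE_TOKEN comes from .env; the part number below is only an example.
token = os.environ["REBRICKABLE_TOKEN"]
response = requests.get(
    "https://rebrickable.com/api/v3/lego/parts/3626cpb1234/",  # placeholder part number
    headers={"Authorization": f"key {token}"},
    timeout=30,
)
response.raise_for_status()
print(response.json()["part_img_url"])
```

The full implementation (`lib/rebrickable/resources.py` below) adds retries with backoff on HTTP 429 and persists each result to `data/intermediate/part_img_cache.csv`.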

lib/rebrickable/resources.py Normal file

@@ -0,0 +1,201 @@
"""Téléchargement des ressources (sets, minifigs, têtes) et enrichissement des URLs."""
import os
import time
import re
import csv
import shutil
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Sequence
import requests
from lib.filesystem import ensure_parent_dir
from lib.rebrickable.stats import read_rows
def load_sets_enriched(path: Path) -> List[dict]:
"""Charge les sets enrichis pour accéder aux URLs d'images de set."""
return read_rows(path)
def load_minifigs_by_set(path: Path) -> List[dict]:
"""Charge minifigs_by_set.csv en mémoire."""
return read_rows(path)
def load_minifigs_catalog(path: Path) -> Dict[str, dict]:
"""Indexe les minifigs par identifiant."""
catalog: Dict[str, dict] = {}
with path.open() as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
catalog[row["fig_num"]] = row
return catalog
def fetch_part_img_url(part_num: str, token: str, session: requests.Session) -> str:
"""Récupère l'URL d'image d'une pièce via l'API Rebrickable."""
retries = 0
backoff = 2.0
while True:
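        # Rebrickable rate-limits aggressively: on HTTP 429, wait and retry with a growing delay.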
response = session.get(f"https://rebrickable.com/api/v3/lego/parts/{part_num}/", headers={"Authorization": f"key {token}"})
if response.status_code == 429:
time.sleep(backoff)
retries += 1
backoff = min(backoff * 1.5, 10.0)
if retries > 8:
response.raise_for_status()
continue
response.raise_for_status()
payload = response.json()
return payload["part_img_url"]
def load_part_img_cache(cache_path: Path) -> Dict[str, str]:
"""Charge le cache des URLs de têtes s'il existe."""
if not cache_path.exists():
return {}
cache: Dict[str, str] = {}
with cache_path.open() as cache_file:
reader = csv.DictReader(cache_file)
for row in reader:
cache[row["part_num"]] = row["part_img_url"]
return cache
def persist_part_img_cache(cache_path: Path, cache: Dict[str, str]) -> None:
"""Persist le cache des URLs pour reprise après interruption."""
ensure_parent_dir(cache_path)
with cache_path.open("w", newline="") as cache_file:
writer = csv.DictWriter(cache_file, fieldnames=["part_num", "part_img_url"])
writer.writeheader()
for part_num, url in sorted(cache.items()):
writer.writerow({"part_num": part_num, "part_img_url": url})
def build_part_img_lookup(
part_numbers: Iterable[str],
fetcher: Callable[[str], str],
cache_path: Path | None = None,
existing_cache: Dict[str, str] | None = None,
delay_seconds: float = 1.6,
) -> Dict[str, str]:
"""Construit un index part_num -> URL d'image en espaçant les requêtes."""
cache = dict(existing_cache or {})
unique_parts = sorted(set(part_numbers))
for part_num in unique_parts:
if part_num in cache:
continue
cache[part_num] = fetcher(part_num)
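        # Persist the cache after every fetch so an interrupted run can resume where it left off.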
if cache_path is not None:
persist_part_img_cache(cache_path, cache)
time.sleep(delay_seconds)
return cache
def add_part_img_urls(minifigs_rows: Iterable[dict], part_img_lookup: Dict[str, str]) -> List[dict]:
"""Ajoute part_img_url aux lignes minifigs_by_set."""
enriched: List[dict] = []
for row in minifigs_rows:
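        # Keep any URL already present in the row; only fill missing ones from the lookup.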
existing = row.get("part_img_url", "").strip()
part_img_url = existing if existing != "" else part_img_lookup[row["part_num"]]
enriched.append(
{
"set_num": row["set_num"],
"part_num": row["part_num"],
"known_character": row["known_character"],
"fig_num": row["fig_num"],
"gender": row["gender"],
"part_img_url": part_img_url,
}
)
return enriched
def write_minifigs_by_set_with_images(destination_path: Path, rows: Sequence[dict]) -> None:
"""Réécrit le CSV minifigs_by_set avec la colonne part_img_url."""
ensure_parent_dir(destination_path)
fieldnames = ["set_num", "part_num", "known_character", "fig_num", "gender", "part_img_url"]
with destination_path.open("w", newline="") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for row in rows:
writer.writerow(row)
def sanitize_name(raw_name: str) -> str:
"""Nettoie un nom pour construire un chemin de fichier sûr."""
cleaned = re.sub(r"[^A-Za-z0-9]+", "_", raw_name).strip("_")
if cleaned == "":
return "Unknown"
return cleaned
def build_download_plan(
sets_rows: Iterable[dict],
minifigs_rows: Iterable[dict],
minifigs_catalog: Dict[str, dict],
base_dir: Path,
) -> List[dict]:
"""Construit la liste des fichiers à télécharger (sets, minifigs, têtes)."""
plan: List[dict] = []
sets_list = list(sets_rows)
set_ids: Dict[str, str] = {row["set_num"]: row["set_id"] for row in sets_list}
for set_row in sets_list:
set_dir = base_dir / set_row["set_id"]
plan.append({"url": set_row["img_url"], "path": set_dir / "set.jpg"})
for row in minifigs_rows:
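        # Background characters ("figurant") do not get a per-character folder; skip them.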
if (row.get("known_character") or "").strip().lower() == "figurant":
continue
set_dir = base_dir / set_ids[row["set_num"]]
character_dir = set_dir / sanitize_name(row["known_character"] or "Unknown")
minifig = minifigs_catalog[row["fig_num"]]
plan.append({"url": minifig["img_url"], "path": character_dir / "minifig.jpg"})
plan.append({"url": row["part_img_url"], "path": character_dir / "head.jpg"})
return plan
def download_binary(url: str, destination_path: Path, session: requests.Session) -> bool:
"""Télécharge un binaire vers un chemin donné. Retourne False si 404."""
ensure_parent_dir(destination_path)
response = session.get(url, stream=True)
if response.status_code == 404:
return False
response.raise_for_status()
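    # Stream the response body to disk in chunks to keep memory usage low.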
with destination_path.open("wb") as target_file:
for chunk in response.iter_content(chunk_size=8192):
target_file.write(chunk)
return True
def download_resources(
plan: Iterable[dict],
downloader: Callable[[str, Path], bool],
delay_seconds: float = 0.35,
log_path: Path | None = None,
) -> None:
"""Exécute les téléchargements en espaçant les requêtes et journalise les statuts."""
cache: Dict[str, Path] = {}
log_rows: List[dict] = []
for item in plan:
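        # Files already on disk are kept as-is; no request is issued for them.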
if item["path"].exists():
time.sleep(delay_seconds)
continue
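        # The same URL may appear for several sets: copy the first download instead of re-fetching it.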
if item["url"] in cache and cache[item["url"]].exists():
ensure_parent_dir(item["path"])
shutil.copy2(cache[item["url"]], item["path"])
else:
success = downloader(item["url"], item["path"])
if success:
cache[item["url"]] = item["path"]
else:
log_rows.append({"url": item["url"], "path": str(item["path"]), "status": "missing"})
time.sleep(delay_seconds)
if log_path is not None:
ensure_parent_dir(log_path)
with log_path.open("w", newline="") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=["url", "path", "status"])
writer.writeheader()
for row in log_rows:
writer.writerow(row)

scripts/download_resources.py Normal file

@@ -0,0 +1,72 @@
"""Télécharge les ressources (sets, minifigs, têtes) pour les sets filtrés."""
import os
from pathlib import Path
import requests
from dotenv import load_dotenv
from lib.rebrickable.resources import (
add_part_img_urls,
build_download_plan,
build_part_img_lookup,
download_binary,
download_resources,
fetch_part_img_url,
load_minifigs_by_set,
load_minifigs_catalog,
load_sets_enriched,
load_part_img_cache,
persist_part_img_cache,
write_minifigs_by_set_with_images,
)
SETS_PATH = Path("data/intermediate/sets_enriched.csv")
MINIFIGS_BY_SET_PATH = Path("data/intermediate/minifigs_by_set.csv")
MINIFIGS_CATALOG_PATH = Path("data/raw/minifigs.csv")
RESOURCES_DIR = Path("figures/rebrickable")
REQUEST_DELAY_SECONDS_IMAGES = 0.35
PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
DOWNLOAD_LOG_PATH = Path("data/intermediate/resources_download_log.csv")
def main() -> None:
"""Construit les URLs manquantes et télécharge les images associées."""
load_dotenv()
token = os.environ["REBRICKABLE_TOKEN"]
session = requests.Session()
sets = load_sets_enriched(SETS_PATH)
minifigs_by_set = load_minifigs_by_set(MINIFIGS_BY_SET_PATH)
minifigs_catalog = load_minifigs_catalog(MINIFIGS_CATALOG_PATH)
cache = load_part_img_cache(PART_IMG_CACHE_PATH)
missing_part_numbers = {
row["part_num"]
for row in minifigs_by_set
if row.get("part_img_url", "").strip() == "" and row["part_num"] not in cache
}
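    # Query the API only for part numbers missing from both the CSV and the on-disk cache.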
part_img_lookup = build_part_img_lookup(
missing_part_numbers,
fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
cache_path=PART_IMG_CACHE_PATH,
existing_cache=cache,
)
if cache:
part_img_lookup.update(cache)
persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)
minifigs_with_imgs = add_part_img_urls(minifigs_by_set, part_img_lookup)
write_minifigs_by_set_with_images(MINIFIGS_BY_SET_PATH, minifigs_with_imgs)
plan = build_download_plan(sets, minifigs_with_imgs, minifigs_catalog, RESOURCES_DIR)
download_resources(
plan,
downloader=lambda url, path: download_binary(url, path, session),
delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
log_path=DOWNLOAD_LOG_PATH,
)
if __name__ == "__main__":
main()

tests/test_resources.py Normal file

@@ -0,0 +1,108 @@
"""Tests des outils de téléchargement de ressources."""
from pathlib import Path
from lib.rebrickable.resources import (
add_part_img_urls,
build_download_plan,
build_part_img_lookup,
download_resources,
sanitize_name,
write_minifigs_by_set_with_images,
)
from lib.filesystem import ensure_parent_dir
from lib.rebrickable.stats import read_rows
def test_build_part_img_lookup_calls_fetcher_once_per_part() -> None:
"""Construit un index en appelant le fetcher sur les références uniques."""
calls: list[str] = []
def fetcher(part_num: str) -> str:
calls.append(part_num)
return f"url-{part_num}"
lookup = build_part_img_lookup(["p1", "p2", "p1"], fetcher, delay_seconds=0)
assert lookup == {"p1": "url-p1", "p2": "url-p2"}
assert calls == ["p1", "p2"]
def test_add_part_img_urls_and_write(tmp_path: Path) -> None:
"""Ajoute les URLs de tête et réécrit minifigs_by_set."""
rows = [
{"set_num": "123-1", "part_num": "p1", "known_character": "Alice", "fig_num": "f1", "gender": "female"},
]
lookup = {"p1": "http://img/p1.jpg"}
enriched = add_part_img_urls(rows, lookup)
destination = tmp_path / "minifigs_by_set.csv"
write_minifigs_by_set_with_images(destination, enriched)
assert read_rows(destination) == [
{
"set_num": "123-1",
"part_num": "p1",
"known_character": "Alice",
"fig_num": "f1",
"gender": "female",
"part_img_url": "http://img/p1.jpg",
}
]
def test_build_download_plan_and_download(tmp_path: Path) -> None:
"""Construit le plan et télécharge les binaires via un downloader stub."""
sets_rows = [
{"set_num": "123-1", "set_id": "123", "img_url": "http://set.img", "name": "A", "year": "2020"},
]
minifigs_rows = [
{"set_num": "123-1", "part_num": "p1", "known_character": "Bob", "fig_num": "fig-1", "gender": "male", "part_img_url": "http://head.img"}
]
minifigs_catalog = {"fig-1": {"img_url": "http://fig.img"}}
base_dir = tmp_path / "resources"
plan = build_download_plan(sets_rows, minifigs_rows, minifigs_catalog, base_dir)
downloaded: list[tuple[str, Path]] = []
def downloader(url: str, path: Path) -> bool:
downloaded.append((url, path))
path.parent.mkdir(parents=True, exist_ok=True)
path.write_bytes(b"data")
return True
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
assert downloaded == [
("http://set.img", base_dir / "123" / "set.jpg"),
("http://fig.img", base_dir / "123" / "Bob" / "minifig.jpg"),
("http://head.img", base_dir / "123" / "Bob" / "head.jpg"),
]
assert (base_dir / "123" / "Bob" / "head.jpg").exists()
def test_download_resources_duplicates_from_cache(tmp_path: Path) -> None:
"""Duplique les fichiers déjà téléchargés pour d'autres sets."""
plan = [
{"url": "http://same.img", "path": tmp_path / "resources" / "111" / "set.jpg"},
{"url": "http://same.img", "path": tmp_path / "resources" / "222" / "set.jpg"},
]
downloads: list[tuple[str, Path]] = []
def downloader(url: str, path: Path) -> bool:
downloads.append((url, path))
ensure_parent_dir(path)
path.write_bytes(b"img")
return True
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
assert downloads == [("http://same.img", tmp_path / "resources" / "111" / "set.jpg")]
assert (tmp_path / "resources" / "222" / "set.jpg").exists()
def test_sanitize_name_handles_special_chars() -> None:
"""Nettoie les noms en enlevant les caractères spéciaux."""
assert sanitize_name("Owen Grady") == "Owen_Grady"
assert sanitize_name("Kayla-Watts!") == "Kayla_Watts"
assert sanitize_name("") == "Unknown"