Make resource downloads more patient and cached
parent 41798ea3df
commit fd5db10693
README.md (+14)
@@ -327,3 +327,17 @@ The computation reads `data/intermediate/minifigs_by_set.csv`, `data/raw/parts.csv` and `
- `data/intermediate/minifig_head_faces_by_character.csv`: volume and share per character.

The visuals `figures/step30/minifig_head_faces_timeline.png`, `figures/step30/minifig_head_faces_top_sets.png` and `figures/step30/minifig_head_faces_characters.png` show, respectively, the yearly trajectory, the top sets involved, and the characters with dual-face heads.

### Step 31: downloading visual resources (sets, minifigs, heads)

1. Set `REBRICKABLE_TOKEN` in `.env` (Rebrickable API key).
2. `source .venv/bin/activate`
3. `python -m scripts.download_resources`
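
For step 1, `.env` needs a single line; the value below is a placeholder for your own API key:

```
REBRICKABLE_TOKEN=your_rebrickable_api_key
```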

The script reads `data/intermediate/sets_enriched.csv`, `data/intermediate/minifigs_by_set.csv` and `data/raw/minifigs.csv`. It fetches head image URLs from the Rebrickable API endpoint `/api/v3/lego/parts/{part_num}/`, adds a `part_img_url` column to `data/intermediate/minifigs_by_set.csv`, and then downloads (a sample layout is sketched after this list):
- `resources/{set_id}/set.jpg`: the set visual (`img_url`).
- `resources/{set_id}/{known_character}/minifig.jpg`: the full minifig visual.
- `resources/{set_id}/{known_character}/head.jpg`: the matching head visual.
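
For instance, a set with one named character would produce a layout like this sketch (the set id `1234` is hypothetical; `Owen_Grady` shows how `sanitize_name` rewrites a character name):

```
resources/
└── 1234/
    ├── set.jpg
    └── Owen_Grady/
        ├── minifig.jpg
        └── head.jpg
```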

API requests are deduplicated, spaced out (fair use) and cached in `data/intermediate/part_img_cache.csv` so an interrupted run can resume. Already-downloaded images are reused locally to avoid repeated requests.
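
The cache itself is a plain two-column CSV keyed by part number, matching the fieldnames in `persist_part_img_cache`; a sketch using the placeholder values from the tests:

```
part_num,part_img_url
p1,http://img/p1.jpg
```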

lib/rebrickable/resources.py (new file, +201)
@@ -0,0 +1,201 @@
"""Téléchargement des ressources (sets, minifigs, têtes) et enrichissement des URLs."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, Dict, Iterable, List, Sequence
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from lib.filesystem import ensure_parent_dir
|
||||||
|
from lib.rebrickable.stats import read_rows
|
||||||
|
|
||||||
|
|
||||||
|
def load_sets_enriched(path: Path) -> List[dict]:
|
||||||
|
"""Charge les sets enrichis pour accéder aux URLs d'images de set."""
|
||||||
|
return read_rows(path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_minifigs_by_set(path: Path) -> List[dict]:
|
||||||
|
"""Charge minifigs_by_set.csv en mémoire."""
|
||||||
|
return read_rows(path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_minifigs_catalog(path: Path) -> Dict[str, dict]:
|
||||||
|
"""Indexe les minifigs par identifiant."""
|
||||||
|
catalog: Dict[str, dict] = {}
|
||||||
|
with path.open() as csv_file:
|
||||||
|
reader = csv.DictReader(csv_file)
|
||||||
|
for row in reader:
|
||||||
|
catalog[row["fig_num"]] = row
|
||||||
|
return catalog
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_part_img_url(part_num: str, token: str, session: requests.Session) -> str:
|
||||||
|
"""Récupère l'URL d'image d'une pièce via l'API Rebrickable."""
|
||||||
|
retries = 0
|
||||||
|
backoff = 2.0
|
||||||
|
while True:
|
||||||
|
response = session.get(f"https://rebrickable.com/api/v3/lego/parts/{part_num}/", headers={"Authorization": f"key {token}"})
|
||||||
|
if response.status_code == 429:
|
||||||
|
time.sleep(backoff)
|
||||||
|
retries += 1
|
||||||
|
backoff = min(backoff * 1.5, 10.0)
|
||||||
|
if retries > 8:
|
||||||
|
response.raise_for_status()
|
||||||
|
continue
|
||||||
|
response.raise_for_status()
|
||||||
|
payload = response.json()
|
||||||
|
return payload["part_img_url"]
|
||||||
|
|
||||||
|
|
||||||
|
def load_part_img_cache(cache_path: Path) -> Dict[str, str]:
|
||||||
|
"""Charge le cache des URLs de têtes s'il existe."""
|
||||||
|
if not cache_path.exists():
|
||||||
|
return {}
|
||||||
|
cache: Dict[str, str] = {}
|
||||||
|
with cache_path.open() as cache_file:
|
||||||
|
reader = csv.DictReader(cache_file)
|
||||||
|
for row in reader:
|
||||||
|
cache[row["part_num"]] = row["part_img_url"]
|
||||||
|
return cache
|
||||||
|
|
||||||
|
|
||||||
|
def persist_part_img_cache(cache_path: Path, cache: Dict[str, str]) -> None:
|
||||||
|
"""Persist le cache des URLs pour reprise après interruption."""
|
||||||
|
ensure_parent_dir(cache_path)
|
||||||
|
with cache_path.open("w", newline="") as cache_file:
|
||||||
|
writer = csv.DictWriter(cache_file, fieldnames=["part_num", "part_img_url"])
|
||||||
|
writer.writeheader()
|
||||||
|
for part_num, url in sorted(cache.items()):
|
||||||
|
writer.writerow({"part_num": part_num, "part_img_url": url})
|
||||||
|
|
||||||
|
|
||||||
|
def build_part_img_lookup(
|
||||||
|
part_numbers: Iterable[str],
|
||||||
|
fetcher: Callable[[str], str],
|
||||||
|
cache_path: Path | None = None,
|
||||||
|
existing_cache: Dict[str, str] | None = None,
|
||||||
|
delay_seconds: float = 1.6,
|
||||||
|
) -> Dict[str, str]:
|
||||||
|
"""Construit un index part_num -> URL d'image en espaçant les requêtes."""
|
||||||
|
cache = dict(existing_cache or {})
|
||||||
|
unique_parts = sorted(set(part_numbers))
|
||||||
|
for part_num in unique_parts:
|
||||||
|
if part_num in cache:
|
||||||
|
continue
|
||||||
|
cache[part_num] = fetcher(part_num)
|
||||||
|
if cache_path is not None:
|
||||||
|
persist_part_img_cache(cache_path, cache)
|
||||||
|
time.sleep(delay_seconds)
|
||||||
|
return cache
|
||||||
|
|
||||||
|
|
||||||
|
def add_part_img_urls(minifigs_rows: Iterable[dict], part_img_lookup: Dict[str, str]) -> List[dict]:
|
||||||
|
"""Ajoute part_img_url aux lignes minifigs_by_set."""
|
||||||
|
enriched: List[dict] = []
|
||||||
|
for row in minifigs_rows:
|
||||||
|
existing = row.get("part_img_url", "").strip()
|
||||||
|
part_img_url = existing if existing != "" else part_img_lookup[row["part_num"]]
|
||||||
|
enriched.append(
|
||||||
|
{
|
||||||
|
"set_num": row["set_num"],
|
||||||
|
"part_num": row["part_num"],
|
||||||
|
"known_character": row["known_character"],
|
||||||
|
"fig_num": row["fig_num"],
|
||||||
|
"gender": row["gender"],
|
||||||
|
"part_img_url": part_img_url,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return enriched
|
||||||
|
|
||||||
|
|
||||||
|
def write_minifigs_by_set_with_images(destination_path: Path, rows: Sequence[dict]) -> None:
|
||||||
|
"""Réécrit le CSV minifigs_by_set avec la colonne part_img_url."""
|
||||||
|
ensure_parent_dir(destination_path)
|
||||||
|
fieldnames = ["set_num", "part_num", "known_character", "fig_num", "gender", "part_img_url"]
|
||||||
|
with destination_path.open("w", newline="") as csv_file:
|
||||||
|
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
|
||||||
|
writer.writeheader()
|
||||||
|
for row in rows:
|
||||||
|
writer.writerow(row)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_name(raw_name: str) -> str:
|
||||||
|
"""Nettoie un nom pour construire un chemin de fichier sûr."""
|
||||||
|
cleaned = re.sub(r"[^A-Za-z0-9]+", "_", raw_name).strip("_")
|
||||||
|
if cleaned == "":
|
||||||
|
return "Unknown"
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def build_download_plan(
|
||||||
|
sets_rows: Iterable[dict],
|
||||||
|
minifigs_rows: Iterable[dict],
|
||||||
|
minifigs_catalog: Dict[str, dict],
|
||||||
|
base_dir: Path,
|
||||||
|
) -> List[dict]:
|
||||||
|
"""Construit la liste des fichiers à télécharger (sets, minifigs, têtes)."""
|
||||||
|
plan: List[dict] = []
|
||||||
|
sets_list = list(sets_rows)
|
||||||
|
set_ids: Dict[str, str] = {row["set_num"]: row["set_id"] for row in sets_list}
|
||||||
|
for set_row in sets_list:
|
||||||
|
set_dir = base_dir / set_row["set_id"]
|
||||||
|
plan.append({"url": set_row["img_url"], "path": set_dir / "set.jpg"})
|
||||||
|
for row in minifigs_rows:
|
||||||
|
if (row.get("known_character") or "").strip().lower() == "figurant":
|
||||||
|
continue
|
||||||
|
set_dir = base_dir / set_ids[row["set_num"]]
|
||||||
|
character_dir = set_dir / sanitize_name(row["known_character"] or "Unknown")
|
||||||
|
minifig = minifigs_catalog[row["fig_num"]]
|
||||||
|
plan.append({"url": minifig["img_url"], "path": character_dir / "minifig.jpg"})
|
||||||
|
plan.append({"url": row["part_img_url"], "path": character_dir / "head.jpg"})
|
||||||
|
return plan
|
||||||
|
|
||||||
|
|
||||||
|
def download_binary(url: str, destination_path: Path, session: requests.Session) -> bool:
|
||||||
|
"""Télécharge un binaire vers un chemin donné. Retourne False si 404."""
|
||||||
|
ensure_parent_dir(destination_path)
|
||||||
|
response = session.get(url, stream=True)
|
||||||
|
if response.status_code == 404:
|
||||||
|
return False
|
||||||
|
response.raise_for_status()
|
||||||
|
with destination_path.open("wb") as target_file:
|
||||||
|
for chunk in response.iter_content(chunk_size=8192):
|
||||||
|
target_file.write(chunk)
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def download_resources(
|
||||||
|
plan: Iterable[dict],
|
||||||
|
downloader: Callable[[str, Path], bool],
|
||||||
|
delay_seconds: float = 0.35,
|
||||||
|
log_path: Path | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""Exécute les téléchargements en espaçant les requêtes et journalise les statuts."""
|
||||||
|
cache: Dict[str, Path] = {}
|
||||||
|
log_rows: List[dict] = []
|
||||||
|
for item in plan:
|
||||||
|
if item["path"].exists():
|
||||||
|
time.sleep(delay_seconds)
|
||||||
|
continue
|
||||||
|
if item["url"] in cache and cache[item["url"]].exists():
|
||||||
|
ensure_parent_dir(item["path"])
|
||||||
|
shutil.copy2(cache[item["url"]], item["path"])
|
||||||
|
else:
|
||||||
|
success = downloader(item["url"], item["path"])
|
||||||
|
if success:
|
||||||
|
cache[item["url"]] = item["path"]
|
||||||
|
else:
|
||||||
|
log_rows.append({"url": item["url"], "path": str(item["path"]), "status": "missing"})
|
||||||
|
time.sleep(delay_seconds)
|
||||||
|
if log_path is not None:
|
||||||
|
ensure_parent_dir(log_path)
|
||||||
|
with log_path.open("w", newline="") as csv_file:
|
||||||
|
writer = csv.DictWriter(csv_file, fieldnames=["url", "path", "status"])
|
||||||
|
writer.writeheader()
|
||||||
|
for row in log_rows:
|
||||||
|
writer.writerow(row)
|
||||||

scripts/download_resources.py (new file, +72)
@@ -0,0 +1,72 @@
"""Télécharge les ressources (sets, minifigs, têtes) pour les sets filtrés."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from lib.rebrickable.resources import (
|
||||||
|
add_part_img_urls,
|
||||||
|
build_download_plan,
|
||||||
|
build_part_img_lookup,
|
||||||
|
download_binary,
|
||||||
|
download_resources,
|
||||||
|
fetch_part_img_url,
|
||||||
|
load_minifigs_by_set,
|
||||||
|
load_minifigs_catalog,
|
||||||
|
load_sets_enriched,
|
||||||
|
load_part_img_cache,
|
||||||
|
persist_part_img_cache,
|
||||||
|
write_minifigs_by_set_with_images,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
SETS_PATH = Path("data/intermediate/sets_enriched.csv")
|
||||||
|
MINIFIGS_BY_SET_PATH = Path("data/intermediate/minifigs_by_set.csv")
|
||||||
|
MINIFIGS_CATALOG_PATH = Path("data/raw/minifigs.csv")
|
||||||
|
RESOURCES_DIR = Path("figures/rebrickable")
|
||||||
|
REQUEST_DELAY_SECONDS_IMAGES = 0.35
|
||||||
|
PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
|
||||||
|
DOWNLOAD_LOG_PATH = Path("data/intermediate/resources_download_log.csv")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
"""Construit les URLs manquantes et télécharge les images associées."""
|
||||||
|
load_dotenv()
|
||||||
|
token = os.environ["REBRICKABLE_TOKEN"]
|
||||||
|
session = requests.Session()
|
||||||
|
|
||||||
|
sets = load_sets_enriched(SETS_PATH)
|
||||||
|
minifigs_by_set = load_minifigs_by_set(MINIFIGS_BY_SET_PATH)
|
||||||
|
minifigs_catalog = load_minifigs_catalog(MINIFIGS_CATALOG_PATH)
|
||||||
|
cache = load_part_img_cache(PART_IMG_CACHE_PATH)
|
||||||
|
|
||||||
|
missing_part_numbers = {
|
||||||
|
row["part_num"]
|
||||||
|
for row in minifigs_by_set
|
||||||
|
if row.get("part_img_url", "").strip() == "" and row["part_num"] not in cache
|
||||||
|
}
|
||||||
|
part_img_lookup = build_part_img_lookup(
|
||||||
|
missing_part_numbers,
|
||||||
|
fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
|
||||||
|
cache_path=PART_IMG_CACHE_PATH,
|
||||||
|
existing_cache=cache,
|
||||||
|
)
|
||||||
|
if cache:
|
||||||
|
part_img_lookup.update(cache)
|
||||||
|
persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)
|
||||||
|
minifigs_with_imgs = add_part_img_urls(minifigs_by_set, part_img_lookup)
|
||||||
|
write_minifigs_by_set_with_images(MINIFIGS_BY_SET_PATH, minifigs_with_imgs)
|
||||||
|
|
||||||
|
plan = build_download_plan(sets, minifigs_with_imgs, minifigs_catalog, RESOURCES_DIR)
|
||||||
|
download_resources(
|
||||||
|
plan,
|
||||||
|
downloader=lambda url, path: download_binary(url, path, session),
|
||||||
|
delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
|
||||||
|
log_path=DOWNLOAD_LOG_PATH,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||

tests/test_resources.py (new file, +108)
@@ -0,0 +1,108 @@
"""Tests des outils de téléchargement de ressources."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from lib.rebrickable.resources import (
|
||||||
|
add_part_img_urls,
|
||||||
|
build_download_plan,
|
||||||
|
build_part_img_lookup,
|
||||||
|
download_resources,
|
||||||
|
sanitize_name,
|
||||||
|
write_minifigs_by_set_with_images,
|
||||||
|
)
|
||||||
|
from lib.filesystem import ensure_parent_dir
|
||||||
|
from lib.rebrickable.stats import read_rows
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_part_img_lookup_calls_fetcher_once_per_part() -> None:
|
||||||
|
"""Construit un index en appelant le fetcher sur les références uniques."""
|
||||||
|
calls: list[str] = []
|
||||||
|
|
||||||
|
def fetcher(part_num: str) -> str:
|
||||||
|
calls.append(part_num)
|
||||||
|
return f"url-{part_num}"
|
||||||
|
|
||||||
|
lookup = build_part_img_lookup(["p1", "p2", "p1"], fetcher, delay_seconds=0)
|
||||||
|
|
||||||
|
assert lookup == {"p1": "url-p1", "p2": "url-p2"}
|
||||||
|
assert calls == ["p1", "p2"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_part_img_urls_and_write(tmp_path: Path) -> None:
|
||||||
|
"""Ajoute les URLs de tête et réécrit minifigs_by_set."""
|
||||||
|
rows = [
|
||||||
|
{"set_num": "123-1", "part_num": "p1", "known_character": "Alice", "fig_num": "f1", "gender": "female"},
|
||||||
|
]
|
||||||
|
lookup = {"p1": "http://img/p1.jpg"}
|
||||||
|
|
||||||
|
enriched = add_part_img_urls(rows, lookup)
|
||||||
|
destination = tmp_path / "minifigs_by_set.csv"
|
||||||
|
write_minifigs_by_set_with_images(destination, enriched)
|
||||||
|
|
||||||
|
assert read_rows(destination) == [
|
||||||
|
{
|
||||||
|
"set_num": "123-1",
|
||||||
|
"part_num": "p1",
|
||||||
|
"known_character": "Alice",
|
||||||
|
"fig_num": "f1",
|
||||||
|
"gender": "female",
|
||||||
|
"part_img_url": "http://img/p1.jpg",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_download_plan_and_download(tmp_path: Path) -> None:
|
||||||
|
"""Construit le plan et télécharge les binaires via un downloader stub."""
|
||||||
|
sets_rows = [
|
||||||
|
{"set_num": "123-1", "set_id": "123", "img_url": "http://set.img", "name": "A", "year": "2020"},
|
||||||
|
]
|
||||||
|
minifigs_rows = [
|
||||||
|
{"set_num": "123-1", "part_num": "p1", "known_character": "Bob", "fig_num": "fig-1", "gender": "male", "part_img_url": "http://head.img"}
|
||||||
|
]
|
||||||
|
minifigs_catalog = {"fig-1": {"img_url": "http://fig.img"}}
|
||||||
|
base_dir = tmp_path / "resources"
|
||||||
|
|
||||||
|
plan = build_download_plan(sets_rows, minifigs_rows, minifigs_catalog, base_dir)
|
||||||
|
downloaded: list[tuple[str, Path]] = []
|
||||||
|
|
||||||
|
def downloader(url: str, path: Path) -> bool:
|
||||||
|
downloaded.append((url, path))
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
path.write_bytes(b"data")
|
||||||
|
return True
|
||||||
|
|
||||||
|
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
|
||||||
|
|
||||||
|
assert downloaded == [
|
||||||
|
("http://set.img", base_dir / "123" / "set.jpg"),
|
||||||
|
("http://fig.img", base_dir / "123" / "Bob" / "minifig.jpg"),
|
||||||
|
("http://head.img", base_dir / "123" / "Bob" / "head.jpg"),
|
||||||
|
]
|
||||||
|
assert (base_dir / "123" / "Bob" / "head.jpg").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_resources_duplicates_from_cache(tmp_path: Path) -> None:
|
||||||
|
"""Duplique les fichiers déjà téléchargés pour d'autres sets."""
|
||||||
|
plan = [
|
||||||
|
{"url": "http://same.img", "path": tmp_path / "resources" / "111" / "set.jpg"},
|
||||||
|
{"url": "http://same.img", "path": tmp_path / "resources" / "222" / "set.jpg"},
|
||||||
|
]
|
||||||
|
downloads: list[tuple[str, Path]] = []
|
||||||
|
|
||||||
|
def downloader(url: str, path: Path) -> bool:
|
||||||
|
downloads.append((url, path))
|
||||||
|
ensure_parent_dir(path)
|
||||||
|
path.write_bytes(b"img")
|
||||||
|
return True
|
||||||
|
|
||||||
|
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
|
||||||
|
|
||||||
|
assert downloads == [("http://same.img", tmp_path / "resources" / "111" / "set.jpg")]
|
||||||
|
assert (tmp_path / "resources" / "222" / "set.jpg").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_name_handles_special_chars() -> None:
|
||||||
|
"""Nettoie les noms en enlevant les caractères spéciaux."""
|
||||||
|
assert sanitize_name("Owen Grady") == "Owen_Grady"
|
||||||
|
assert sanitize_name("Kayla-Watts!") == "Kayla_Watts"
|
||||||
|
assert sanitize_name("") == "Unknown"
|
||||||