
Make resource downloads more patient and add caching

Richard Dern 2025-12-02 21:19:30 +01:00
parent 41798ea3df
commit fd5db10693
4 changed files with 395 additions and 0 deletions


@@ -327,3 +327,17 @@ The computation reads `data/intermediate/minifigs_by_set.csv`, `data/raw/parts.csv` and `
- `data/intermediate/minifig_head_faces_by_character.csv`: volume and share per character.
The visuals `figures/step30/minifig_head_faces_timeline.png`, `figures/step30/minifig_head_faces_top_sets.png` and `figures/step30/minifig_head_faces_characters.png` show, respectively, the yearly trajectory, the top sets involved, and the characters with dual-face heads.
### Step 31: downloading the visual resources (sets, minifigs, heads)
1. Set `REBRICKABLE_TOKEN` in `.env` (Rebrickable API key).
2. `source .venv/bin/activate`
3. `python -m scripts.download_resources`
The script reads `data/intermediate/sets_enriched.csv`, `data/intermediate/minifigs_by_set.csv` and `data/raw/minifigs.csv`. It fetches head image URLs from the Rebrickable API `/api/v3/lego/parts/{part_num}/`, adds the `part_img_url` column to `data/intermediate/minifigs_by_set.csv`, then downloads:
- `resources/{set_id}/set.jpg`: the set image (`img_url`).
- `resources/{set_id}/{known_character}/minifig.jpg`: the full minifig image.
- `resources/{set_id}/{known_character}/head.jpg`: the matching head image.
API requests are deduplicated, spaced out (fair use) and cached in `data/intermediate/part_img_cache.csv` so the run can resume after an interruption. Images already downloaded are reused locally to avoid repeated requests.
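
As a quick reference, the snippet below is a minimal sketch of the per-part API call the script performs for each missing head image URL. Only the endpoint, the `key` authorization scheme and the `part_img_url` response field are taken from the script; the part number shown is a placeholder:

```python
import os

import requests

# Minimal sketch of the request made for each missing head image URL.
# REBRICKABLE_TOKEN comes from .env; the part number below is only an example.
token = os.environ["REBRICKABLE_TOKEN"]
response = requests.get(
    "https://rebrickable.com/api/v3/lego/parts/3626cpb1234/",  # placeholder part number
    headers={"Authorization": f"key {token}"},
    timeout=30,
)
response.raise_for_status()
print(response.json()["part_img_url"])
```

The full implementation (`lib/rebrickable/resources.py` below) adds retries with backoff on HTTP 429 and persists each result to `data/intermediate/part_img_cache.csv`.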

lib/rebrickable/resources.py Normal file

@@ -0,0 +1,201 @@
"""Téléchargement des ressources (sets, minifigs, têtes) et enrichissement des URLs."""
import os
import time
import re
import csv
import shutil
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Sequence
import requests
from lib.filesystem import ensure_parent_dir
from lib.rebrickable.stats import read_rows
def load_sets_enriched(path: Path) -> List[dict]:
"""Charge les sets enrichis pour accéder aux URLs d'images de set."""
return read_rows(path)
def load_minifigs_by_set(path: Path) -> List[dict]:
"""Charge minifigs_by_set.csv en mémoire."""
return read_rows(path)
def load_minifigs_catalog(path: Path) -> Dict[str, dict]:
"""Indexe les minifigs par identifiant."""
catalog: Dict[str, dict] = {}
with path.open() as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
catalog[row["fig_num"]] = row
return catalog
def fetch_part_img_url(part_num: str, token: str, session: requests.Session) -> str:
"""Récupère l'URL d'image d'une pièce via l'API Rebrickable."""
retries = 0
backoff = 2.0
while True:
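        # Rebrickable rate-limits aggressively: on HTTP 429, wait and retry with a growing delay.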
response = session.get(f"https://rebrickable.com/api/v3/lego/parts/{part_num}/", headers={"Authorization": f"key {token}"})
if response.status_code == 429:
time.sleep(backoff)
retries += 1
backoff = min(backoff * 1.5, 10.0)
if retries > 8:
response.raise_for_status()
continue
response.raise_for_status()
payload = response.json()
return payload["part_img_url"]
def load_part_img_cache(cache_path: Path) -> Dict[str, str]:
"""Charge le cache des URLs de têtes s'il existe."""
if not cache_path.exists():
return {}
cache: Dict[str, str] = {}
with cache_path.open() as cache_file:
reader = csv.DictReader(cache_file)
for row in reader:
cache[row["part_num"]] = row["part_img_url"]
return cache
def persist_part_img_cache(cache_path: Path, cache: Dict[str, str]) -> None:
"""Persist le cache des URLs pour reprise après interruption."""
ensure_parent_dir(cache_path)
with cache_path.open("w", newline="") as cache_file:
writer = csv.DictWriter(cache_file, fieldnames=["part_num", "part_img_url"])
writer.writeheader()
for part_num, url in sorted(cache.items()):
writer.writerow({"part_num": part_num, "part_img_url": url})
def build_part_img_lookup(
part_numbers: Iterable[str],
fetcher: Callable[[str], str],
cache_path: Path | None = None,
existing_cache: Dict[str, str] | None = None,
delay_seconds: float = 1.6,
) -> Dict[str, str]:
"""Construit un index part_num -> URL d'image en espaçant les requêtes."""
cache = dict(existing_cache or {})
unique_parts = sorted(set(part_numbers))
for part_num in unique_parts:
if part_num in cache:
continue
cache[part_num] = fetcher(part_num)
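        # Persist the cache after every fetch so an interrupted run can resume where it left off.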
if cache_path is not None:
persist_part_img_cache(cache_path, cache)
time.sleep(delay_seconds)
return cache
def add_part_img_urls(minifigs_rows: Iterable[dict], part_img_lookup: Dict[str, str]) -> List[dict]:
"""Ajoute part_img_url aux lignes minifigs_by_set."""
enriched: List[dict] = []
for row in minifigs_rows:
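        # Keep any URL already present in the row; only fill missing ones from the lookup.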
existing = row.get("part_img_url", "").strip()
part_img_url = existing if existing != "" else part_img_lookup[row["part_num"]]
enriched.append(
{
"set_num": row["set_num"],
"part_num": row["part_num"],
"known_character": row["known_character"],
"fig_num": row["fig_num"],
"gender": row["gender"],
"part_img_url": part_img_url,
}
)
return enriched
def write_minifigs_by_set_with_images(destination_path: Path, rows: Sequence[dict]) -> None:
"""Réécrit le CSV minifigs_by_set avec la colonne part_img_url."""
ensure_parent_dir(destination_path)
fieldnames = ["set_num", "part_num", "known_character", "fig_num", "gender", "part_img_url"]
with destination_path.open("w", newline="") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for row in rows:
writer.writerow(row)
def sanitize_name(raw_name: str) -> str:
"""Nettoie un nom pour construire un chemin de fichier sûr."""
cleaned = re.sub(r"[^A-Za-z0-9]+", "_", raw_name).strip("_")
if cleaned == "":
return "Unknown"
return cleaned
def build_download_plan(
sets_rows: Iterable[dict],
minifigs_rows: Iterable[dict],
minifigs_catalog: Dict[str, dict],
base_dir: Path,
) -> List[dict]:
"""Construit la liste des fichiers à télécharger (sets, minifigs, têtes)."""
plan: List[dict] = []
sets_list = list(sets_rows)
set_ids: Dict[str, str] = {row["set_num"]: row["set_id"] for row in sets_list}
for set_row in sets_list:
set_dir = base_dir / set_row["set_id"]
plan.append({"url": set_row["img_url"], "path": set_dir / "set.jpg"})
for row in minifigs_rows:
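        # Background characters ("figurant") do not get a per-character folder; skip them.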
if (row.get("known_character") or "").strip().lower() == "figurant":
continue
set_dir = base_dir / set_ids[row["set_num"]]
character_dir = set_dir / sanitize_name(row["known_character"] or "Unknown")
minifig = minifigs_catalog[row["fig_num"]]
plan.append({"url": minifig["img_url"], "path": character_dir / "minifig.jpg"})
plan.append({"url": row["part_img_url"], "path": character_dir / "head.jpg"})
return plan
def download_binary(url: str, destination_path: Path, session: requests.Session) -> bool:
"""Télécharge un binaire vers un chemin donné. Retourne False si 404."""
ensure_parent_dir(destination_path)
response = session.get(url, stream=True)
if response.status_code == 404:
return False
response.raise_for_status()
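    # Stream the response body to disk in chunks to keep memory usage low.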
with destination_path.open("wb") as target_file:
for chunk in response.iter_content(chunk_size=8192):
target_file.write(chunk)
return True
def download_resources(
plan: Iterable[dict],
downloader: Callable[[str, Path], bool],
delay_seconds: float = 0.35,
log_path: Path | None = None,
) -> None:
"""Exécute les téléchargements en espaçant les requêtes et journalise les statuts."""
cache: Dict[str, Path] = {}
log_rows: List[dict] = []
for item in plan:
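        # Files already on disk are kept as-is; no request is issued for them.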
if item["path"].exists():
time.sleep(delay_seconds)
continue
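        # The same URL may appear for several sets: copy the first download instead of re-fetching it.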
if item["url"] in cache and cache[item["url"]].exists():
ensure_parent_dir(item["path"])
shutil.copy2(cache[item["url"]], item["path"])
else:
success = downloader(item["url"], item["path"])
if success:
cache[item["url"]] = item["path"]
else:
log_rows.append({"url": item["url"], "path": str(item["path"]), "status": "missing"})
time.sleep(delay_seconds)
if log_path is not None:
ensure_parent_dir(log_path)
with log_path.open("w", newline="") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=["url", "path", "status"])
writer.writeheader()
for row in log_rows:
writer.writerow(row)

scripts/download_resources.py Normal file

@@ -0,0 +1,72 @@
"""Télécharge les ressources (sets, minifigs, têtes) pour les sets filtrés."""
import os
from pathlib import Path
import requests
from dotenv import load_dotenv
from lib.rebrickable.resources import (
add_part_img_urls,
build_download_plan,
build_part_img_lookup,
download_binary,
download_resources,
fetch_part_img_url,
load_minifigs_by_set,
load_minifigs_catalog,
load_sets_enriched,
load_part_img_cache,
persist_part_img_cache,
write_minifigs_by_set_with_images,
)
SETS_PATH = Path("data/intermediate/sets_enriched.csv")
MINIFIGS_BY_SET_PATH = Path("data/intermediate/minifigs_by_set.csv")
MINIFIGS_CATALOG_PATH = Path("data/raw/minifigs.csv")
RESOURCES_DIR = Path("figures/rebrickable")
REQUEST_DELAY_SECONDS_IMAGES = 0.35
PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
DOWNLOAD_LOG_PATH = Path("data/intermediate/resources_download_log.csv")
def main() -> None:
"""Construit les URLs manquantes et télécharge les images associées."""
load_dotenv()
token = os.environ["REBRICKABLE_TOKEN"]
session = requests.Session()
sets = load_sets_enriched(SETS_PATH)
minifigs_by_set = load_minifigs_by_set(MINIFIGS_BY_SET_PATH)
minifigs_catalog = load_minifigs_catalog(MINIFIGS_CATALOG_PATH)
cache = load_part_img_cache(PART_IMG_CACHE_PATH)
missing_part_numbers = {
row["part_num"]
for row in minifigs_by_set
if row.get("part_img_url", "").strip() == "" and row["part_num"] not in cache
}
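    # Query the API only for part numbers missing from both the CSV and the on-disk cache.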
part_img_lookup = build_part_img_lookup(
missing_part_numbers,
fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
cache_path=PART_IMG_CACHE_PATH,
existing_cache=cache,
)
if cache:
part_img_lookup.update(cache)
persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)
minifigs_with_imgs = add_part_img_urls(minifigs_by_set, part_img_lookup)
write_minifigs_by_set_with_images(MINIFIGS_BY_SET_PATH, minifigs_with_imgs)
plan = build_download_plan(sets, minifigs_with_imgs, minifigs_catalog, RESOURCES_DIR)
download_resources(
plan,
downloader=lambda url, path: download_binary(url, path, session),
delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
log_path=DOWNLOAD_LOG_PATH,
)
if __name__ == "__main__":
main()

tests/test_resources.py Normal file

@@ -0,0 +1,108 @@
"""Tests des outils de téléchargement de ressources."""
from pathlib import Path
from lib.rebrickable.resources import (
add_part_img_urls,
build_download_plan,
build_part_img_lookup,
download_resources,
sanitize_name,
write_minifigs_by_set_with_images,
)
from lib.filesystem import ensure_parent_dir
from lib.rebrickable.stats import read_rows
def test_build_part_img_lookup_calls_fetcher_once_per_part() -> None:
"""Construit un index en appelant le fetcher sur les références uniques."""
calls: list[str] = []
def fetcher(part_num: str) -> str:
calls.append(part_num)
return f"url-{part_num}"
lookup = build_part_img_lookup(["p1", "p2", "p1"], fetcher, delay_seconds=0)
assert lookup == {"p1": "url-p1", "p2": "url-p2"}
assert calls == ["p1", "p2"]
def test_add_part_img_urls_and_write(tmp_path: Path) -> None:
"""Ajoute les URLs de tête et réécrit minifigs_by_set."""
rows = [
{"set_num": "123-1", "part_num": "p1", "known_character": "Alice", "fig_num": "f1", "gender": "female"},
]
lookup = {"p1": "http://img/p1.jpg"}
enriched = add_part_img_urls(rows, lookup)
destination = tmp_path / "minifigs_by_set.csv"
write_minifigs_by_set_with_images(destination, enriched)
assert read_rows(destination) == [
{
"set_num": "123-1",
"part_num": "p1",
"known_character": "Alice",
"fig_num": "f1",
"gender": "female",
"part_img_url": "http://img/p1.jpg",
}
]
def test_build_download_plan_and_download(tmp_path: Path) -> None:
"""Construit le plan et télécharge les binaires via un downloader stub."""
sets_rows = [
{"set_num": "123-1", "set_id": "123", "img_url": "http://set.img", "name": "A", "year": "2020"},
]
minifigs_rows = [
{"set_num": "123-1", "part_num": "p1", "known_character": "Bob", "fig_num": "fig-1", "gender": "male", "part_img_url": "http://head.img"}
]
minifigs_catalog = {"fig-1": {"img_url": "http://fig.img"}}
base_dir = tmp_path / "resources"
plan = build_download_plan(sets_rows, minifigs_rows, minifigs_catalog, base_dir)
downloaded: list[tuple[str, Path]] = []
def downloader(url: str, path: Path) -> bool:
downloaded.append((url, path))
path.parent.mkdir(parents=True, exist_ok=True)
path.write_bytes(b"data")
return True
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
assert downloaded == [
("http://set.img", base_dir / "123" / "set.jpg"),
("http://fig.img", base_dir / "123" / "Bob" / "minifig.jpg"),
("http://head.img", base_dir / "123" / "Bob" / "head.jpg"),
]
assert (base_dir / "123" / "Bob" / "head.jpg").exists()
def test_download_resources_duplicates_from_cache(tmp_path: Path) -> None:
"""Duplique les fichiers déjà téléchargés pour d'autres sets."""
plan = [
{"url": "http://same.img", "path": tmp_path / "resources" / "111" / "set.jpg"},
{"url": "http://same.img", "path": tmp_path / "resources" / "222" / "set.jpg"},
]
downloads: list[tuple[str, Path]] = []
def downloader(url: str, path: Path) -> bool:
downloads.append((url, path))
ensure_parent_dir(path)
path.write_bytes(b"img")
return True
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
assert downloads == [("http://same.img", tmp_path / "resources" / "111" / "set.jpg")]
assert (tmp_path / "resources" / "222" / "set.jpg").exists()
def test_sanitize_name_handles_special_chars() -> None:
"""Nettoie les noms en enlevant les caractères spéciaux."""
assert sanitize_name("Owen Grady") == "Owen_Grady"
assert sanitize_name("Kayla-Watts!") == "Kayla_Watts"
assert sanitize_name("") == "Unknown"