Make resource downloads more patient and cached
parent 41798ea3df
commit fd5db10693
README.md (+14)
@@ -327,3 +327,17 @@ The computation reads `data/intermediate/minifigs_by_set.csv`, `data/raw/parts.csv` and `
- `data/intermediate/minifig_head_faces_by_character.csv`: volume and share per character.

The visuals `figures/step30/minifig_head_faces_timeline.png`, `figures/step30/minifig_head_faces_top_sets.png` and `figures/step30/minifig_head_faces_characters.png` show, respectively, the yearly trajectory, the top sets involved, and the characters with dual-face heads.

### Step 31: downloading visual resources (sets, minifigs, heads)

1. Set `REBRICKABLE_TOKEN` in `.env` (Rebrickable API key).
2. `source .venv/bin/activate`
3. `python -m scripts.download_resources`
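
For step 1, `.env` needs a single line; the value below is a placeholder for your own API key:

```
REBRICKABLE_TOKEN=your_rebrickable_api_key
```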

The script reads `data/intermediate/sets_enriched.csv`, `data/intermediate/minifigs_by_set.csv` and `data/raw/minifigs.csv`. It fetches head image URLs from the Rebrickable API endpoint `/api/v3/lego/parts/{part_num}/`, adds a `part_img_url` column to `data/intermediate/minifigs_by_set.csv`, and then downloads (a sample layout is sketched after this list):
- `resources/{set_id}/set.jpg`: the set visual (`img_url`).
- `resources/{set_id}/{known_character}/minifig.jpg`: the full minifig visual.
- `resources/{set_id}/{known_character}/head.jpg`: the matching head visual.
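
For instance, a set with one named character would produce a layout like this sketch (the set id `1234` is hypothetical; `Owen_Grady` shows how `sanitize_name` rewrites a character name):

```
resources/
└── 1234/
    ├── set.jpg
    └── Owen_Grady/
        ├── minifig.jpg
        └── head.jpg
```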

API requests are deduplicated, spaced out (fair use) and cached in `data/intermediate/part_img_cache.csv` so an interrupted run can resume. Already-downloaded images are reused locally to avoid repeated requests.
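
The cache itself is a plain two-column CSV keyed by part number, matching the fieldnames in `persist_part_img_cache`; a sketch using the placeholder values from the tests:

```
part_num,part_img_url
p1,http://img/p1.jpg
```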

lib/rebrickable/resources.py (new file, +201)
@@ -0,0 +1,201 @@
"""Téléchargement des ressources (sets, minifigs, têtes) et enrichissement des URLs."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, Dict, Iterable, List, Sequence
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from lib.filesystem import ensure_parent_dir
|
||||||
|
from lib.rebrickable.stats import read_rows
|
||||||
|
|
||||||
|
|
||||||
|
def load_sets_enriched(path: Path) -> List[dict]:
|
||||||
|
"""Charge les sets enrichis pour accéder aux URLs d'images de set."""
|
||||||
|
return read_rows(path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_minifigs_by_set(path: Path) -> List[dict]:
|
||||||
|
"""Charge minifigs_by_set.csv en mémoire."""
|
||||||
|
return read_rows(path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_minifigs_catalog(path: Path) -> Dict[str, dict]:
|
||||||
|
"""Indexe les minifigs par identifiant."""
|
||||||
|
catalog: Dict[str, dict] = {}
|
||||||
|
with path.open() as csv_file:
|
||||||
|
reader = csv.DictReader(csv_file)
|
||||||
|
for row in reader:
|
||||||
|
catalog[row["fig_num"]] = row
|
||||||
|
return catalog
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_part_img_url(part_num: str, token: str, session: requests.Session) -> str:
|
||||||
|
"""Récupère l'URL d'image d'une pièce via l'API Rebrickable."""
|
||||||
|
retries = 0
|
||||||
|
backoff = 2.0
|
||||||
|
while True:
|
||||||
|
response = session.get(f"https://rebrickable.com/api/v3/lego/parts/{part_num}/", headers={"Authorization": f"key {token}"})
|
||||||
|
if response.status_code == 429:
|
||||||
|
time.sleep(backoff)
|
||||||
|
retries += 1
|
||||||
|
backoff = min(backoff * 1.5, 10.0)
|
||||||
|
if retries > 8:
|
||||||
|
response.raise_for_status()
|
||||||
|
continue
|
||||||
|
response.raise_for_status()
|
||||||
|
payload = response.json()
|
||||||
|
return payload["part_img_url"]
|
||||||
|
|
||||||
|
|
||||||
|
def load_part_img_cache(cache_path: Path) -> Dict[str, str]:
|
||||||
|
"""Charge le cache des URLs de têtes s'il existe."""
|
||||||
|
if not cache_path.exists():
|
||||||
|
return {}
|
||||||
|
cache: Dict[str, str] = {}
|
||||||
|
with cache_path.open() as cache_file:
|
||||||
|
reader = csv.DictReader(cache_file)
|
||||||
|
for row in reader:
|
||||||
|
cache[row["part_num"]] = row["part_img_url"]
|
||||||
|
return cache
|
||||||
|
|
||||||
|
|
||||||
|
def persist_part_img_cache(cache_path: Path, cache: Dict[str, str]) -> None:
|
||||||
|
"""Persist le cache des URLs pour reprise après interruption."""
|
||||||
|
ensure_parent_dir(cache_path)
|
||||||
|
with cache_path.open("w", newline="") as cache_file:
|
||||||
|
writer = csv.DictWriter(cache_file, fieldnames=["part_num", "part_img_url"])
|
||||||
|
writer.writeheader()
|
||||||
|
for part_num, url in sorted(cache.items()):
|
||||||
|
writer.writerow({"part_num": part_num, "part_img_url": url})
|
||||||
|
|
||||||
|
|
||||||
|
def build_part_img_lookup(
|
||||||
|
part_numbers: Iterable[str],
|
||||||
|
fetcher: Callable[[str], str],
|
||||||
|
cache_path: Path | None = None,
|
||||||
|
existing_cache: Dict[str, str] | None = None,
|
||||||
|
delay_seconds: float = 1.6,
|
||||||
|
) -> Dict[str, str]:
|
||||||
|
"""Construit un index part_num -> URL d'image en espaçant les requêtes."""
|
||||||
|
cache = dict(existing_cache or {})
|
||||||
|
unique_parts = sorted(set(part_numbers))
|
||||||
|
for part_num in unique_parts:
|
||||||
|
if part_num in cache:
|
||||||
|
continue
|
||||||
|
cache[part_num] = fetcher(part_num)
|
||||||
|
if cache_path is not None:
|
||||||
|
persist_part_img_cache(cache_path, cache)
|
||||||
|
time.sleep(delay_seconds)
|
||||||
|
return cache
|
||||||
|
|
||||||
|
|
||||||
|
def add_part_img_urls(minifigs_rows: Iterable[dict], part_img_lookup: Dict[str, str]) -> List[dict]:
|
||||||
|
"""Ajoute part_img_url aux lignes minifigs_by_set."""
|
||||||
|
enriched: List[dict] = []
|
||||||
|
for row in minifigs_rows:
|
||||||
|
existing = row.get("part_img_url", "").strip()
|
||||||
|
part_img_url = existing if existing != "" else part_img_lookup[row["part_num"]]
|
||||||
|
enriched.append(
|
||||||
|
{
|
||||||
|
"set_num": row["set_num"],
|
||||||
|
"part_num": row["part_num"],
|
||||||
|
"known_character": row["known_character"],
|
||||||
|
"fig_num": row["fig_num"],
|
||||||
|
"gender": row["gender"],
|
||||||
|
"part_img_url": part_img_url,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return enriched
|
||||||
|
|
||||||
|
|
||||||
|
def write_minifigs_by_set_with_images(destination_path: Path, rows: Sequence[dict]) -> None:
|
||||||
|
"""Réécrit le CSV minifigs_by_set avec la colonne part_img_url."""
|
||||||
|
ensure_parent_dir(destination_path)
|
||||||
|
fieldnames = ["set_num", "part_num", "known_character", "fig_num", "gender", "part_img_url"]
|
||||||
|
with destination_path.open("w", newline="") as csv_file:
|
||||||
|
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
|
||||||
|
writer.writeheader()
|
||||||
|
for row in rows:
|
||||||
|
writer.writerow(row)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_name(raw_name: str) -> str:
|
||||||
|
"""Nettoie un nom pour construire un chemin de fichier sûr."""
|
||||||
|
cleaned = re.sub(r"[^A-Za-z0-9]+", "_", raw_name).strip("_")
|
||||||
|
if cleaned == "":
|
||||||
|
return "Unknown"
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def build_download_plan(
|
||||||
|
sets_rows: Iterable[dict],
|
||||||
|
minifigs_rows: Iterable[dict],
|
||||||
|
minifigs_catalog: Dict[str, dict],
|
||||||
|
base_dir: Path,
|
||||||
|
) -> List[dict]:
|
||||||
|
"""Construit la liste des fichiers à télécharger (sets, minifigs, têtes)."""
|
||||||
|
plan: List[dict] = []
|
||||||
|
sets_list = list(sets_rows)
|
||||||
|
set_ids: Dict[str, str] = {row["set_num"]: row["set_id"] for row in sets_list}
|
||||||
|
for set_row in sets_list:
|
||||||
|
set_dir = base_dir / set_row["set_id"]
|
||||||
|
plan.append({"url": set_row["img_url"], "path": set_dir / "set.jpg"})
|
||||||
|
for row in minifigs_rows:
|
||||||
|
if (row.get("known_character") or "").strip().lower() == "figurant":
|
||||||
|
continue
|
||||||
|
set_dir = base_dir / set_ids[row["set_num"]]
|
||||||
|
character_dir = set_dir / sanitize_name(row["known_character"] or "Unknown")
|
||||||
|
minifig = minifigs_catalog[row["fig_num"]]
|
||||||
|
plan.append({"url": minifig["img_url"], "path": character_dir / "minifig.jpg"})
|
||||||
|
plan.append({"url": row["part_img_url"], "path": character_dir / "head.jpg"})
|
||||||
|
return plan
|
||||||
|
|
||||||
|
|
||||||
|
def download_binary(url: str, destination_path: Path, session: requests.Session) -> bool:
|
||||||
|
"""Télécharge un binaire vers un chemin donné. Retourne False si 404."""
|
||||||
|
ensure_parent_dir(destination_path)
|
||||||
|
response = session.get(url, stream=True)
|
||||||
|
if response.status_code == 404:
|
||||||
|
return False
|
||||||
|
response.raise_for_status()
|
||||||
|
with destination_path.open("wb") as target_file:
|
||||||
|
for chunk in response.iter_content(chunk_size=8192):
|
||||||
|
target_file.write(chunk)
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def download_resources(
|
||||||
|
plan: Iterable[dict],
|
||||||
|
downloader: Callable[[str, Path], bool],
|
||||||
|
delay_seconds: float = 0.35,
|
||||||
|
log_path: Path | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""Exécute les téléchargements en espaçant les requêtes et journalise les statuts."""
|
||||||
|
cache: Dict[str, Path] = {}
|
||||||
|
log_rows: List[dict] = []
|
||||||
|
for item in plan:
|
||||||
|
if item["path"].exists():
|
||||||
|
time.sleep(delay_seconds)
|
||||||
|
continue
|
||||||
|
if item["url"] in cache and cache[item["url"]].exists():
|
||||||
|
ensure_parent_dir(item["path"])
|
||||||
|
shutil.copy2(cache[item["url"]], item["path"])
|
||||||
|
else:
|
||||||
|
success = downloader(item["url"], item["path"])
|
||||||
|
if success:
|
||||||
|
cache[item["url"]] = item["path"]
|
||||||
|
else:
|
||||||
|
log_rows.append({"url": item["url"], "path": str(item["path"]), "status": "missing"})
|
||||||
|
time.sleep(delay_seconds)
|
||||||
|
if log_path is not None:
|
||||||
|
ensure_parent_dir(log_path)
|
||||||
|
with log_path.open("w", newline="") as csv_file:
|
||||||
|
writer = csv.DictWriter(csv_file, fieldnames=["url", "path", "status"])
|
||||||
|
writer.writeheader()
|
||||||
|
for row in log_rows:
|
||||||
|
writer.writerow(row)
|
||||||

scripts/download_resources.py (new file, +72)
@@ -0,0 +1,72 @@
"""Télécharge les ressources (sets, minifigs, têtes) pour les sets filtrés."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from lib.rebrickable.resources import (
|
||||||
|
add_part_img_urls,
|
||||||
|
build_download_plan,
|
||||||
|
build_part_img_lookup,
|
||||||
|
download_binary,
|
||||||
|
download_resources,
|
||||||
|
fetch_part_img_url,
|
||||||
|
load_minifigs_by_set,
|
||||||
|
load_minifigs_catalog,
|
||||||
|
load_sets_enriched,
|
||||||
|
load_part_img_cache,
|
||||||
|
persist_part_img_cache,
|
||||||
|
write_minifigs_by_set_with_images,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
SETS_PATH = Path("data/intermediate/sets_enriched.csv")
|
||||||
|
MINIFIGS_BY_SET_PATH = Path("data/intermediate/minifigs_by_set.csv")
|
||||||
|
MINIFIGS_CATALOG_PATH = Path("data/raw/minifigs.csv")
|
||||||
|
RESOURCES_DIR = Path("figures/rebrickable")
|
||||||
|
REQUEST_DELAY_SECONDS_IMAGES = 0.35
|
||||||
|
PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
|
||||||
|
DOWNLOAD_LOG_PATH = Path("data/intermediate/resources_download_log.csv")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
"""Construit les URLs manquantes et télécharge les images associées."""
|
||||||
|
load_dotenv()
|
||||||
|
token = os.environ["REBRICKABLE_TOKEN"]
|
||||||
|
session = requests.Session()
|
||||||
|
|
||||||
|
sets = load_sets_enriched(SETS_PATH)
|
||||||
|
minifigs_by_set = load_minifigs_by_set(MINIFIGS_BY_SET_PATH)
|
||||||
|
minifigs_catalog = load_minifigs_catalog(MINIFIGS_CATALOG_PATH)
|
||||||
|
cache = load_part_img_cache(PART_IMG_CACHE_PATH)
|
||||||
|
|
||||||
|
missing_part_numbers = {
|
||||||
|
row["part_num"]
|
||||||
|
for row in minifigs_by_set
|
||||||
|
if row.get("part_img_url", "").strip() == "" and row["part_num"] not in cache
|
||||||
|
}
|
||||||
|
part_img_lookup = build_part_img_lookup(
|
||||||
|
missing_part_numbers,
|
||||||
|
fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
|
||||||
|
cache_path=PART_IMG_CACHE_PATH,
|
||||||
|
existing_cache=cache,
|
||||||
|
)
|
||||||
|
if cache:
|
||||||
|
part_img_lookup.update(cache)
|
||||||
|
persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)
|
||||||
|
minifigs_with_imgs = add_part_img_urls(minifigs_by_set, part_img_lookup)
|
||||||
|
write_minifigs_by_set_with_images(MINIFIGS_BY_SET_PATH, minifigs_with_imgs)
|
||||||
|
|
||||||
|
plan = build_download_plan(sets, minifigs_with_imgs, minifigs_catalog, RESOURCES_DIR)
|
||||||
|
download_resources(
|
||||||
|
plan,
|
||||||
|
downloader=lambda url, path: download_binary(url, path, session),
|
||||||
|
delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
|
||||||
|
log_path=DOWNLOAD_LOG_PATH,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||

tests/test_resources.py (new file, +108)
@@ -0,0 +1,108 @@
"""Tests des outils de téléchargement de ressources."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from lib.rebrickable.resources import (
|
||||||
|
add_part_img_urls,
|
||||||
|
build_download_plan,
|
||||||
|
build_part_img_lookup,
|
||||||
|
download_resources,
|
||||||
|
sanitize_name,
|
||||||
|
write_minifigs_by_set_with_images,
|
||||||
|
)
|
||||||
|
from lib.filesystem import ensure_parent_dir
|
||||||
|
from lib.rebrickable.stats import read_rows
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_part_img_lookup_calls_fetcher_once_per_part() -> None:
|
||||||
|
"""Construit un index en appelant le fetcher sur les références uniques."""
|
||||||
|
calls: list[str] = []
|
||||||
|
|
||||||
|
def fetcher(part_num: str) -> str:
|
||||||
|
calls.append(part_num)
|
||||||
|
return f"url-{part_num}"
|
||||||
|
|
||||||
|
lookup = build_part_img_lookup(["p1", "p2", "p1"], fetcher, delay_seconds=0)
|
||||||
|
|
||||||
|
assert lookup == {"p1": "url-p1", "p2": "url-p2"}
|
||||||
|
assert calls == ["p1", "p2"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_part_img_urls_and_write(tmp_path: Path) -> None:
|
||||||
|
"""Ajoute les URLs de tête et réécrit minifigs_by_set."""
|
||||||
|
rows = [
|
||||||
|
{"set_num": "123-1", "part_num": "p1", "known_character": "Alice", "fig_num": "f1", "gender": "female"},
|
||||||
|
]
|
||||||
|
lookup = {"p1": "http://img/p1.jpg"}
|
||||||
|
|
||||||
|
enriched = add_part_img_urls(rows, lookup)
|
||||||
|
destination = tmp_path / "minifigs_by_set.csv"
|
||||||
|
write_minifigs_by_set_with_images(destination, enriched)
|
||||||
|
|
||||||
|
assert read_rows(destination) == [
|
||||||
|
{
|
||||||
|
"set_num": "123-1",
|
||||||
|
"part_num": "p1",
|
||||||
|
"known_character": "Alice",
|
||||||
|
"fig_num": "f1",
|
||||||
|
"gender": "female",
|
||||||
|
"part_img_url": "http://img/p1.jpg",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_download_plan_and_download(tmp_path: Path) -> None:
|
||||||
|
"""Construit le plan et télécharge les binaires via un downloader stub."""
|
||||||
|
sets_rows = [
|
||||||
|
{"set_num": "123-1", "set_id": "123", "img_url": "http://set.img", "name": "A", "year": "2020"},
|
||||||
|
]
|
||||||
|
minifigs_rows = [
|
||||||
|
{"set_num": "123-1", "part_num": "p1", "known_character": "Bob", "fig_num": "fig-1", "gender": "male", "part_img_url": "http://head.img"}
|
||||||
|
]
|
||||||
|
minifigs_catalog = {"fig-1": {"img_url": "http://fig.img"}}
|
||||||
|
base_dir = tmp_path / "resources"
|
||||||
|
|
||||||
|
plan = build_download_plan(sets_rows, minifigs_rows, minifigs_catalog, base_dir)
|
||||||
|
downloaded: list[tuple[str, Path]] = []
|
||||||
|
|
||||||
|
def downloader(url: str, path: Path) -> bool:
|
||||||
|
downloaded.append((url, path))
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
path.write_bytes(b"data")
|
||||||
|
return True
|
||||||
|
|
||||||
|
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
|
||||||
|
|
||||||
|
assert downloaded == [
|
||||||
|
("http://set.img", base_dir / "123" / "set.jpg"),
|
||||||
|
("http://fig.img", base_dir / "123" / "Bob" / "minifig.jpg"),
|
||||||
|
("http://head.img", base_dir / "123" / "Bob" / "head.jpg"),
|
||||||
|
]
|
||||||
|
assert (base_dir / "123" / "Bob" / "head.jpg").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_resources_duplicates_from_cache(tmp_path: Path) -> None:
|
||||||
|
"""Duplique les fichiers déjà téléchargés pour d'autres sets."""
|
||||||
|
plan = [
|
||||||
|
{"url": "http://same.img", "path": tmp_path / "resources" / "111" / "set.jpg"},
|
||||||
|
{"url": "http://same.img", "path": tmp_path / "resources" / "222" / "set.jpg"},
|
||||||
|
]
|
||||||
|
downloads: list[tuple[str, Path]] = []
|
||||||
|
|
||||||
|
def downloader(url: str, path: Path) -> bool:
|
||||||
|
downloads.append((url, path))
|
||||||
|
ensure_parent_dir(path)
|
||||||
|
path.write_bytes(b"img")
|
||||||
|
return True
|
||||||
|
|
||||||
|
download_resources(plan, downloader, delay_seconds=0, log_path=tmp_path / "log.csv")
|
||||||
|
|
||||||
|
assert downloads == [("http://same.img", tmp_path / "resources" / "111" / "set.jpg")]
|
||||||
|
assert (tmp_path / "resources" / "222" / "set.jpg").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_name_handles_special_chars() -> None:
|
||||||
|
"""Nettoie les noms en enlevant les caractères spéciaux."""
|
||||||
|
assert sanitize_name("Owen Grady") == "Owen_Grady"
|
||||||
|
assert sanitize_name("Kayla-Watts!") == "Kayla_Watts"
|
||||||
|
assert sanitize_name("") == "Unknown"
|
||||||