Fiabiliser le téléchargement des autocollants sans URL
This commit is contained in:
parent
241f48d48f
commit
f757bfa6bf
76
scripts/download_sticker_resources.py
Normal file
76
scripts/download_sticker_resources.py
Normal file
@ -0,0 +1,76 @@
|
||||
"""Télécharge les images des planches d'autocollants des sets filtrés."""
|
||||
|
||||
import csv
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from lib.filesystem import ensure_parent_dir
|
||||
from lib.rebrickable.resources import (
|
||||
build_part_img_lookup,
|
||||
download_binary,
|
||||
download_resources,
|
||||
fetch_part_img_url,
|
||||
load_part_img_cache,
|
||||
persist_part_img_cache,
|
||||
)
|
||||
from lib.rebrickable.stats import read_rows
|
||||
|
||||
|
||||
# Input CSV listing the filtered sticker parts (rows carry at least
# "part_num" and "set_id" — see their use in main()).
STICKER_PARTS_PATH = Path("data/intermediate/sticker_parts.csv")
# Root directory under which sticker-sheet images are saved
# (layout: <set_id>/stickers/<part_num>.jpg).
RESOURCES_DIR = Path("figures/rebrickable")
# Persistent part_num -> image-URL cache, to avoid repeated API lookups.
PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
# CSV log of the download run (columns: url, path, status).
DOWNLOAD_LOG_PATH = Path("data/intermediate/sticker_download_log.csv")
# Throttling delay (seconds) between image downloads.
REQUEST_DELAY_SECONDS_IMAGES = 0.35
# Throttling delay (seconds) between URL-lookup API calls.
REQUEST_DELAY_SECONDS_LOOKUP = 0.6
|
||||
|
||||
|
||||
def main() -> None:
    """Build the missing image URLs and download the sticker sheets.

    Reads the filtered sticker-part rows, resolves each part number to an
    image URL via the Rebrickable API (backed by a persistent on-disk
    cache), downloads every sheet that has a usable URL, and records the
    parts whose URL could not be resolved in the download log.
    """
    load_dotenv()
    # Fail fast (KeyError) if the API token is absent from the environment.
    token = os.environ["REBRICKABLE_TOKEN"]
    session = requests.Session()

    stickers = read_rows(STICKER_PARTS_PATH)
    cache = load_part_img_cache(PART_IMG_CACHE_PATH)
    # Resolve every distinct part number to an image URL, throttled between
    # API calls; the existing cache is supplied so known parts are skipped.
    part_img_lookup = build_part_img_lookup(
        {row["part_num"] for row in stickers},
        fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
        cache_path=PART_IMG_CACHE_PATH,
        existing_cache=cache,
        delay_seconds=REQUEST_DELAY_SECONDS_LOOKUP,
    )
    # NOTE(review): the cache was already passed as existing_cache above;
    # re-applying it here makes cached values win over freshly fetched ones.
    # Confirm this is intended — a stale/empty cache entry could mask a
    # newly fetched URL.
    if cache:
        part_img_lookup.update(cache)
    persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)

    # Split the rows into a download plan (usable http(s) URL) and a log of
    # parts with no resolvable URL.
    plan = []
    missing_log = []
    for row in stickers:
        url = part_img_lookup.get(row["part_num"])
        path = RESOURCES_DIR / row["set_id"] / "stickers" / f"{row['part_num']}.jpg"
        if not url or not str(url).startswith("http"):
            missing_log.append({"url": url or "", "path": str(path), "status": "missing_url"})
            continue
        plan.append({"url": url, "path": path})

    # When some URLs are missing, log_path is disabled here so that the
    # manual write below owns DOWNLOAD_LOG_PATH exclusively.
    # NOTE(review): in that case successful downloads are therefore not
    # logged at all — verify this trade-off is acceptable.
    download_resources(
        plan,
        downloader=lambda url, path: download_binary(url, path, session),
        delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
        log_path=DOWNLOAD_LOG_PATH if not missing_log else None,
    )

    # Persist the parts that could not be resolved to an image URL.
    if missing_log:
        ensure_parent_dir(DOWNLOAD_LOG_PATH)
        with DOWNLOAD_LOG_PATH.open("w", newline="") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=["url", "path", "status"])
            writer.writeheader()
            for row in missing_log:
                writer.writerow(row)
|
||||
|
||||
|
||||
# Script entry point: run the download pipeline when executed directly.
if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user