You've already forked etude_lego_jurassic_world
Rend le téléchargement des ressources plus patient et cache
This commit is contained in:
72
scripts/download_resources.py
Normal file
72
scripts/download_resources.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Télécharge les ressources (sets, minifigs, têtes) pour les sets filtrés."""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from lib.rebrickable.resources import (
|
||||
add_part_img_urls,
|
||||
build_download_plan,
|
||||
build_part_img_lookup,
|
||||
download_binary,
|
||||
download_resources,
|
||||
fetch_part_img_url,
|
||||
load_minifigs_by_set,
|
||||
load_minifigs_catalog,
|
||||
load_sets_enriched,
|
||||
load_part_img_cache,
|
||||
persist_part_img_cache,
|
||||
write_minifigs_by_set_with_images,
|
||||
)
|
||||
|
||||
|
||||
# CSV inputs loaded by main().
SETS_PATH = Path("data/intermediate/sets_enriched.csv")
# Loaded by main() and also passed to write_minifigs_by_set_with_images,
# i.e. the same path is used for reading and for writing the enriched rows.
MINIFIGS_BY_SET_PATH = Path("data/intermediate/minifigs_by_set.csv")
MINIFIGS_CATALOG_PATH = Path("data/raw/minifigs.csv")

# Directory handed to build_download_plan as the resources root.
RESOURCES_DIR = Path("figures/rebrickable")

# Passed as ``delay_seconds`` to download_resources.
# NOTE(review): presumably the pause between two image requests — confirm
# against lib.rebrickable.resources.download_resources.
REQUEST_DELAY_SECONDS_IMAGES = 0.35

# Cache of part image URLs: loaded at start-up and persisted after the lookup.
PART_IMG_CACHE_PATH = Path("data/intermediate/part_img_cache.csv")
# Passed as ``log_path`` to download_resources.
DOWNLOAD_LOG_PATH = Path("data/intermediate/resources_download_log.csv")
|
||||
|
||||
|
||||
def main() -> None:
    """Resolve missing part image URLs, then download the planned resources.

    Steps, in order:

    1. Load ``.env`` and read the ``REBRICKABLE_TOKEN`` environment variable.
    2. Load the enriched sets, the minifigs-by-set rows, the minifigs catalog
       and the part image cache from their module-level paths.
    3. Collect the part numbers whose ``part_img_url`` is blank and that are
       not already in the cache, and pass them to ``build_part_img_lookup``.
    4. Merge the cache into the resulting lookup, persist it, add the image
       URLs to the minifigs-by-set rows and write them back to
       ``MINIFIGS_BY_SET_PATH``.
    5. Build the download plan and hand it to ``download_resources``.

    Raises:
        KeyError: If ``REBRICKABLE_TOKEN`` is not set in the environment.
    """
    load_dotenv()
    token = os.environ["REBRICKABLE_TOKEN"]

    sets = load_sets_enriched(SETS_PATH)
    minifigs_by_set = load_minifigs_by_set(MINIFIGS_BY_SET_PATH)
    minifigs_catalog = load_minifigs_catalog(MINIFIGS_CATALOG_PATH)
    cache = load_part_img_cache(PART_IMG_CACHE_PATH)

    # Only parts with no URL in the row AND no cache entry need a lookup.
    # ``or ""`` also covers a key that is present with a None value, which
    # ``row.get(key, "")`` alone would not (it would fail on ``.strip()``).
    missing_part_numbers = {
        row["part_num"]
        for row in minifigs_by_set
        if not (row.get("part_img_url") or "").strip()
        and row["part_num"] not in cache
    }

    # The context manager closes the session (and its pooled connections)
    # even if a lookup or a download raises.
    with requests.Session() as session:
        part_img_lookup = build_part_img_lookup(
            missing_part_numbers,
            fetcher=lambda part_num: fetch_part_img_url(part_num, token, session),
            cache_path=PART_IMG_CACHE_PATH,
            existing_cache=cache,
        )
        # Make sure every previously cached entry is part of what gets
        # persisted and applied below.
        if cache:
            part_img_lookup.update(cache)
        persist_part_img_cache(PART_IMG_CACHE_PATH, part_img_lookup)

        minifigs_with_imgs = add_part_img_urls(minifigs_by_set, part_img_lookup)
        write_minifigs_by_set_with_images(MINIFIGS_BY_SET_PATH, minifigs_with_imgs)

        plan = build_download_plan(
            sets, minifigs_with_imgs, minifigs_catalog, RESOURCES_DIR
        )
        download_resources(
            plan,
            downloader=lambda url, path: download_binary(url, path, session),
            delay_seconds=REQUEST_DELAY_SECONDS_IMAGES,
            log_path=DOWNLOAD_LOG_PATH,
        )
|
||||
|
||||
|
||||
# Run the download pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user