# etude_lego_jurassic_world/lib/rebrickable/minifig_characters.py

"""Agrégation des minifigs par personnage représenté."""

from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Set

from lib.rebrickable.stats import read_rows
from lib.filesystem import ensure_parent_dir
import csv


def load_minifigs_by_set(path: Path) -> List[dict]:
    """Load the rows of the minifigs-by-set CSV located at ``path``.

    The actual parsing is delegated to ``read_rows``; its result is returned
    unchanged.
    """
    loaded_rows = read_rows(path)
    return loaded_rows


def aggregate_by_character(rows: Iterable[dict]) -> List[dict]:
    """Count the distinct minifigs (unique ``fig_num``) of each character.

    Rows whose ``known_character`` or ``fig_num`` is empty once stripped are
    ignored. The result holds one ``{"known_character", "minifig_count"}``
    dict per character, ordered by decreasing count and then by name.
    """
    figures_per_name: Dict[str, set] = defaultdict(set)
    for entry in rows:
        name = entry["known_character"].strip()
        figure = entry["fig_num"].strip()
        if name and figure:
            # A set keeps a fig_num seen in several sets from counting twice.
            figures_per_name[name].add(figure)
    summary = [
        {"known_character": name, "minifig_count": len(figures)}
        for name, figures in figures_per_name.items()
    ]
    return sorted(
        summary,
        key=lambda item: (-item["minifig_count"], item["known_character"]),
    )


def write_character_counts(path: Path, rows: Sequence[dict]) -> None:
    """Write the per-character counts to ``path`` as CSV.

    The file gets a header line followed by one line per row, with the
    columns ``known_character`` and ``minifig_count``.
    ``ensure_parent_dir(path)`` is called before the file is opened.
    """
    columns = ["known_character", "minifig_count"]
    ensure_parent_dir(path)
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(rows)


def load_sets_enriched(path: Path) -> Dict[str, str]:
    """Build a ``set_num`` -> ``year`` lookup from the CSV at ``path``.

    Years are kept as the raw strings read from the file. When a ``set_num``
    appears on several rows, the last row wins.
    """
    with path.open() as handle:
        return {
            record["set_num"]: record["year"]
            for record in csv.DictReader(handle)
        }


def aggregate_presence_by_year(
    minifigs_rows: Iterable[dict],
    sets_years: Dict[str, str],
    excluded_characters: Sequence[str] | None = None,
) -> List[dict]:
    """Compte le nombre total de minifigs par personnage et par année (hors figurants)."""
    excluded = set(excluded_characters or [])
    counts: Dict[tuple[str, int], int] = defaultdict(int)
    years_all = {int(year) for year in sets_years.values()}
    characters_all: Set[str] = set()
    for row in minifigs_rows:
        character = row["known_character"].strip()
        fig_num = row["fig_num"].strip()
        if character == "" or fig_num == "":
            continue
        if character in excluded:
            continue
        year = sets_years.get(row["set_num"])
        if year is None:
            continue
        year_int = int(year)
        counts[(character, year_int)] += 1
        characters_all.add(character)
    years = sorted(years_all)
    characters = sorted(characters_all)
    results: List[dict] = []
    for character in characters:
        for year in years:
            count = counts.get((character, year), 0)
            results.append(
                {
                    "known_character": character,
                    "year": str(year),
                    "minifig_count": str(count),
                }
            )
    return results


def write_presence_by_year(path: Path, rows: Sequence[dict]) -> None:
    """Write the character/year table to ``path`` as CSV.

    The file gets a header line followed by one line per row, with the
    columns ``known_character``, ``year`` and ``minifig_count``.
    ``ensure_parent_dir(path)`` is called before the file is opened.
    """
    columns = ["known_character", "year", "minifig_count"]
    ensure_parent_dir(path)
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(rows)


def aggregate_character_spans(
    minifigs_rows: Iterable[dict],
    sets_years: Dict[str, str],
    excluded_characters: Sequence[str] | None = None,
) -> List[dict]:
    """Calcule la période d'apparition de chaque personnage (bornes min/max des années observées)."""
    excluded = set(excluded_characters or [])
    spans: Dict[str, Dict[str, int]] = {}
    total_counts: Dict[str, int] = defaultdict(int)
    for row in minifigs_rows:
        character = row["known_character"].strip()
        fig_num = row["fig_num"].strip()
        if character == "" or fig_num == "":
            continue
        if character in excluded:
            continue
        year = sets_years.get(row["set_num"])
        if year is None:
            continue
        year_int = int(year)
        total_counts[character] += 1
        current = spans.get(character)
        if current is None:
            spans[character] = {"start": year_int, "end": year_int}
        else:
            spans[character]["start"] = min(current["start"], year_int)
            spans[character]["end"] = max(current["end"], year_int)
    results: List[dict] = []
    for character, bounds in spans.items():
        results.append(
            {
                "known_character": character,
                "start_year": str(bounds["start"]),
                "end_year": str(bounds["end"]),
                "total_minifigs": str(total_counts[character]),
            }
        )
    results.sort(key=lambda r: (int(r["start_year"]), int(r["end_year"]), r["known_character"]))
    return results


def write_character_spans(path: Path, rows: Sequence[dict]) -> None:
    """Write the per-character year spans to ``path`` as CSV.

    The file gets a header line followed by one line per row, with the
    columns ``known_character``, ``start_year``, ``end_year`` and
    ``total_minifigs``. ``ensure_parent_dir(path)`` is called before the
    file is opened.
    """
    columns = ["known_character", "start_year", "end_year", "total_minifigs"]
    ensure_parent_dir(path)
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(rows)