diff --git a/README.md b/README.md index 6f73cba..a1667da 100644 --- a/README.md +++ b/README.md @@ -261,3 +261,10 @@ Le script lit `data/intermediate/minifigs_by_set.csv` et `data/intermediate/sets 2. `python -m scripts.plot_minifig_gender_share` Le script lit `data/intermediate/minifigs_by_set.csv`, agrège le nombre de minifigs distinctes par genre (basé sur `config/known_character_genders.csv`), écrit `data/intermediate/minifig_gender_counts.csv`, puis trace `figures/step25/minifig_gender_share.png` (donut indiquant la part des personnages féminins, masculins ou inconnus). + +### Étape 26 : corrélation pièces / minifigs + +1. `source .venv/bin/activate` +2. `python -m scripts.plot_minifig_parts_correlation` + +Le script lit `data/intermediate/minifig_counts_by_set.csv`, `data/intermediate/sets_enriched.csv`, `data/raw/sets.csv`, `data/raw/inventories.csv` et `data/raw/inventory_minifigs.csv`, produit `data/intermediate/minifig_parts_correlation.csv` (pièces vs minifigs pour le catalogue global et les thèmes filtrés), puis trace `figures/step26/minifig_parts_correlation.png` en superposant les nuages de points et leurs tendances linéaires. 
diff --git a/lib/plots/minifig_parts_correlation.py b/lib/plots/minifig_parts_correlation.py
new file mode 100644
index 0000000..4e30716
--- /dev/null
+++ b/lib/plots/minifig_parts_correlation.py
@@ -0,0 +1,117 @@
+"""Correlation chart between part counts and minifig counts per set."""
+
+from pathlib import Path
+from typing import Iterable, Tuple
+
+import matplotlib.pyplot as plt
+
+from lib.filesystem import ensure_parent_dir
+from lib.rebrickable.stats import read_rows
+
+
+def load_points(path: Path, scope: str) -> Tuple[list[int], list[int]]:
+    """Load the (num_parts, minifig_count) points recorded for one scope.
+
+    Args:
+        path: correlation CSV with `scope`, `num_parts` and `minifig_count` columns.
+        scope: value of the `scope` column to keep.
+
+    Returns:
+        Two parallel lists, the x values (num_parts) and the y values
+        (minifig_count), in the order `read_rows` yields the matching rows.
+    """
+    xs: list[int] = []
+    ys: list[int] = []
+    for row in read_rows(path):
+        if row["scope"] != scope:
+            continue
+        xs.append(int(row["num_parts"]))
+        ys.append(int(row["minifig_count"]))
+    return xs, ys
+
+
+def compute_regression(points: Iterable[Tuple[int, int]]) -> Tuple[float, float]:
+    """Fit a simple least-squares line and return (slope, intercept).
+
+    Args:
+        points: (x, y) pairs; one-shot iterables such as generators are accepted.
+
+    Returns:
+        The slope and intercept of the fitted line. The slope is 0.0 when all
+        x values are equal, in which case the intercept is the mean of y.
+
+    Raises:
+        ValueError: if `points` is empty.
+    """
+    # Materialise once: the pairs are traversed several times below, and a
+    # generator would be exhausted after the first pass.
+    pairs = list(points)
+    if not pairs:
+        raise ValueError("compute_regression requires at least one point")
+    n = len(pairs)
+    mean_x = sum(x for x, _ in pairs) / n
+    mean_y = sum(y for _, y in pairs) / n
+    numerator = 0.0
+    denominator = 0.0
+    for x, y in pairs:
+        dx = x - mean_x
+        numerator += dx * (y - mean_y)
+        denominator += dx * dx
+    slope = numerator / denominator if denominator != 0 else 0.0
+    intercept = mean_y - slope * mean_x
+    return slope, intercept
+
+
+def plot_minifig_parts_correlation(correlation_path: Path, destination_path: Path) -> None:
+    """Plot parts vs. minifigs for the filtered sets against the whole catalog.
+
+    Nothing is drawn or written when either scope has no point in the CSV.
+
+    Args:
+        correlation_path: CSV holding the "filtered" and "catalog" scopes.
+        destination_path: image file to write.
+    """
+    filtered_x, filtered_y = load_points(correlation_path, "filtered")
+    catalog_x, catalog_y = load_points(correlation_path, "catalog")
+    filtered_points = list(zip(filtered_x, filtered_y))
+    catalog_points = list(zip(catalog_x, catalog_y))
+    if not filtered_points or not catalog_points:
+        return
+    filtered_slope, filtered_intercept = compute_regression(filtered_points)
+    catalog_slope, catalog_intercept = compute_regression(catalog_points)
+    # Both trend lines are drawn over the combined x range of the two scopes.
+    x_min = min(min(filtered_x), min(catalog_x))
+    x_max = max(max(filtered_x), max(catalog_x))
+
+    fig, ax = plt.subplots(figsize=(10, 7))
+    try:
+        ax.scatter(catalog_x, catalog_y, color="#bbbbbb", alpha=0.25, s=18, label="Catalogue global")
+        ax.scatter(filtered_x, filtered_y, color="#1f77b4", alpha=0.8, s=28, label="Thèmes filtrés")
+        ax.plot(
+            [x_min, x_max],
+            [catalog_slope * x_min + catalog_intercept, catalog_slope * x_max + catalog_intercept],
+            color="#555555",
+            linestyle="--",
+            linewidth=1.4,
+            label=f"Tendance globale (pente {catalog_slope:.3f})",
+        )
+        ax.plot(
+            [x_min, x_max],
+            [filtered_slope * x_min + filtered_intercept, filtered_slope * x_max + filtered_intercept],
+            color="#1f77b4",
+            linestyle="-",
+            linewidth=1.6,
+            label=f"Tendance thèmes filtrés (pente {filtered_slope:.3f})",
+        )
+        ax.set_xlabel("Nombre de pièces du set")
+        ax.set_ylabel("Nombre de minifigs")
+        ax.set_title("Corrélation pièces / minifigs")
+        ax.grid(True, linestyle="--", alpha=0.3)
+        ax.legend(loc="upper left")
+
+        ensure_parent_dir(destination_path)
+        fig.tight_layout()
+        fig.savefig(destination_path, dpi=160)
+    finally:
+        # Release the figure even when drawing or saving fails.
+        plt.close(fig)
diff --git a/lib/rebrickable/minifig_parts_correlation.py b/lib/rebrickable/minifig_parts_correlation.py
new file mode 100644
index 0000000..4a2ad05
--- /dev/null
+++ b/lib/rebrickable/minifig_parts_correlation.py
@@ -0,0 +1,104 @@
+"""Prepare the parts/minifigs correlation data per set."""
+
+import csv
+from pathlib import Path
+from typing import Dict, List, Sequence
+
+from lib.filesystem import ensure_parent_dir
+from lib.rebrickable.parts_inventory import index_inventory_minifigs_by_inventory, select_latest_inventories
+from lib.rebrickable.stats import read_rows
+
+
+def _index_int_column_by_set(path: Path, column: str) -> Dict[str, int]:
+    """Map each `set_num` of a CSV to the integer value of `column`."""
+    # newline="" is what the csv module expects for the files it parses.
+    with path.open(newline="") as csv_file:
+        return {row["set_num"]: int(row[column]) for row in csv.DictReader(csv_file)}
+
+
+def load_minifig_counts_by_set(path: Path) -> Dict[str, int]:
+    """Index the `minifig_count` column by `set_num`."""
+    return _index_int_column_by_set(path, "minifig_count")
+
+
+def load_num_parts(path: Path) -> Dict[str, int]:
+    """Index the `num_parts` column by `set_num`."""
+    return _index_int_column_by_set(path, "num_parts")
+
+
+def build_global_minifig_counts(inventories_path: Path, inventory_minifigs_path: Path) -> Dict[str, int]:
+    """Compute the number of minifigs per set for the whole catalog.
+
+    Sums the `quantity` of the minifig rows attached to the inventory that
+    `select_latest_inventories` returns for each set; a set without minifig
+    rows gets 0.
+    """
+    inventories = select_latest_inventories(inventories_path)
+    minifigs_by_inventory = index_inventory_minifigs_by_inventory(inventory_minifigs_path)
+    counts: Dict[str, int] = {}
+    for set_num, inventory in inventories.items():
+        minifig_rows = minifigs_by_inventory.get(inventory["id"], [])
+        counts[set_num] = sum(int(row["quantity"]) for row in minifig_rows)
+    return counts
+
+
+def build_correlation_rows(
+    filtered_counts_path: Path,
+    filtered_sets_path: Path,
+    all_sets_path: Path,
+    inventories_path: Path,
+    inventory_minifigs_path: Path,
+) -> List[dict]:
+    """Build the parts/minifigs rows for the filtered sets, then the catalog.
+
+    Every row carries `scope` ("filtered" or "catalog"), `set_num`, `num_parts`
+    and `minifig_count`, all as strings. Catalog sets whose part count is not
+    positive are skipped; catalog sets without minifig data count 0 minifigs.
+
+    Raises:
+        KeyError: if a set listed in `filtered_counts_path` is missing from
+            `filtered_sets_path`.
+    """
+    filtered_counts = load_minifig_counts_by_set(filtered_counts_path)
+    filtered_parts = load_num_parts(filtered_sets_path)
+    rows: List[dict] = []
+    for set_num, minifig_count in filtered_counts.items():
+        num_parts = filtered_parts[set_num]
+        rows.append(
+            {
+                "scope": "filtered",
+                "set_num": set_num,
+                "num_parts": str(num_parts),
+                "minifig_count": str(minifig_count),
+            }
+        )
+    global_parts = load_num_parts(all_sets_path)
+    global_minifigs = build_global_minifig_counts(inventories_path, inventory_minifigs_path)
+    for set_num, num_parts in global_parts.items():
+        if num_parts <= 0:
+            continue
+        minifig_count = global_minifigs.get(set_num, 0)
+        rows.append(
+            {
+                "scope": "catalog",
+                "set_num": set_num,
+                "num_parts": str(num_parts),
+                "minifig_count": str(minifig_count),
+            }
+        )
+    return rows
+
+
+def write_correlation_rows(path: Path, rows: Sequence[dict]) -> None:
+    """Write the correlation rows to `path` as CSV, header included."""
+    ensure_parent_dir(path)
+    fieldnames = ["scope", "set_num", "num_parts", "minifig_count"]
+    with path.open("w", newline="") as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def load_correlation_rows(path: Path) -> List[dict]:
+    """Load the parts/minifigs correlation CSV through `read_rows`."""
+    return read_rows(path)
diff --git a/scripts/plot_minifig_parts_correlation.py b/scripts/plot_minifig_parts_correlation.py
new file mode 100644
index 0000000..d0828d3
--- /dev/null
+++ b/scripts/plot_minifig_parts_correlation.py
@@ -0,0 +1,32 @@
+"""Plot the correlation between part count and minifig count per set."""
+
+from pathlib import Path
+
+from lib.plots.minifig_parts_correlation import plot_minifig_parts_correlation
+from lib.rebrickable.minifig_parts_correlation import build_correlation_rows, write_correlation_rows
+
+
+FILTERED_MINIFIG_COUNTS_PATH = Path("data/intermediate/minifig_counts_by_set.csv")
+FILTERED_SETS_PATH = Path("data/intermediate/sets_enriched.csv")
+ALL_SETS_PATH = Path("data/raw/sets.csv")
+INVENTORIES_PATH = Path("data/raw/inventories.csv")
+INVENTORY_MINIFIGS_PATH = Path("data/raw/inventory_minifigs.csv")
+CORRELATION_PATH = Path("data/intermediate/minifig_parts_correlation.csv")
+DESTINATION_PATH = Path("figures/step26/minifig_parts_correlation.png")
+
+
+def main() -> None:
+    """Build the correlation CSV, then draw the comparison chart."""
+    rows = build_correlation_rows(
+        FILTERED_MINIFIG_COUNTS_PATH,
+        FILTERED_SETS_PATH,
+        ALL_SETS_PATH,
+        INVENTORIES_PATH,
+        INVENTORY_MINIFIGS_PATH,
+    )
+    write_correlation_rows(CORRELATION_PATH, rows)
+    plot_minifig_parts_correlation(CORRELATION_PATH, DESTINATION_PATH)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_minifig_parts_correlation.py b/tests/test_minifig_parts_correlation.py
new file mode 100644
index 0000000..c0f40a8
--- /dev/null
+++ b/tests/test_minifig_parts_correlation.py
@@ -0,0 +1,70 @@
+"""Tests for the parts/minifigs correlation data preparation."""
+
+from pathlib import Path
+
+from lib.rebrickable.minifig_parts_correlation import build_correlation_rows
+
+
+def write_csv(path: Path, content: str) -> None:
+    """Write raw CSV content to `path`."""
+    path.write_text(content)
+
+
+def test_build_correlation_rows_merges_filtered_and_catalog(tmp_path: Path) -> None:
+    """Builds the correlation rows for the filtered sets and the whole catalog."""
+    filtered_counts_path = tmp_path / "minifig_counts_by_set.csv"
+    write_csv(
+        filtered_counts_path,
+        "set_num,set_id,name,year,minifig_count\n"
+        "123-1,123,Set A,2020,2\n"
+        "124-1,124,Set B,2021,1\n",
+    )
+    filtered_sets_path = tmp_path / "sets_enriched.csv"
+    write_csv(
+        filtered_sets_path,
+        "set_num,num_parts\n"
+        "123-1,300\n"
+        "124-1,150\n",
+    )
+    all_sets_path = tmp_path / "sets.csv"
+    write_csv(
+        all_sets_path,
+        "set_num,name,year,theme_id,num_parts\n"
+        "123-1,Set A,2020,1,300\n"
+        "124-1,Set B,2021,1,150\n"
+        "200-1,Set C,2019,1,100\n",
+    )
+    inventories_path = tmp_path / "inventories.csv"
+    write_csv(
+        inventories_path,
+        "id,version,set_num\n"
+        "10,1,123-1\n"
+        "20,2,123-1\n"
+        "30,1,124-1\n"
+        "40,1,200-1\n",
+    )
+    inventory_minifigs_path = tmp_path / "inventory_minifigs.csv"
+    write_csv(
+        inventory_minifigs_path,
+        "inventory_id,fig_num,quantity\n"
+        "10,fig-a,1\n"
+        "20,fig-a,2\n"
+        "30,fig-b,1\n"
+        "40,fig-c,3\n",
+    )
+
+    rows = build_correlation_rows(
+        filtered_counts_path,
+        filtered_sets_path,
+        all_sets_path,
+        inventories_path,
+        inventory_minifigs_path,
+    )
+
+    assert rows == [
+        {"scope": "filtered", "set_num": "123-1", "num_parts": "300", "minifig_count": "2"},
+        {"scope": "filtered", "set_num": "124-1", "num_parts": "150", "minifig_count": "1"},
+        {"scope": "catalog", "set_num": "123-1", "num_parts": "300", "minifig_count": "2"},
+        {"scope": "catalog", "set_num": "124-1", "num_parts": "150", "minifig_count": "1"},
+        {"scope": "catalog", "set_num": "200-1", "num_parts": "100", "minifig_count": "3"},
+    ]
diff --git a/tests/test_minifig_parts_correlation_plot.py b/tests/test_minifig_parts_correlation_plot.py
new file mode 100644
index 0000000..978cef1
--- /dev/null
+++ b/tests/test_minifig_parts_correlation_plot.py
@@ -0,0 +1,36 @@
+"""Tests for the parts/minifigs correlation chart."""
+
+import matplotlib
+from pathlib import Path
+
+from lib.plots.minifig_parts_correlation import compute_regression, plot_minifig_parts_correlation
+
+
+matplotlib.use("Agg")
+
+
+def test_compute_regression_accepts_one_shot_iterable() -> None:
+    """Fits the same line from a list and from a generator of points."""
+    points = [(0, 1), (1, 3), (2, 5)]
+
+    assert compute_regression(points) == (2.0, 1.0)
+    assert compute_regression(pair for pair in points) == (2.0, 1.0)
+
+
+def test_plot_minifig_parts_correlation(tmp_path: Path) -> None:
+    """Generates the parts/minifigs comparison chart."""
+    correlation_path = tmp_path / "minifig_parts_correlation.csv"
+    destination = tmp_path / "figures" / "step26" / "minifig_parts_correlation.png"
+    correlation_path.write_text(
+        "scope,set_num,num_parts,minifig_count\n"
+        "filtered,123-1,300,2\n"
+        "filtered,124-1,150,1\n"
+        "catalog,123-1,300,2\n"
+        "catalog,124-1,150,1\n"
+        "catalog,200-1,100,3\n"
+    )
+
+    plot_minifig_parts_correlation(correlation_path, destination)
+
+    assert destination.exists()
+    assert destination.stat().st_size > 0