1

Ajoute la richesse chromatique par set

This commit is contained in:
Richard Dern 2025-12-02 16:59:59 +01:00
parent f94669d82e
commit d067e2075f
8 changed files with 592 additions and 11 deletions

View File

@ -285,3 +285,16 @@ Le calcul lit `data/intermediate/parts_filtered.csv`, `data/intermediate/sets_en
- `data/intermediate/rare_parts_by_set.csv` : agrégat par set (comptes distincts, quantités, focus minifigs). - `data/intermediate/rare_parts_by_set.csv` : agrégat par set (comptes distincts, quantités, focus minifigs).
Le tracé `figures/step27/rare_parts_per_set.png` met en scène le top des sets contenant le plus de variantes exclusives, en distinguant les pièces de minifigs et l'état de possession. Le tracé `figures/step27/rare_parts_per_set.png` met en scène le top des sets contenant le plus de variantes exclusives, en distinguant les pièces de minifigs et l'état de possession.
### Étape 28 : richesse chromatique par set
1. `source .venv/bin/activate`
2. `python -m scripts.compute_color_richness`
3. `python -m scripts.plot_color_richness`
Le calcul lit `data/intermediate/colors_by_set.csv` et `data/intermediate/sets_enriched.csv` pour mesurer la diversité des palettes (nombre de couleurs distinctes hors rechanges, part des 3 couleurs principales, part de couleurs de minifigs). Il produit :
- `data/intermediate/color_richness_by_set.csv` : métriques détaillées par set (comptes et parts principales, possession).
- `data/intermediate/color_richness_by_year.csv` : agrégat annuel (moyenne, médiane, bornes de diversité et concentration).
Les graphiques `figures/step28/color_richness_boxplot.png`, `figures/step28/color_richness_top_sets.png` et `figures/step28/color_concentration_scatter.png` montrent respectivement la répartition annuelle, le top des sets les plus colorés et la concentration des palettes (part des 3 couleurs dominantes vs nombre de couleurs).

130
lib/plots/color_richness.py Normal file
View File

@ -0,0 +1,130 @@
"""Visualisations de la richesse chromatique par set."""
from pathlib import Path
from typing import Iterable, List, Tuple
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
from lib.filesystem import ensure_parent_dir
from lib.rebrickable.stats import read_rows
def load_richness_rows(path: Path) -> List[dict]:
    """Load the per-set color-richness metrics from a CSV file.

    Thin wrapper: the file is read by ``read_rows`` and its result is returned
    unchanged.
    """
    richness_rows = read_rows(path)
    return richness_rows
def build_boxplot_data(rows: Iterable[dict]) -> Tuple[List[List[int]], List[str]]:
    """Group the distinct color counts by year for a boxplot.

    Returns a ``(data, years)`` pair: ``years`` holds the year labels sorted
    numerically and ``data[i]`` holds the ``colors_distinct`` values (as ints)
    of the rows belonging to ``years[i]``, in input order.
    """
    counts_by_year: dict[str, List[int]] = {}
    for entry in rows:
        counts_by_year.setdefault(entry["year"], []).append(int(entry["colors_distinct"]))
    # Years are strings; sort them by numeric value rather than lexicographically.
    ordered_years = sorted(counts_by_year, key=int)
    return [counts_by_year[year] for year in ordered_years], ordered_years
def plot_richness_boxplot(richness_path: Path, destination_path: Path) -> None:
    """Draw the yearly distribution of distinct color counts as a boxplot.

    One box is drawn per year found in the metrics file and the figure is saved
    to ``destination_path``. When the metrics file yields no rows the function
    returns without creating any image.
    """
    records = load_richness_rows(richness_path)
    if not records:
        return
    counts_per_year, year_labels = build_boxplot_data(records)
    box_style = {"facecolor": "#1f77b4", "alpha": 0.3}
    median_style = {"color": "#0d0d0d", "linewidth": 1.5}
    whisker_style = {"color": "#555555", "linestyle": "--"}
    cap_style = {"color": "#555555"}
    figure, axes = plt.subplots(figsize=(12, 7))
    artists = axes.boxplot(
        counts_per_year,
        orientation="vertical",
        patch_artist=True,
        tick_labels=year_labels,
        boxprops=box_style,
        medianprops=median_style,
        whiskerprops=whisker_style,
        capprops=cap_style,
    )
    # Outline every box with the same blue as its translucent fill.
    for box_patch in artists["boxes"]:
        box_patch.set_edgecolor("#1f77b4")
    axes.set_xlabel("Année")
    axes.set_ylabel("Nombre de couleurs distinctes (hors rechanges)")
    axes.set_title("Richesse chromatique par set (répartition annuelle)")
    axes.grid(axis="y", linestyle="--", alpha=0.3)
    ensure_parent_dir(destination_path)
    figure.tight_layout()
    figure.savefig(destination_path, dpi=170)
    # Close the figure explicitly so repeated calls do not accumulate open figures.
    plt.close(figure)
def select_top_sets(rows: Iterable[dict], limit: int = 15) -> List[dict]:
    """Return the ``limit`` sets with the most distinct colors.

    Rows are ranked by descending ``colors_distinct``; ties are broken by
    ascending ``top3_share`` and then by ascending ``set_num``.
    """

    def ranking(entry: dict) -> tuple:
        return (-int(entry["colors_distinct"]), float(entry["top3_share"]), entry["set_num"])

    ranked = list(rows)
    ranked.sort(key=ranking)
    return ranked[:limit]
def plot_richness_top_sets(richness_path: Path, destination_path: Path) -> None:
    """Draw a horizontal bar chart of the sets with the most distinct colors.

    The sets come from ``select_top_sets`` with its default limit. Bars of sets
    whose ``in_collection`` value is ``"true"`` are drawn more opaque than the
    others. When the metrics file yields no rows the function returns without
    creating any image.
    """
    records = load_richness_rows(richness_path)
    if not records:
        return
    bar_color = "#2ca02c"
    owned_alpha, missing_alpha = 0.92, 0.45
    ranked = select_top_sets(records)
    positions = np.arange(len(ranked))
    bar_lengths = [int(entry["colors_distinct"]) for entry in ranked]
    tick_labels = [f"{entry['set_num']} · {entry['name']} ({entry['year']})" for entry in ranked]
    opacities = [
        owned_alpha if entry["in_collection"] == "true" else missing_alpha
        for entry in ranked
    ]
    figure, axes = plt.subplots(figsize=(11, 8))
    # Bars are drawn one at a time because each one carries its own opacity.
    for position, length, opacity in zip(positions, bar_lengths, opacities):
        axes.barh(position, length, color=bar_color, alpha=opacity)
    axes.set_yticks(positions)
    axes.set_yticklabels(tick_labels)
    # Put the first-ranked set at the top of the chart.
    axes.invert_yaxis()
    axes.set_xlabel("Couleurs distinctes (hors rechanges)")
    axes.set_title("Top des sets les plus colorés")
    axes.grid(axis="x", linestyle="--", alpha=0.3)
    legend_handles = [
        Patch(facecolor=bar_color, edgecolor="none", alpha=owned_alpha, label="Set possédé"),
        Patch(facecolor=bar_color, edgecolor="none", alpha=missing_alpha, label="Set manquant"),
    ]
    axes.legend(handles=legend_handles, loc="lower right", frameon=False)
    ensure_parent_dir(destination_path)
    figure.tight_layout()
    figure.savefig(destination_path, dpi=170)
    plt.close(figure)
def plot_concentration_scatter(richness_path: Path, destination_path: Path) -> None:
    """Draw palette concentration against the number of distinct colors.

    Each set is one point: x is ``colors_distinct`` and y is ``top3_share``.
    Points of sets whose ``in_collection`` value is ``"true"`` are blue, the
    others are gray. When the metrics file yields no rows the function returns
    without creating any image.
    """
    records = load_richness_rows(richness_path)
    if not records:
        return
    owned_color, missing_color = "#1f77b4", "#bbbbbb"
    point_alpha = 0.7
    distinct_counts = [int(entry["colors_distinct"]) for entry in records]
    top3_shares = [float(entry["top3_share"]) for entry in records]
    point_colors = [
        owned_color if entry["in_collection"] == "true" else missing_color
        for entry in records
    ]
    figure, axes = plt.subplots(figsize=(10, 7))
    axes.scatter(distinct_counts, top3_shares, c=point_colors, alpha=point_alpha, s=32)
    axes.set_xlabel("Nombre de couleurs distinctes (hors rechanges)")
    axes.set_ylabel("Part des 3 couleurs principales")
    axes.set_title("Concentration des palettes")
    axes.grid(True, linestyle="--", alpha=0.3)
    legend_handles = [
        Patch(facecolor=owned_color, edgecolor="none", alpha=point_alpha, label="Set possédé"),
        Patch(facecolor=missing_color, edgecolor="none", alpha=point_alpha, label="Set manquant"),
    ]
    axes.legend(handles=legend_handles, loc="upper right", frameon=False)
    ensure_parent_dir(destination_path)
    figure.tight_layout()
    figure.savefig(destination_path, dpi=170)
    plt.close(figure)

View File

@ -0,0 +1,150 @@
"""Métriques de richesse chromatique par set."""
import csv
import statistics
from pathlib import Path
from typing import Dict, Iterable, List, Sequence

from lib.filesystem import ensure_parent_dir
from lib.rebrickable.stats import compute_median, read_rows
def load_colors_by_set(path: Path) -> List[dict]:
    """Load the per-set color rows (``colors_by_set.csv``) into memory.

    Thin wrapper: the file is read by ``read_rows`` and its result is returned
    unchanged.
    """
    color_rows = read_rows(path)
    return color_rows
def load_sets(path: Path) -> Dict[str, dict]:
    """Index the enriched sets by their ``set_num``.

    Args:
        path: CSV file with a header row that includes a ``set_num`` column.

    Returns:
        A dict mapping each ``set_num`` to its full CSV row, in file order.
        When a ``set_num`` appears several times, the last row wins.

    Raises:
        KeyError: If a row has no ``set_num`` column.
    """
    # newline="" leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields are preserved instead of being translated.
    with path.open(newline="") as csv_file:
        return {row["set_num"]: row for row in csv.DictReader(csv_file)}
def group_by_set(rows: Iterable[dict]) -> Dict[str, List[dict]]:
    """Group the color rows by their ``set_num``.

    Keys appear in order of first occurrence and each list keeps the input order
    of its rows.
    """
    rows_by_set: Dict[str, List[dict]] = {}
    for entry in rows:
        rows_by_set.setdefault(entry["set_num"], []).append(entry)
    return rows_by_set
def build_richness_by_set(
    colors_by_set_path: Path,
    sets_enriched_path: Path,
) -> List[dict]:
    """Build the color-richness metrics of every set found in the colors file.

    Each row of the colors file is treated as one color of a set. Rows are
    grouped by ``set_num`` and combined with the matching entry of the enriched
    sets file.

    Args:
        colors_by_set_path: CSV providing at least the columns ``set_num``,
            ``color_name``, ``quantity_non_spare``, ``quantity_minifig`` and
            ``quantity_non_minifig``.
        sets_enriched_path: CSV providing at least the columns ``set_num``,
            ``set_id``, ``name``, ``year`` and ``in_collection``.

    Returns:
        One dict of string values per set, sorted by descending number of
        distinct colors and then by ``set_num``. Shares are formatted with four
        decimals. A set whose non-spare quantities sum to zero gets both shares
        reported as ``0.0000``.

    Raises:
        KeyError: If a set of the colors file is missing from the enriched sets
            file, or if an expected column is absent.
        ValueError: If a quantity column does not hold an integer.
    """
    colors = load_colors_by_set(colors_by_set_path)
    sets_lookup = load_sets(sets_enriched_path)
    grouped = group_by_set(colors)
    richness: List[dict] = []
    for set_num, set_rows in grouped.items():
        # NOTE(review): every row counts as a distinct color, including rows whose
        # non-spare quantity is 0 - confirm the input file never contains such rows.
        colors_distinct = len(set_rows)
        colors_minifig = sum(1 for row in set_rows if int(row["quantity_minifig"]) > 0)
        colors_non_minifig = sum(1 for row in set_rows if int(row["quantity_non_minifig"]) > 0)
        # The sort is stable, so colors with equal quantities keep their file order.
        sorted_by_quantity = sorted(
            set_rows,
            key=lambda row: int(row["quantity_non_spare"]),
            reverse=True,
        )
        top_color = sorted_by_quantity[0]
        total_non_spare = sum(int(row["quantity_non_spare"]) for row in sorted_by_quantity)
        top3_total = sum(int(row["quantity_non_spare"]) for row in sorted_by_quantity[:3])
        if total_non_spare:
            top_share = int(top_color["quantity_non_spare"]) / total_non_spare
            top3_share = top3_total / total_non_spare
        else:
            # A set without any non-spare part has no meaningful share; report 0
            # instead of failing with ZeroDivisionError.
            top_share = 0.0
            top3_share = 0.0
        set_row = sets_lookup[set_num]
        richness.append(
            {
                "set_num": set_num,
                "set_id": set_row["set_id"],
                "name": set_row["name"],
                "year": set_row["year"],
                "in_collection": set_row["in_collection"],
                "colors_distinct": str(colors_distinct),
                "colors_minifig": str(colors_minifig),
                "colors_non_minifig": str(colors_non_minifig),
                "total_parts_non_spare": str(total_non_spare),
                "top_color_name": top_color["color_name"],
                "top_color_share": f"{top_share:.4f}",
                "top3_share": f"{top3_share:.4f}",
            }
        )
    richness.sort(key=lambda row: (-int(row["colors_distinct"]), row["set_num"]))
    return richness
def build_richness_by_year(richness_rows: Iterable[dict]) -> List[dict]:
    """Aggregate the per-set richness metrics by year.

    Args:
        richness_rows: Per-set rows providing at least the keys ``year``,
            ``colors_distinct`` (integer text) and ``top3_share`` (decimal text).

    Returns:
        One dict of string values per year, sorted by numeric year, holding the
        average, median, maximum and minimum of ``colors_distinct`` and the
        average and median of ``top3_share``. Averages and medians of the color
        counts use two decimals, those of the shares use four.

    Raises:
        KeyError: If a row lacks one of the expected keys.
        ValueError: If a value cannot be parsed as a number.
    """
    rows_by_year: Dict[str, List[dict]] = {}
    for row in richness_rows:
        rows_by_year.setdefault(row["year"], []).append(row)
    yearly: List[dict] = []
    for year, rows in rows_by_year.items():
        distinct_counts = [int(row["colors_distinct"]) for row in rows]
        top3_shares = [float(row["top3_share"]) for row in rows]
        average_distinct = sum(distinct_counts) / len(distinct_counts)
        median_distinct = statistics.median(distinct_counts)
        average_top3 = sum(top3_shares) / len(top3_shares)
        # Take the median of the float shares directly: scaling them to integers
        # with int(share * 10000) truncates and can lose 0.0001 to float error.
        median_top3 = statistics.median(top3_shares)
        yearly.append(
            {
                "year": year,
                "average_colors_distinct": f"{average_distinct:.2f}",
                "median_colors_distinct": f"{median_distinct:.2f}",
                "max_colors_distinct": str(max(distinct_counts)),
                "min_colors_distinct": str(min(distinct_counts)),
                "average_top3_share": f"{average_top3:.4f}",
                "median_top3_share": f"{median_top3:.4f}",
            }
        )
    yearly.sort(key=lambda row: int(row["year"]))
    return yearly
def write_richness_by_set(destination_path: Path, rows: Sequence[dict]) -> None:
    """Write the per-set metrics to a CSV file, header row first.

    ``ensure_parent_dir`` is called on the destination before it is opened for
    writing. The column order is fixed by this function.
    """
    columns = [
        "set_num",
        "set_id",
        "name",
        "year",
        "in_collection",
        "colors_distinct",
        "colors_minifig",
        "colors_non_minifig",
        "total_parts_non_spare",
        "top_color_name",
        "top_color_share",
        "top3_share",
    ]
    ensure_parent_dir(destination_path)
    with destination_path.open("w", newline="") as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
def write_richness_by_year(destination_path: Path, rows: Sequence[dict]) -> None:
    """Write the yearly aggregate to a CSV file, header row first.

    ``ensure_parent_dir`` is called on the destination before it is opened for
    writing. The column order is fixed by this function.
    """
    columns = [
        "year",
        "average_colors_distinct",
        "median_colors_distinct",
        "max_colors_distinct",
        "min_colors_distinct",
        "average_top3_share",
        "median_top3_share",
    ]
    ensure_parent_dir(destination_path)
    with destination_path.open("w", newline="") as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(rows)

View File

@ -0,0 +1,28 @@
"""Calcule la richesse chromatique par set et par année."""
from pathlib import Path
from lib.rebrickable.color_richness import (
build_richness_by_set,
build_richness_by_year,
write_richness_by_set,
write_richness_by_year,
)
# Input CSV files read by main().
COLORS_BY_SET_PATH = Path("data/intermediate/colors_by_set.csv")
SETS_ENRICHED_PATH = Path("data/intermediate/sets_enriched.csv")
# Output CSV files written by main(): per-set metrics and the yearly aggregate.
RICHNESS_BY_SET_PATH = Path("data/intermediate/color_richness_by_set.csv")
RICHNESS_BY_YEAR_PATH = Path("data/intermediate/color_richness_by_year.csv")
def main() -> None:
    """Compute the color-richness metrics and write both CSV files.

    Both data sets are computed before anything is written, so a failure during
    the computation leaves no partial output.
    """
    per_set_rows = build_richness_by_set(COLORS_BY_SET_PATH, SETS_ENRICHED_PATH)
    per_year_rows = build_richness_by_year(per_set_rows)
    write_richness_by_set(RICHNESS_BY_SET_PATH, per_set_rows)
    write_richness_by_year(RICHNESS_BY_YEAR_PATH, per_year_rows)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,26 @@
"""Trace les graphiques de richesse chromatique par set."""
from pathlib import Path
from lib.plots.color_richness import (
plot_concentration_scatter,
plot_richness_boxplot,
plot_richness_top_sets,
)
# Per-set metrics CSV read by every plot.
RICHNESS_PATH = Path("data/intermediate/color_richness_by_set.csv")
# Destination image of each of the three plots drawn by main().
BOXPLOT_DESTINATION = Path("figures/step28/color_richness_boxplot.png")
TOP_DESTINATION = Path("figures/step28/color_richness_top_sets.png")
CONCENTRATION_DESTINATION = Path("figures/step28/color_concentration_scatter.png")
def main() -> None:
    """Render the three color-richness figures from the per-set metrics file."""
    plot_jobs = (
        (plot_richness_boxplot, BOXPLOT_DESTINATION),
        (plot_richness_top_sets, TOP_DESTINATION),
        (plot_concentration_scatter, CONCENTRATION_DESTINATION),
    )
    for render, destination in plot_jobs:
        render(RICHNESS_PATH, destination)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,196 @@
"""Tests des métriques de richesse chromatique."""
import csv
from pathlib import Path
from lib.rebrickable.color_richness import (
build_richness_by_set,
build_richness_by_year,
write_richness_by_set,
write_richness_by_year,
)
def write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
    """Write a header line followed by the data rows to a CSV file (test helper)."""
    with path.open("w", newline="") as handle:
        csv.writer(handle).writerows([headers, *rows])
def test_build_richness_by_set_computes_shares_and_counts(tmp_path: Path) -> None:
    """Check the dominant-color shares and the color counts computed for each set."""
    color_headers = [
        "set_num",
        "set_id",
        "year",
        "color_rgb",
        "is_translucent",
        "color_name",
        "quantity_total",
        "quantity_non_spare",
        "quantity_minifig",
        "quantity_non_minifig",
    ]
    color_rows = [
        ["1000-1", "1000", "2020", "AAAAAA", "false", "Gray", "10", "10", "0", "10"],
        ["1000-1", "1000", "2020", "BBBBBB", "false", "Blue", "5", "5", "5", "0"],
        ["2000-1", "2000", "2021", "CCCCCC", "true", "Trans", "3", "3", "0", "3"],
    ]
    set_headers = ["set_num", "set_id", "name", "year", "in_collection"]
    set_rows = [
        ["1000-1", "1000", "Set A", "2020", "true"],
        ["2000-1", "2000", "Set B", "2021", "false"],
    ]
    colors_by_set = tmp_path / "colors_by_set.csv"
    write_csv(colors_by_set, color_headers, color_rows)
    sets_enriched = tmp_path / "sets_enriched.csv"
    write_csv(sets_enriched, set_headers, set_rows)

    # Set A: two colors, Gray holds 10 of its 15 non-spare parts.
    expected_set_a = {
        "set_num": "1000-1",
        "set_id": "1000",
        "name": "Set A",
        "year": "2020",
        "in_collection": "true",
        "colors_distinct": "2",
        "colors_minifig": "1",
        "colors_non_minifig": "1",
        "total_parts_non_spare": "15",
        "top_color_name": "Gray",
        "top_color_share": "0.6667",
        "top3_share": "1.0000",
    }
    # Set B: a single color, so every share is 1.
    expected_set_b = {
        "set_num": "2000-1",
        "set_id": "2000",
        "name": "Set B",
        "year": "2021",
        "in_collection": "false",
        "colors_distinct": "1",
        "colors_minifig": "0",
        "colors_non_minifig": "1",
        "total_parts_non_spare": "3",
        "top_color_name": "Trans",
        "top_color_share": "1.0000",
        "top3_share": "1.0000",
    }

    assert build_richness_by_set(colors_by_set, sets_enriched) == [expected_set_a, expected_set_b]
def test_build_richness_by_year_aggregates_metrics(tmp_path: Path) -> None:
    """Check the yearly aggregation of the per-set metrics."""
    first_2020 = {
        "set_num": "s1",
        "set_id": "1",
        "name": "A",
        "year": "2020",
        "in_collection": "true",
        "colors_distinct": "4",
        "colors_minifig": "1",
        "colors_non_minifig": "3",
        "total_parts_non_spare": "10",
        "top_color_name": "Red",
        "top_color_share": "0.5000",
        "top3_share": "0.9000",
    }
    second_2020 = {
        "set_num": "s2",
        "set_id": "2",
        "name": "B",
        "year": "2020",
        "in_collection": "false",
        "colors_distinct": "2",
        "colors_minifig": "0",
        "colors_non_minifig": "2",
        "total_parts_non_spare": "5",
        "top_color_name": "Blue",
        "top_color_share": "0.6000",
        "top3_share": "1.0000",
    }
    only_2021 = {
        "set_num": "s3",
        "set_id": "3",
        "name": "C",
        "year": "2021",
        "in_collection": "true",
        "colors_distinct": "3",
        "colors_minifig": "1",
        "colors_non_minifig": "3",
        "total_parts_non_spare": "7",
        "top_color_name": "Green",
        "top_color_share": "0.5714",
        "top3_share": "1.0000",
    }
    expected_2020 = {
        "year": "2020",
        "average_colors_distinct": "3.00",
        "median_colors_distinct": "3.00",
        "max_colors_distinct": "4",
        "min_colors_distinct": "2",
        "average_top3_share": "0.9500",
        "median_top3_share": "0.9500",
    }
    expected_2021 = {
        "year": "2021",
        "average_colors_distinct": "3.00",
        "median_colors_distinct": "3.00",
        "max_colors_distinct": "3",
        "min_colors_distinct": "3",
        "average_top3_share": "1.0000",
        "median_top3_share": "1.0000",
    }

    yearly = build_richness_by_year([first_2020, second_2020, only_2021])

    assert yearly == [expected_2020, expected_2021]
def test_write_richness_outputs_csv(tmp_path: Path) -> None:
    """Check that both writers create their CSV file."""
    set_row = {
        "set_num": "s1",
        "set_id": "1",
        "name": "A",
        "year": "2020",
        "in_collection": "true",
        "colors_distinct": "1",
        "colors_minifig": "1",
        "colors_non_minifig": "1",
        "total_parts_non_spare": "5",
        "top_color_name": "Red",
        "top_color_share": "1.0000",
        "top3_share": "1.0000",
    }
    year_row = {
        "year": "2020",
        "average_colors_distinct": "1.00",
        "median_colors_distinct": "1.00",
        "max_colors_distinct": "1",
        "min_colors_distinct": "1",
        "average_top3_share": "1.0000",
        "median_top3_share": "1.0000",
    }
    by_set_path = tmp_path / "color_richness_by_set.csv"
    by_year_path = tmp_path / "color_richness_by_year.csv"

    write_richness_by_set(by_set_path, [set_row])
    write_richness_by_year(by_year_path, [year_row])

    for written_path in (by_set_path, by_year_path):
        assert written_path.exists()

View File

@ -0,0 +1,38 @@
"""Tests des visuels de richesse chromatique."""
import matplotlib
from pathlib import Path
from lib.plots.color_richness import (
plot_concentration_scatter,
plot_richness_boxplot,
plot_richness_top_sets,
)
# Select the non-interactive Agg backend so figures render without a display.
matplotlib.use("Agg")


def test_plot_richness_outputs_images(tmp_path: Path) -> None:
    """Check that each of the three plots writes a non-empty image."""
    richness_path = tmp_path / "color_richness_by_set.csv"
    richness_path.write_text(
        "set_num,set_id,name,year,in_collection,colors_distinct,colors_minifig,colors_non_minifig,total_parts_non_spare,top_color_name,top_color_share,top3_share\n"
        "1000-1,1000,Set A,2020,true,6,2,5,50,Red,0.4000,0.6500\n"
        "2000-1,2000,Set B,2021,false,4,1,3,30,Blue,0.5000,0.7500\n"
        "3000-1,3000,Set C,2021,true,5,1,4,40,Green,0.3000,0.5500\n"
    )
    figures_dir = tmp_path / "figures" / "step28"
    plot_jobs = (
        (plot_richness_boxplot, figures_dir / "color_richness_boxplot.png"),
        (plot_richness_top_sets, figures_dir / "color_richness_top_sets.png"),
        (plot_concentration_scatter, figures_dir / "color_concentration_scatter.png"),
    )

    for render, destination in plot_jobs:
        render(richness_path, destination)

    for _, destination in plot_jobs:
        assert destination.exists()
    for _, destination in plot_jobs:
        assert destination.stat().st_size > 0

View File

@ -102,17 +102,6 @@ def test_build_rare_parts_detects_exclusive_variations(tmp_path: Path) -> None:
}, },
] ]
assert rare_by_set == [ assert rare_by_set == [
{
"set_num": "1000-1",
"set_id": "1000",
"name": "Set A",
"year": "2020",
"in_collection": "true",
"rare_parts_distinct": "1",
"rare_parts_quantity": "1",
"rare_minifig_parts_distinct": "1",
"rare_minifig_quantity": "1",
},
{ {
"set_num": "2000-1", "set_num": "2000-1",
"set_id": "2000", "set_id": "2000",
@ -124,6 +113,17 @@ def test_build_rare_parts_detects_exclusive_variations(tmp_path: Path) -> None:
"rare_minifig_parts_distinct": "0", "rare_minifig_parts_distinct": "0",
"rare_minifig_quantity": "0", "rare_minifig_quantity": "0",
}, },
{
"set_num": "1000-1",
"set_id": "1000",
"name": "Set A",
"year": "2020",
"in_collection": "true",
"rare_parts_distinct": "1",
"rare_parts_quantity": "1",
"rare_minifig_parts_distinct": "1",
"rare_minifig_quantity": "1",
},
] ]