1

Batch de tags pour les critiques via Wikidata

This commit is contained in:
2025-11-11 01:03:43 +01:00
parent e58ea7ee23
commit 0cedbc049b
46 changed files with 2226 additions and 215 deletions

719
tools/sync_wiki_metadata.js Normal file
View File

@@ -0,0 +1,719 @@
#!/usr/bin/env node
/**
* Synchronise Hugo critique frontmatter with Wikidata metadata.
*
* The script:
* 1. Reads the critique bundle (expects an index.md with YAML frontmatter).
* 2. Uses the frontmatter title (or --query) plus the bundle type (film, série, etc.)
* to search Wikidata and lets the user confirm the right entity.
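* 3. Fetches structured data (genres, cast, crew, companies...) for the
* frontmatter taxonomies listed in TYPE_CONFIG for that bundle type.
* 4. Adds the missing taxonomy terms to the frontmatter without removing anything,
* adds Wikipedia / official-site links when the frontmatter has no "links" field,
* and stores the selected entity ID in the "wikidata_id" field.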
*/
const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");
const readline = require("node:readline/promises");
const { stdin, stdout } = require("node:process");
// Label languages tried, in order, after the primary language (see buildLanguageOrder).
const LANGUAGE_FALLBACK = ["fr", "fr-ca", "fr-fr", "en", "en-gb", "en-ca"];
// Number of entity IDs sent per wbgetentities request when resolving labels (see fetchLabels).
const MAX_WIKIDATA_IDS_PER_REQUEST = 50;
// Frontmatter key under which the selected Wikidata entity ID is stored.
const WIKIDATA_ID_FIELD = "wikidata_id";
// Parent directory of the directory containing this script.
const PROJECT_ROOT = path.resolve(__dirname, "..");
// Directory scanned for critique bundles when no path is given on the command line.
const DEFAULT_CRITIQUES_ROOT = path.join(PROJECT_ROOT, "content", "critiques");
// Wikipedia editions tried, in order, when picking a sitelink (see pickWikipediaLink).
const WIKIPEDIA_PREFERRED_LANGS = ["fr", "en"];
// Wikidata property whose string values become "Site officiel" links (see buildExternalLinks).
const OFFICIAL_SITE_PROPERTY = "P856";
// Matches a bare Wikidata item ID ("Q" followed by digits, case-insensitive); used to
// recognise labels that could not be resolved to a human-readable name.
const WIKIDATA_ID_PATTERN = /^Q\d+$/i;
/**
 * Per-type settings, keyed by the directory name that follows "critiques" in a
 * bundle path (see detectCritiqueType).
 *
 * Fields of each entry:
 * - label: human-readable name of the type (not read by the code visible in this file).
 * - queryAugment: text appended to the search term for the first search attempt
 *   (see searchEntities).
 * - descriptionHints: substrings looked for, case-insensitively, in search result
 *   descriptions to narrow the candidate list.
 * - taxonomyMap: frontmatter taxonomy name -> Wikidata claim property IDs whose item
 *   values are resolved to labels and added as terms.
 * - roleQualifiers: optional { claim, qualifier } pairs; qualifier values found on
 *   those claims feed the "personnages_de_fiction" taxonomy (see buildTaxonomyValues).
 *
 * NOTE(review): property meanings are presumed from the Wikidata property registry
 * (P136 genre, P57 director, P58 screenwriter, P162 producer, P86 composer,
 * P161 cast member, P453 character role, P272 production company, P750 distributor,
 * P449 original broadcaster, P178 developer, P123 publisher, P50 author,
 * P110 illustrator) - confirm on wikidata.org before relying on them.
 */
const TYPE_CONFIG = {
films: {
label: "film",
queryAugment: "film",
descriptionHints: ["film", "movie"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P57", "P58", "P162", "P86", "P161"],
entreprises: ["P272", "P750"],
},
roleQualifiers: [{ claim: "P161", qualifier: "P453" }],
},
series: {
label: "série TV",
queryAugment: '"série télévisée"',
descriptionHints: ["série", "tv series", "télévisée", "television"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P57", "P58", "P162", "P86", "P161"],
entreprises: ["P272", "P449", "P750"],
},
roleQualifiers: [{ claim: "P161", qualifier: "P453" }],
},
"jeux-video": {
label: "jeu vidéo",
queryAugment: '"jeu vidéo"',
descriptionHints: ["jeu vidéo", "video game", "jeu-vidéo", "jeu video"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P57", "P58", "P162", "P86", "P161"],
entreprises: ["P178", "P123", "P750"],
},
roleQualifiers: [{ claim: "P161", qualifier: "P453" }],
},
livres: {
label: "livre",
queryAugment: "livre",
descriptionHints: ["roman", "novel", "book", "livre", "comic", "nouvelle", "script"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P50", "P110"],
entreprises: ["P123"],
},
},
};
/**
 * Split the command line into recognised options and positional targets.
 *
 * @param {string[]} argv - Full process argv; the first two entries are skipped.
 * @returns {{options: object, targets: string[]}} Parsed options (language, limit,
 *   query, autoSelect, and help when requested) plus every other non-empty argument.
 */
function parseArgs(argv) {
  const LANG_FLAG = "--lang=";
  const LIMIT_FLAG = "--limit=";
  const QUERY_FLAG = "--query=";
  const options = {
    language: "fr",
    limit: 8,
    query: null,
    autoSelect: false,
  };
  const targets = [];

  argv.slice(2).forEach((token) => {
    if (token.startsWith(LANG_FLAG)) {
      // An empty value keeps whatever language is currently selected.
      const requested = token.slice(LANG_FLAG.length).trim();
      if (requested) {
        options.language = requested;
      }
      return;
    }
    if (token.startsWith(LIMIT_FLAG)) {
      // NaN and non-positive numbers are ignored (NaN > 0 is false).
      const parsed = Number.parseInt(token.slice(LIMIT_FLAG.length), 10);
      if (parsed > 0) {
        options.limit = parsed;
      }
      return;
    }
    if (token.startsWith(QUERY_FLAG)) {
      const text = token.slice(QUERY_FLAG.length).trim();
      options.query = text === "" ? null : text;
      return;
    }
    if (token === "--auto" || token === "--yes") {
      options.autoSelect = true;
      return;
    }
    if (token === "--help" || token === "-h") {
      options.help = true;
      return;
    }
    if (token.length > 0) {
      targets.push(token);
    }
  });

  return { options, targets };
}
/**
 * Print the command-line help to stdout.
 *
 * The text lists every flag parseArgs accepts, including the --yes alias and
 * --help / -h, and shows that paths are optional and repeatable.
 */
function showUsage() {
  console.log(`Usage: node tools/sync_wiki_metadata.js [options] [critique_path ...]
Options
--lang=fr Primary language used for Wikidata labels (default: fr)
--limit=8 Max number of Wikidata search results to show
--query="..." Override the query derived from the frontmatter title
--auto, --yes Skip the interactive prompt and pick the first result
--help, -h Show this help and exit
Notes
• Without arguments, every critique bundle under content/critiques is processed.
• Provide one or more bundle paths to limit the scope manually.
Examples
node tools/sync_wiki_metadata.js content/critiques/films/crocodile-dunde-ii
node tools/sync_wiki_metadata.js --lang=en --query="Galaxy Quest film" content/critiques/films/galaxy-quest
`);
}
/**
 * Turn a user-supplied path into the absolute directory of a critique bundle.
 *
 * @param {string} targetPath - A bundle directory or a path to its index.md.
 * @returns {string} Absolute path of the bundle directory.
 * @throws {Error} When the path does not exist, is a file other than index.md,
 *   or is neither a file nor a directory.
 */
function resolveArticleDir(targetPath) {
  const resolved = path.resolve(targetPath);
  if (!fs.existsSync(resolved)) {
    throw new Error(`Path not found: ${resolved}`);
  }

  const info = fs.statSync(resolved);
  if (info.isDirectory()) {
    return resolved;
  }
  if (!info.isFile()) {
    throw new Error(`Unsupported path type: ${resolved}`);
  }
  // A file is only accepted when it is the bundle's own index.md.
  if (path.basename(resolved) === "index.md") {
    return path.dirname(resolved);
  }
  throw new Error(`Expected an index.md, got ${resolved}`);
}
/**
 * Locate the index.md of a bundle directory.
 *
 * @param {string} articleDir - Bundle directory.
 * @returns {string} Path to the bundle's index.md.
 * @throws {Error} When the directory has no index.md.
 */
function getIndexPath(articleDir) {
  const candidate = path.join(articleDir, "index.md");
  if (fs.existsSync(candidate)) {
    return candidate;
  }
  throw new Error(`Missing index.md in ${articleDir}`);
}
/**
 * Read an index.md and split it into parsed YAML frontmatter and Markdown body.
 *
 * Both LF and CRLF line endings are accepted around the "---" delimiters, and a
 * leading UTF-8 byte-order mark is ignored, so files saved on Windows are not
 * rejected as having no frontmatter.
 *
 * @param {string} indexPath - Path of the index.md to read.
 * @returns {{data: any, body: string}} Parsed frontmatter (an empty object when the
 *   YAML yields a falsy value) and everything after the closing delimiter.
 * @throws {Error} When the file does not start with a frontmatter block.
 */
function readFrontmatter(indexPath) {
  // Drop a BOM, otherwise the "^---" anchor below can never match.
  const raw = fs.readFileSync(indexPath, "utf8").replace(/^\uFEFF/, "");
  const match = raw.match(/^---\r?\n([\s\S]+?)\r?\n---\r?\n?([\s\S]*)$/);
  if (!match) {
    throw new Error(`No valid frontmatter found in ${indexPath}`);
  }
  // NOTE(review): yaml.load is presumed to use js-yaml's safe default schema
  // (true for js-yaml 4.x) - confirm the installed version before feeding it
  // untrusted files.
  const data = yaml.load(match[1]) || {};
  const body = match[2] || "";
  return { data, body };
}
/**
 * Derive the critique type from a bundle path: the path segment that directly
 * follows the last "critiques" segment.
 *
 * @param {string} articleDir - Bundle directory, using the platform separator.
 * @returns {string|null} The type segment, or null when "critiques" is absent or
 *   is the final segment.
 */
function detectCritiqueType(articleDir) {
  const segments = articleDir.split(path.sep);
  const anchor = segments.lastIndexOf("critiques");
  const hasChild = anchor !== -1 && anchor + 1 < segments.length;
  return hasChild ? segments[anchor + 1] : null;
}
/**
 * Find every bundle directory (a directory containing an index.md) under rootDir.
 *
 * A directory that holds an index.md is reported and not descended into; other
 * directories are explored, skipping those whose name starts with a dot.
 *
 * @param {string} rootDir - Directory to scan.
 * @returns {string[]} Bundle directories sorted with French collation; empty when
 *   rootDir does not exist.
 */
function collectCritiqueBundles(rootDir) {
  if (!fs.existsSync(rootDir)) {
    return [];
  }

  const found = [];
  const pending = [rootDir];
  while (pending.length > 0) {
    const dir = pending.pop();
    const children = fs.readdirSync(dir, { withFileTypes: true });
    const isBundle = children.some((child) => child.isFile() && child.name === "index.md");
    if (isBundle) {
      found.push(dir);
    } else {
      children
        .filter((child) => child.isDirectory() && !child.name.startsWith("."))
        .forEach((child) => pending.push(path.join(dir, child.name)));
    }
  }

  return found.sort((left, right) => left.localeCompare(right, "fr"));
}
/**
 * Build the ordered list of label languages: the primary language first, then
 * the fallback languages, each appearing once at its first position.
 *
 * @param {string} primary - Preferred language code.
 * @returns {string[]} De-duplicated language codes in priority order.
 */
function buildLanguageOrder(primary) {
  // A Set keeps first-insertion order, which gives first-occurrence de-duplication.
  return [...new Set([primary, ...LANGUAGE_FALLBACK])];
}
/**
 * Call the Wikidata Action API and return the decoded JSON payload.
 *
 * Parameter handling:
 * - undefined / null values are omitted;
 * - boolean false is omitted and boolean true is sent as "1", because the
 *   MediaWiki API treats a boolean parameter as true whenever it is present,
 *   whatever its value (sending "false" would switch the flag on);
 * - every other value is sent as a string.
 *
 * @param {Object<string, *>} params - Query parameters (format=json is forced).
 * @returns {Promise<object>} The decoded response.
 * @throws {Error} On a non-2xx HTTP status, or when the payload carries an
 *   API-level "error" object (the API reports those with HTTP 200).
 */
async function wikidataApi(params) {
  const url = new URL("https://www.wikidata.org/w/api.php");
  for (const [key, value] of Object.entries(params)) {
    if (value === undefined || value === null || value === false) {
      continue;
    }
    url.searchParams.set(key, value === true ? "1" : String(value));
  }
  url.searchParams.set("format", "json");

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Wikidata API error ${response.status}: ${response.statusText}`);
  }

  const payload = await response.json();
  if (payload && payload.error) {
    const { code, info } = payload.error;
    throw new Error(`Wikidata API error ${code}: ${info}`);
  }
  return payload;
}
/**
 * Search Wikidata items for a term, trying the type-augmented query first and
 * the bare term second, and return the results of the first query that yields any.
 *
 * When the type defines descriptionHints, results whose description contains one
 * of the hints (case-insensitive) are preferred; if none match, the unfiltered
 * results are returned.
 *
 * The "strictlanguage" parameter is deliberately not sent: the MediaWiki API
 * treats a boolean parameter as true whenever it is present, so passing
 * `strictlanguage: false` would enable strict matching and disable language
 * fallback.
 *
 * @param {string} term - Title or user-supplied query.
 * @param {object} typeConfig - Entry of TYPE_CONFIG (queryAugment, descriptionHints).
 * @param {{language: string, limit: number}} options - Search language and result cap.
 * @returns {Promise<object[]>} Search result entries, or an empty array.
 */
async function searchEntities(term, typeConfig, options) {
  const queries = [];
  if (typeConfig.queryAugment) {
    queries.push(`${term} ${typeConfig.queryAugment}`);
  }
  queries.push(term);

  const hints = (typeConfig.descriptionHints || []).map((hint) => hint.toLowerCase());

  for (const query of queries) {
    const data = await wikidataApi({
      action: "wbsearchentities",
      search: query,
      language: options.language,
      uselang: options.language,
      type: "item",
      limit: String(options.limit),
      origin: "*",
    });
    if (!data.search || data.search.length === 0) {
      continue;
    }
    if (hints.length === 0) {
      return data.search;
    }
    const filtered = data.search.filter((entry) => {
      if (!entry.description) {
        return false;
      }
      const description = entry.description.toLowerCase();
      return hints.some((hint) => description.includes(hint));
    });
    // Fall back to the raw list rather than hiding every candidate from the user.
    return filtered.length > 0 ? filtered : data.search;
  }
  return [];
}
/**
 * Load one Wikidata entity with its labels, descriptions, claims and sitelinks.
 *
 * @param {string} entityId - Wikidata item ID (e.g. "Q42").
 * @param {string[]} languages - Language codes requested for labels/descriptions.
 * @returns {Promise<object>} The entity record from the wbgetentities response.
 * @throws {Error} When the response has no record for the ID, or only a
 *   "missing" placeholder (which the API returns for unknown IDs).
 */
async function fetchEntity(entityId, languages) {
  const data = await wikidataApi({
    action: "wbgetentities",
    ids: entityId,
    props: "labels|descriptions|claims|sitelinks",
    languages: languages.join("|"),
    origin: "*",
  });
  const entities = data.entities || {};
  // The API keys results by the normalised (upper-case) ID, so a stored "q42"
  // comes back under "Q42".
  const entity = entities[entityId] ?? entities[String(entityId).toUpperCase()];
  if (!entity || Object.hasOwn(entity, "missing")) {
    throw new Error(`Unable to load Wikidata entity ${entityId}`);
  }
  return entity;
}
/**
 * List the item IDs referenced by an entity's claims for one property.
 *
 * @param {object} entity - Wikidata entity record.
 * @param {string} property - Claim property ID.
 * @returns {string[]} The `id` of every main value that is an object with an id,
 *   in claim order (duplicates kept); empty when the property has no claims.
 */
function collectClaimIds(entity, property) {
  const statements = entity.claims?.[property];
  if (!statements) {
    return [];
  }
  return statements
    .map((statement) => statement.mainsnak?.datavalue?.value)
    .filter((value) => value && typeof value === "object" && value.id)
    .map((value) => value.id);
}
/**
 * List the distinct string values of an entity's claims for one property.
 *
 * @param {object} entity - Wikidata entity record.
 * @param {string} property - Claim property ID.
 * @returns {string[]} Unique string main values in first-seen order; empty when
 *   the property has no claims.
 */
function collectClaimUrls(entity, property) {
  const statements = entity.claims?.[property];
  if (!statements) {
    return [];
  }
  const unique = new Set();
  statements.forEach((statement) => {
    const value = statement.mainsnak?.datavalue?.value;
    if (typeof value === "string") {
      unique.add(value);
    }
  });
  return Array.from(unique);
}
/**
 * Gather the qualifier values attached to selected claims.
 *
 * For each { claim, qualifier } pair, every qualifier snak on every matching
 * claim contributes its value's `id` when present, otherwise its `text`.
 *
 * @param {object} entity - Wikidata entity record.
 * @param {Array<{claim: string, qualifier: string}>} [roleConfig] - Pairs to inspect.
 * @returns {string[]} Collected ids/texts in encounter order (duplicates kept);
 *   empty when roleConfig is falsy.
 */
function collectRoleIds(entity, roleConfig) {
  if (!roleConfig) {
    return [];
  }
  return roleConfig.flatMap(({ claim, qualifier }) =>
    (entity.claims?.[claim] || []).flatMap((statement) =>
      (statement.qualifiers?.[qualifier] || [])
        .map((snak) => snak.datavalue?.value)
        .filter((value) => value && typeof value === "object")
        .map((value) => value.id || value.text)
        .filter(Boolean)
    )
  );
}
/**
 * Resolve item IDs to labels, querying the API in fixed-size batches.
 *
 * @param {Array<*>} ids - Candidate IDs; only strings starting with "Q" are kept,
 *   and duplicates are removed.
 * @param {string[]} languages - Label languages in priority order.
 * @returns {Promise<Object<string, string>>} Map of entity ID to chosen label, or
 *   to the ID itself when no label is available.
 */
async function fetchLabels(ids, languages) {
  const wanted = Array.from(
    new Set(ids.filter((candidate) => typeof candidate === "string" && candidate.startsWith("Q")))
  );
  const labelMap = {};

  let offset = 0;
  while (offset < wanted.length) {
    const batch = wanted.slice(offset, offset + MAX_WIKIDATA_IDS_PER_REQUEST);
    offset += MAX_WIKIDATA_IDS_PER_REQUEST;
    // Batches are requested one after another, never in parallel.
    const response = await wikidataApi({
      action: "wbgetentities",
      ids: batch.join("|"),
      props: "labels",
      languages: languages.join("|"),
      origin: "*",
    });
    Object.entries(response.entities || {}).forEach(([id, entity]) => {
      labelMap[id] = pickLabel(entity.labels, languages) || id;
    });
  }

  return labelMap;
}
/**
 * Choose a label value from a Wikidata labels map.
 *
 * @param {Object<string, {value: string}>} [labels] - Labels keyed by language code.
 * @param {string[]} languages - Language codes in priority order.
 * @returns {string|null} The value for the first listed language that has a label,
 *   otherwise the first label present in the map, otherwise null.
 */
function pickLabel(labels = {}, languages) {
  const preferred = [...languages].map((lang) => labels[lang]).find(Boolean);
  const chosen = preferred || Object.values(labels)[0];
  return chosen ? chosen.value : null;
}
/**
 * Pick a Wikipedia page from an entity's sitelinks, trying the preferred
 * language editions in order.
 *
 * @param {object} [sitelinks] - Sitelinks keyed by site ID (e.g. "frwiki").
 * @returns {{lang: string, url: string}|null} The sitelink's own url when it has
 *   one, otherwise a URL built from its title; null when no preferred edition has
 *   a usable sitelink.
 */
function pickWikipediaLink(sitelinks = {}) {
  for (const lang of WIKIPEDIA_PREFERRED_LANGS) {
    const sitelink = sitelinks[`${lang}wiki`];
    if (sitelink?.url) {
      return { lang, url: sitelink.url };
    }
    if (sitelink?.title) {
      // Article URLs use underscores in place of spaces.
      const slug = encodeURIComponent(sitelink.title.split(" ").join("_"));
      return { lang, url: `https://${lang}.wikipedia.org/wiki/${slug}` };
    }
  }
  return null;
}
/**
 * Guess the language of a website from its URL.
 *
 * Rules, in order:
 * 1. Country-code hosts: .fr (or a ".fr." label) -> "fr", .de -> "de", .es -> "es",
 *    .it -> "it", .pt / .br -> "pt".
 * 2. Generic / English-speaking hosts (.co.uk, .uk, .us, .com): "en" only when the
 *    path contains an "en" segment.
 * 3. Any host: "fr" when the path contains an "fr" segment.
 *
 * Path checks compare whole segments, so "/entertainment" or "/english" is not
 * mistaken for "/en", and a "/fr/" path is also recognised on .com hosts.
 *
 * @param {string} rawUrl - URL to inspect.
 * @returns {string|null} A language code, or null when nothing can be inferred
 *   or the URL cannot be parsed.
 */
function inferLanguageFromUrl(rawUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch {
    // Unparseable URLs simply carry no language information.
    return null;
  }

  const host = parsed.hostname.toLowerCase();
  const segments = parsed.pathname.toLowerCase().split("/").filter(Boolean);

  if (host.endsWith(".fr") || host.includes(".fr.")) {
    return "fr";
  }
  const countrySuffixes = [
    [".de", "de"],
    [".es", "es"],
    [".it", "it"],
    [".pt", "pt"],
    [".br", "pt"],
  ];
  for (const [suffix, lang] of countrySuffixes) {
    if (host.endsWith(suffix)) {
      return lang;
    }
  }

  const isGenericHost = [".co.uk", ".uk", ".us", ".com"].some((suffix) => host.endsWith(suffix));
  // English is only inferred when explicitly present in the path.
  if (isGenericHost && segments.includes("en")) {
    return "en";
  }
  if (segments.includes("fr")) {
    return "fr";
  }
  return null;
}
/**
 * Build the list of external links for an entity: its Wikipedia page (when a
 * preferred edition exists) followed by its official-site URLs.
 *
 * @param {object} entity - Wikidata entity record (sitelinks and claims are read).
 * @returns {Array<{name: string, url: string, lang?: string}>} Links in insertion
 *   order, one per distinct URL; `lang` is only set when known.
 */
function buildExternalLinks(entity) {
  // Keyed by URL so a later duplicate never replaces or repeats an earlier link.
  const byUrl = new Map();
  const register = (candidate) => {
    if (!candidate || !candidate.url || byUrl.has(candidate.url)) {
      return;
    }
    const link = { name: candidate.name || "Lien", url: candidate.url };
    if (candidate.lang) {
      link.lang = candidate.lang;
    }
    byUrl.set(candidate.url, link);
  };

  const wikipedia = pickWikipediaLink(entity.sitelinks);
  if (wikipedia) {
    register({ name: "Page Wikipédia", url: wikipedia.url, lang: wikipedia.lang });
  }

  collectClaimUrls(entity, OFFICIAL_SITE_PROPERTY).forEach((url) => {
    const lang = inferLanguageFromUrl(url);
    register(lang ? { name: "Site officiel", url, lang } : { name: "Site officiel", url });
  });

  return [...byUrl.values()];
}
/**
 * Compute the taxonomy terms an entity contributes.
 *
 * Claim values of each property in typeConfig.taxonomyMap are resolved through
 * labelLookup; a result that is empty or still looks like a bare item ID is
 * dropped. Role qualifier values feed "personnages_de_fiction": item IDs are
 * resolved the same way, any other non-empty value is used as is.
 *
 * @param {object} entity - Wikidata entity record.
 * @param {object} typeConfig - Entry of TYPE_CONFIG.
 * @param {(id: string) => string} labelLookup - Resolves an item ID to a label.
 * @returns {Object<string, string[]>} Unique terms per taxonomy; taxonomies with
 *   no term are absent.
 */
function buildTaxonomyValues(entity, typeConfig, labelLookup) {
  const buckets = new Map();
  const remember = (taxonomy, term) => {
    if (!term) {
      return;
    }
    if (!buckets.has(taxonomy)) {
      buckets.set(taxonomy, new Set());
    }
    buckets.get(taxonomy).add(term);
  };
  // A label is usable when it is non-empty and is not an unresolved item ID.
  const isUsable = (label) => Boolean(label) && !WIKIDATA_ID_PATTERN.test(label);

  Object.entries(typeConfig.taxonomyMap).forEach(([taxonomy, properties]) => {
    properties.forEach((property) => {
      collectClaimIds(entity, property).forEach((id) => {
        const label = labelLookup(id);
        if (isUsable(label)) {
          remember(taxonomy, label);
        }
      });
    });
  });

  if (typeConfig.roleQualifiers) {
    collectRoleIds(entity, typeConfig.roleQualifiers).forEach((roleId) => {
      const isItemId = typeof roleId === "string" && WIKIDATA_ID_PATTERN.test(roleId);
      if (!isItemId) {
        remember("personnages_de_fiction", roleId);
        return;
      }
      const label = labelLookup(roleId);
      if (isUsable(label)) {
        remember("personnages_de_fiction", label);
      }
    });
  }

  return Object.fromEntries([...buckets].map(([taxonomy, terms]) => [taxonomy, [...terms]]));
}
/**
 * Add missing taxonomy terms to the frontmatter, in place.
 *
 * Existing terms are never removed. A scalar existing value is treated as a
 * one-element list. When at least one term is added to a taxonomy, its list is
 * re-sorted with French collation; untouched taxonomies keep their shape.
 *
 * Sorting compares the string form of each term, so a list that already holds a
 * non-string YAML scalar (e.g. an unquoted `2001`) no longer makes the sort throw.
 *
 * @param {object} frontmatter - Parsed frontmatter, mutated in place.
 * @param {Object<string, string[]>} newValues - Terms to add, per taxonomy.
 * @returns {boolean} True when any taxonomy gained a term.
 */
function mergeFrontmatter(frontmatter, newValues) {
  const collate = (a, b) => String(a).localeCompare(String(b), "fr");
  let updated = false;

  for (const [taxonomy, values] of Object.entries(newValues)) {
    if (!values || values.length === 0) {
      continue;
    }
    const current = frontmatter[taxonomy];
    const list = Array.isArray(current) ? [...current] : current ? [current] : [];
    const existing = new Set(list);
    let added = 0;
    for (const value of values) {
      if (!existing.has(value)) {
        list.push(value);
        existing.add(value);
        added += 1;
      }
    }
    if (added > 0) {
      list.sort(collate);
      frontmatter[taxonomy] = list;
      updated = true;
      console.log(` ↳ Added ${added} value(s) to "${taxonomy}"`);
    }
  }
  return updated;
}
/**
 * Set the frontmatter "links" field, but only when the frontmatter does not
 * already own such a field (whatever its value).
 *
 * @param {object} frontmatter - Parsed frontmatter, mutated in place.
 * @param {object[]} linksToAdd - Links to store.
 * @returns {boolean} True when the field was written.
 */
function mergeLinks(frontmatter, linksToAdd) {
  const nothingToAdd = !linksToAdd || linksToAdd.length === 0;
  if (nothingToAdd || Object.hasOwn(frontmatter, "links")) {
    return false;
  }
  // Store a copy so later edits to the caller's array do not leak in.
  frontmatter.links = [...linksToAdd];
  console.log(` ↳ Added ${linksToAdd.length} link(s) to "links"`);
  return true;
}
/**
 * Ask the user which search result matches the critique.
 *
 * With a single result the user confirms it (empty answer, "y" or "yes" accept).
 * With several, a numbered list is printed and the question is repeated until a
 * listed number or 0 (cancel) is entered. Only answers made of digits are
 * accepted, so input such as "2abc" is rejected instead of being read as 2.
 *
 * Each candidate is shown as "label — description [id]"; the separator keeps the
 * label and description from running together.
 *
 * @param {Array<{id: string, label: string, description?: string}>} results - Candidates.
 * @param {{question: (prompt: string) => Promise<string>}} rl - Readline interface.
 * @returns {Promise<object|null>} The chosen result, or null when declined/cancelled.
 */
async function promptForSelection(results, rl) {
  const describe = (entry) =>
    `${entry.label} — ${entry.description || "sans description"} [${entry.id}]`;

  if (results.length === 1) {
    const [only] = results;
    const answer = (await rl.question(`Found a single match: ${describe(only)}. Use it? (Y/n) `)).trim();
    return answer === "" || /^y(es)?$/i.test(answer) ? only : null;
  }

  console.log("Sélectionnez l'œuvre correspondante :");
  results.forEach((result, index) => {
    console.log(` ${index + 1}. ${describe(result)}`);
  });
  console.log(" 0. Annuler");

  while (true) {
    const answer = (await rl.question("Choix : ")).trim();
    if (/^\d+$/.test(answer)) {
      const choice = Number.parseInt(answer, 10);
      if (choice === 0) {
        return null;
      }
      if (choice <= results.length) {
        return results[choice - 1];
      }
    }
    console.log("Veuillez saisir un numéro valide ou 0 pour annuler.");
  }
}
/**
 * Synchronise one critique bundle with Wikidata.
 *
 * Steps: resolve the bundle and its type, read the frontmatter, reuse the stored
 * Wikidata ID or search for one (auto-selecting or prompting), load the entity,
 * resolve the labels of every referenced item, then merge taxonomy terms, links
 * and the entity ID into the frontmatter. The file is rewritten only when
 * something changed.
 *
 * Bundles whose type cannot be detected or is not configured, that have neither a
 * stored ID nor a search term, that yield no search result, or whose selection is
 * cancelled are skipped with a message.
 *
 * @param {string} target - Bundle directory or path to its index.md.
 * @param {object} options - Parsed CLI options (language, limit, query, autoSelect).
 * @param {object|null} rl - Readline interface; only used when autoSelect is off.
 * @returns {Promise<void>}
 * @throws {Error} Propagates path, frontmatter and Wikidata API errors.
 */
async function processCritique(target, options, rl) {
  const articleDir = resolveArticleDir(target);
  const typeKey = detectCritiqueType(articleDir);
  if (!typeKey) {
    console.log(`⚠️  Impossible de déduire le type pour ${articleDir}. Ignoré.`);
    return;
  }
  const typeConfig = TYPE_CONFIG[typeKey];
  if (!typeConfig) {
    console.log(`⚠️  Type "${typeKey}" non pris en charge pour ${articleDir}.`);
    return;
  }

  const indexPath = getIndexPath(articleDir);
  const { data: frontmatter, body } = readFrontmatter(indexPath);
  const storedEntityId = frontmatter[WIKIDATA_ID_FIELD];
  const searchTerm = options.query || frontmatter.title;
  if (!storedEntityId && !searchTerm) {
    console.log(`⚠️  Aucun titre trouvé dans ${indexPath}.`);
    return;
  }

  console.log(`\n📄 ${indexPath}`);
  console.log(` Type détecté : ${typeKey}`);

  let entityId = storedEntityId;
  let selection = null;
  if (entityId) {
    // A stored ID short-circuits the search entirely.
    console.log(` 🆔 Identifiant Wikidata déjà enregistré : ${entityId}`);
  } else {
    console.log(` Recherche Wikidata : "${searchTerm}"`);
    const results = await searchEntities(searchTerm, typeConfig, options);
    if (!results.length) {
      console.log(" ❌ Aucun résultat Wikidata trouvé.");
      return;
    }
    if (options.autoSelect) {
      selection = results[0];
      // Label and description are separated so they do not run together.
      console.log(
        ` ⚙️  Mode automatique: sélection du premier résultat (${selection.label} — ${selection.description || "sans description"})`
      );
    } else {
      selection = await promptForSelection(results, rl);
    }
    if (!selection) {
      console.log(" ❎ Sélection annulée.");
      return;
    }
    entityId = selection.id;
  }

  const languages = buildLanguageOrder(options.language);
  const entity = await fetchEntity(entityId, languages);
  const entityLabel = pickLabel(entity.labels, languages) || selection?.label || entityId;
  if (storedEntityId) {
    console.log(` ✔ Entité chargée : ${entityLabel} (${entityId})`);
  } else {
    console.log(` ✔ Entité sélectionnée : ${entityLabel} (${entityId})`);
  }

  // Gather every item ID whose label is needed, so labels are fetched in batches.
  const idsToResolve = new Set();
  for (const properties of Object.values(typeConfig.taxonomyMap)) {
    for (const property of properties) {
      collectClaimIds(entity, property).forEach((id) => idsToResolve.add(id));
    }
  }
  if (typeConfig.roleQualifiers) {
    collectRoleIds(entity, typeConfig.roleQualifiers)
      .filter((id) => typeof id === "string" && id.startsWith("Q"))
      .forEach((id) => idsToResolve.add(id));
  }
  const labelMap = await fetchLabels([...idsToResolve], languages);
  // Unresolved IDs map to themselves; buildTaxonomyValues filters those out.
  const lookup = (id) => labelMap[id] || id;

  const taxonomyValues = buildTaxonomyValues(entity, typeConfig, lookup);
  const externalLinks = buildExternalLinks(entity);

  let updated = mergeFrontmatter(frontmatter, taxonomyValues);
  if (mergeLinks(frontmatter, externalLinks)) {
    updated = true;
  }
  if (frontmatter[WIKIDATA_ID_FIELD] !== entityId) {
    frontmatter[WIKIDATA_ID_FIELD] = entityId;
    updated = true;
    console.log(` ↳ Champ ${WIKIDATA_ID_FIELD} ajouté/mis à jour`);
  }
  if (!updated) {
    console.log(" Aucun nouveau terme à ajouter.");
    return;
  }

  // lineWidth: -1 disables js-yaml line folding so long values stay on one line.
  const newFrontmatter = yaml.dump(frontmatter, { lineWidth: -1 });
  fs.writeFileSync(indexPath, `---\n${newFrontmatter}---\n${body}`, "utf8");
  console.log(" 💾 Frontmatter mis à jour.");
}
/**
 * CLI entry point: parse arguments, determine the bundles to process (explicit
 * paths, or every bundle under the default critiques directory) and process them
 * one after another.
 *
 * Argument parsing, bundle discovery and processing all run inside the same try
 * block, so any failure - including a filesystem error while scanning - is
 * reported as "Erreur: ..." with exit code 1 instead of surfacing as an
 * unhandled promise rejection. Processing stops at the first error. The readline
 * interface, created only in interactive mode, is always closed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let rl = null;
  try {
    const { options, targets: cliTargets } = parseArgs(process.argv);
    if (options.help) {
      showUsage();
      return;
    }

    let targets = [...cliTargets];
    if (targets.length === 0) {
      console.log(`🔄 Recherche de critiques dans ${DEFAULT_CRITIQUES_ROOT}...`);
      targets = collectCritiqueBundles(DEFAULT_CRITIQUES_ROOT);
      if (targets.length === 0) {
        console.log("Aucune critique à traiter. Veuillez fournir un chemin explicite.");
        return;
      }
      console.log(`${targets.length} critique(s) détectée(s).`);
    }

    // No prompt is ever shown in auto mode, so stdin is left untouched.
    rl = options.autoSelect ? null : readline.createInterface({ input: stdin, output: stdout });
    for (const target of targets) {
      await processCritique(target, options, rl);
    }
  } catch (error) {
    console.error(`Erreur: ${error?.message ?? error}`);
    process.exitCode = 1;
  } finally {
    if (rl) {
      rl.close();
    }
  }
}
main();