1
Files
2025/tools/sync_wiki_metadata.js

720 lines
21 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* Synchronise Hugo critique frontmatter with Wikidata metadata.
*
* The script:
* 1. Reads the critique bundle (expects an index.md with YAML frontmatter).
* 2. Uses the frontmatter title (or --query) plus the bundle type (film, série, etc.)
* to search Wikidata and lets the user confirm the right entity.
* 3. Fetches structured data (genres, cast, crew, companies...) according to the
* Hugo taxonomies currently available in config/_default/taxonomies.yaml.
* 4. Adds the missing taxonomy terms to the frontmatter without removing anything.
*/
const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");
const readline = require("node:readline/promises");
const { stdin, stdout } = require("node:process");
// Label languages tried, in this order, after the caller's primary language
// (buildLanguageOrder puts the primary language first and removes duplicates).
const LANGUAGE_FALLBACK = ["fr", "fr-ca", "fr-fr", "en", "en-gb", "en-ca"];
// Chunk size used by fetchLabels when batching ids into one wbgetentities call.
// NOTE(review): presumably mirrors the API's per-request id cap — confirm.
const MAX_WIKIDATA_IDS_PER_REQUEST = 50;
// Frontmatter key under which the confirmed entity id is stored.
const WIKIDATA_ID_FIELD = "wikidata_id";
// The project root is the parent of the directory containing this script.
const PROJECT_ROOT = path.resolve(__dirname, "..");
// Directory scanned for bundles when no path is given on the command line.
const DEFAULT_CRITIQUES_ROOT = path.join(PROJECT_ROOT, "content", "critiques");
// Wikipedia editions tried, in order, by pickWikipediaLink.
const WIKIPEDIA_PREFERRED_LANGS = ["fr", "en"];
// Claim property read by buildExternalLinks to produce "Site officiel" links.
const OFFICIAL_SITE_PROPERTY = "P856";
// Matches a bare item id ("Q" followed by digits, case-insensitive); used to
// tell resolved labels apart from ids that could not be resolved.
const WIKIDATA_ID_PATTERN = /^Q\d+$/i;
// Per-type settings, keyed by the directory name that follows "critiques" in a
// bundle path (the value returned by detectCritiqueType).
// - label: human-readable type name; not read by any function visible in this file.
// - queryAugment: text appended to the title for the first search attempt
//   (searchEntities retries with the bare title when that finds nothing).
// - descriptionHints: substrings matched case-insensitively against result
//   descriptions to narrow the search results.
// - taxonomyMap: frontmatter taxonomy name -> claim properties whose item values
//   become taxonomy terms.
// - roleQualifiers: claim/qualifier pairs whose qualifier values are collected
//   as "personnages_de_fiction" terms by buildTaxonomyValues.
// NOTE(review): property meanings are not stated in this file. Per the Wikidata
// property list they appear to be: P136 genre, P57 director, P58 screenwriter,
// P162 producer, P86 composer, P161 cast member, P272 production company,
// P750 distributed by, P449 original broadcaster, P178 developer,
// P123 publisher, P50 author, P110 illustrator, P453 character role — confirm.
const TYPE_CONFIG = {
films: {
label: "film",
queryAugment: "film",
descriptionHints: ["film", "movie"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P57", "P58", "P162", "P86", "P161"],
entreprises: ["P272", "P750"],
},
roleQualifiers: [{ claim: "P161", qualifier: "P453" }],
},
series: {
label: "série TV",
queryAugment: '"série télévisée"',
descriptionHints: ["série", "tv series", "télévisée", "television"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P57", "P58", "P162", "P86", "P161"],
entreprises: ["P272", "P449", "P750"],
},
roleQualifiers: [{ claim: "P161", qualifier: "P453" }],
},
"jeux-video": {
label: "jeu vidéo",
queryAugment: '"jeu vidéo"',
descriptionHints: ["jeu vidéo", "video game", "jeu-vidéo", "jeu video"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P57", "P58", "P162", "P86", "P161"],
entreprises: ["P178", "P123", "P750"],
},
roleQualifiers: [{ claim: "P161", qualifier: "P453" }],
},
// No roleQualifiers here, so no "personnages_de_fiction" terms are produced for books.
livres: {
label: "livre",
queryAugment: "livre",
descriptionHints: ["roman", "novel", "book", "livre", "comic", "nouvelle", "script"],
taxonomyMap: {
genres: ["P136"],
personnalites: ["P50", "P110"],
entreprises: ["P123"],
},
},
};
/**
 * Split the command line into option flags and positional bundle paths.
 *
 * Recognised flags: --lang=, --limit=, --query=, --auto/--yes, --help/-h.
 * Any other non-empty token is treated as a target path. Invalid or empty
 * flag values leave the corresponding default untouched.
 *
 * @param {string[]} argv Full argv array; the first two entries are skipped.
 * @returns {{options: object, targets: string[]}} Parsed options and targets.
 */
function parseArgs(argv) {
  const options = { language: "fr", limit: 8, query: null, autoSelect: false };
  const targets = [];
  const valueAfter = (token, prefix) => token.slice(prefix.length);

  argv.slice(2).forEach((token) => {
    if (token.startsWith("--lang=")) {
      const requested = valueAfter(token, "--lang=").trim();
      if (requested) {
        options.language = requested;
      }
      return;
    }
    if (token.startsWith("--limit=")) {
      const parsed = Number.parseInt(valueAfter(token, "--limit="), 10);
      // NaN fails this comparison too, so no separate NaN check is needed.
      if (parsed > 0) {
        options.limit = parsed;
      }
      return;
    }
    if (token.startsWith("--query=")) {
      const text = valueAfter(token, "--query=").trim();
      options.query = text === "" ? null : text;
      return;
    }
    switch (token) {
      case "--auto":
      case "--yes":
        options.autoSelect = true;
        return;
      case "--help":
      case "-h":
        options.help = true;
        return;
      case "":
        return;
      default:
        targets.push(token);
    }
  });

  return { options, targets };
}
/**
 * Print the command-line help to stdout.
 *
 * The text lists every flag accepted by parseArgs (including the --yes alias
 * and --help/-h) and shows the bundle path as optional, matching the notes.
 */
function showUsage() {
  console.log(`Usage: node tools/sync_wiki_metadata.js [options] [critique_path...]
Options
--lang=fr Primary language used for Wikidata labels (default: fr)
--limit=8 Max number of Wikidata search results to show
--query="..." Override the query derived from the frontmatter title
--auto, --yes Skip the interactive prompt and pick the first result
--help, -h Show this help and exit
Notes
• Without arguments, every critique bundle under content/critiques is processed.
• Provide one or more bundle paths to limit the scope manually.
Examples
node tools/sync_wiki_metadata.js content/critiques/films/crocodile-dunde-ii
node tools/sync_wiki_metadata.js --lang=en --query="Galaxy Quest film" content/critiques/films/galaxy-quest
`);
}
/**
 * Turn a user-supplied path into the absolute directory of a critique bundle.
 *
 * @param {string} targetPath A bundle directory or the index.md inside it.
 * @returns {string} Absolute path of the bundle directory.
 * @throws {Error} If the path does not exist, is a file other than index.md,
 *   or is neither a regular file nor a directory.
 */
function resolveArticleDir(targetPath) {
  const resolved = path.resolve(targetPath);
  if (fs.existsSync(resolved) === false) {
    throw new Error(`Path not found: ${resolved}`);
  }

  const info = fs.statSync(resolved);
  if (info.isDirectory()) {
    return resolved;
  }
  if (!info.isFile()) {
    throw new Error(`Unsupported path type: ${resolved}`);
  }

  // A file is accepted only when it is the bundle's own index.md.
  if (path.basename(resolved) === "index.md") {
    return path.dirname(resolved);
  }
  throw new Error(`Expected an index.md, got ${resolved}`);
}
/**
 * Locate the index.md of a bundle directory.
 *
 * @param {string} articleDir Bundle directory.
 * @returns {string} Path of the index.md inside articleDir.
 * @throws {Error} If the directory has no index.md.
 */
function getIndexPath(articleDir) {
  const candidate = path.join(articleDir, "index.md");
  if (fs.existsSync(candidate)) {
    return candidate;
  }
  throw new Error(`Missing index.md in ${articleDir}`);
}
/**
 * Read a Markdown file and split it into parsed YAML frontmatter and body.
 *
 * Accepts LF and CRLF line endings and ignores a leading UTF-8 byte-order
 * mark, so files saved by Windows editors are not rejected.
 *
 * @param {string} indexPath Path of the Markdown file.
 * @returns {{data: object, body: string}} Frontmatter mapping and the text
 *   following the closing delimiter (line endings preserved).
 * @throws {Error} If no frontmatter block is found or it is not a mapping.
 */
function readFrontmatter(indexPath) {
  // readFileSync keeps a BOM as U+FEFF, which would defeat the ^--- anchor.
  const raw = fs.readFileSync(indexPath, "utf8").replace(/^\uFEFF/, "");
  const match = raw.match(/^---\r?\n([\s\S]+?)\r?\n---(?:\r?\n)?([\s\S]*)$/);
  if (!match) {
    throw new Error(`No valid frontmatter found in ${indexPath}`);
  }
  // NOTE(review): yaml.load is only safe on untrusted input with js-yaml >= 4;
  // confirm the version pinned by the project.
  const data = yaml.load(match[1]) || {};
  // Callers read and assign keys on the result, so a scalar or a list would be
  // silently mangled when written back.
  if (typeof data !== "object" || Array.isArray(data)) {
    throw new Error(`Frontmatter in ${indexPath} is not a YAML mapping`);
  }
  const body = match[2] || "";
  return { data, body };
}
/**
 * Derive the critique type from a bundle path.
 *
 * The type is the path segment immediately after the last "critiques" segment.
 *
 * @param {string} articleDir Bundle directory, using the platform separator.
 * @returns {string|null} The segment after "critiques", or null when there is
 *   no "critiques" segment or nothing follows it.
 */
function detectCritiqueType(articleDir) {
  const segments = articleDir.split(path.sep);
  const anchor = segments.lastIndexOf("critiques");
  const hasChild = anchor !== -1 && anchor + 1 < segments.length;
  return hasChild ? segments[anchor + 1] : null;
}
/**
 * Find every bundle directory (a directory holding an index.md) under rootDir.
 *
 * A directory that contains index.md is reported and not descended into;
 * directories whose name starts with "." are skipped.
 *
 * @param {string} rootDir Directory to scan.
 * @returns {string[]} Bundle directories sorted with French collation; empty
 *   when rootDir does not exist.
 */
function collectCritiqueBundles(rootDir) {
  if (!fs.existsSync(rootDir)) {
    return [];
  }

  const found = [];
  const pending = [rootDir];
  // Depth-first walk driven by an explicit stack.
  for (let dir = pending.pop(); dir !== undefined; dir = pending.pop()) {
    const children = fs.readdirSync(dir, { withFileTypes: true });
    const isBundle = children.some((child) => child.isFile() && child.name === "index.md");
    if (isBundle) {
      found.push(dir);
    } else {
      const subdirs = children
        .filter((child) => child.isDirectory() && !child.name.startsWith("."))
        .map((child) => path.join(dir, child.name));
      pending.push(...subdirs);
    }
  }

  return found.sort((left, right) => left.localeCompare(right, "fr"));
}
/**
 * Build the label-language preference list.
 *
 * @param {string} primary Language to try first.
 * @returns {string[]} primary followed by LANGUAGE_FALLBACK, each language
 *   kept only at its first position.
 */
function buildLanguageOrder(primary) {
  // A Set keeps first-insertion order, which removes later duplicates.
  return [...new Set([primary, ...LANGUAGE_FALLBACK])];
}
/**
 * Call the Wikidata Action API and return the decoded JSON payload.
 *
 * Parameters that are null, undefined or `false` are omitted. The MediaWiki
 * API treats a boolean parameter as true whenever it is present, whatever its
 * value, so sending "false" would enable the flag. `true` is sent as "1".
 *
 * @param {Object<string, *>} params Query parameters ("format" is forced to json).
 * @returns {Promise<object>} Decoded response body.
 * @throws {Error} On a non-2xx HTTP status, or when the body carries an API
 *   error (the API reports most failures with HTTP 200 and an `error` object).
 */
async function wikidataApi(params) {
  const url = new URL("https://www.wikidata.org/w/api.php");
  for (const [key, value] of Object.entries(params)) {
    if (value === undefined || value === null || value === false) {
      continue;
    }
    url.searchParams.set(key, value === true ? "1" : String(value));
  }
  url.searchParams.set("format", "json");

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Wikidata API error ${response.status}: ${response.statusText}`);
  }

  const payload = await response.json();
  if (payload && typeof payload === "object" && payload.error) {
    const { code = "unknown", info = "no details" } = payload.error;
    throw new Error(`Wikidata API error ${code}: ${info}`);
  }
  return payload;
}
/**
 * Search Wikidata items matching a title.
 *
 * The title is first searched together with the type's queryAugment, then on
 * its own; the first query that yields results wins. When the type defines
 * descriptionHints, results whose description contains one of them are
 * preferred, but the unfiltered list is returned if none match.
 *
 * @param {string} term Title to look for.
 * @param {{queryAugment?: string, descriptionHints?: string[]}} typeConfig
 * @param {{language: string, limit: number}} options
 * @returns {Promise<object[]>} Search result entries, possibly empty.
 */
async function searchEntities(term, typeConfig, options) {
  const queries = typeConfig.queryAugment ? [`${term} ${typeConfig.queryAugment}`, term] : [term];
  const hints = (typeConfig.descriptionHints || []).map((hint) => hint.toLowerCase());

  for (const query of queries) {
    // "strictlanguage" is deliberately not sent: the API treats a boolean
    // parameter as true whenever it is present, so "strictlanguage=false"
    // would switch language fallback off instead of keeping it on.
    const data = await wikidataApi({
      action: "wbsearchentities",
      search: query,
      language: options.language,
      uselang: options.language,
      type: "item",
      limit: String(options.limit),
      origin: "*",
    });
    const results = data.search || [];
    if (results.length === 0) {
      continue;
    }
    if (hints.length === 0) {
      return results;
    }
    const filtered = results.filter((entry) => {
      if (!entry.description) {
        return false;
      }
      const description = entry.description.toLowerCase();
      return hints.some((hint) => description.includes(hint));
    });
    return filtered.length > 0 ? filtered : results;
  }
  return [];
}
/**
 * Load one Wikidata entity with its labels, descriptions, claims and sitelinks.
 *
 * @param {string} entityId Item id such as "Q42".
 * @param {string[]} languages Languages requested for labels/descriptions.
 * @returns {Promise<object>} The entity record.
 * @throws {Error} If the entity is absent from the response or flagged missing.
 */
async function fetchEntity(entityId, languages) {
  const data = await wikidataApi({
    action: "wbgetentities",
    ids: entityId,
    props: "labels|descriptions|claims|sitelinks",
    languages: languages.join("|"),
    origin: "*",
  });
  const entities = data.entities || {};
  // NOTE(review): a redirected id is believed to come back keyed by its target
  // id with a `redirects: { from, to }` marker — confirm against the
  // wbgetentities documentation. If the shape differs this lookup simply
  // finds nothing and the error below is raised as before.
  const entity =
    entities[entityId] ?? Object.values(entities).find((candidate) => candidate?.redirects?.from === entityId);
  // An unknown id is answered with a placeholder carrying a "missing" key,
  // which must not be mistaken for a real (empty) entity.
  if (!entity || Object.hasOwn(entity, "missing")) {
    throw new Error(`Unable to load Wikidata entity ${entityId}`);
  }
  return entity;
}
/**
 * List the entity ids referenced by the main values of one property's claims.
 *
 * @param {object} entity Entity record (its `claims` may be absent).
 * @param {string} property Property id, e.g. "P136".
 * @returns {string[]} Ids in claim order, duplicates kept; claims whose main
 *   value is not an object with an `id` are ignored.
 */
function collectClaimIds(entity, property) {
  const statements = entity.claims?.[property];
  if (!statements) {
    return [];
  }
  return Array.from(statements, (statement) => statement.mainsnak?.datavalue?.value)
    .filter((value) => value && typeof value === "object" && value.id)
    .map((value) => value.id);
}
/**
 * List the distinct string values of one property's claims.
 *
 * @param {object} entity Entity record (its `claims` may be absent).
 * @param {string} property Property id, e.g. "P856".
 * @returns {string[]} Unique string main values, in first-seen order.
 */
function collectClaimUrls(entity, property) {
  const unique = new Set();
  for (const statement of entity.claims?.[property] || []) {
    const candidate = statement.mainsnak?.datavalue?.value;
    if (typeof candidate === "string") {
      unique.add(candidate);
    }
  }
  return Array.from(unique);
}
/**
 * Collect qualifier values attached to the configured claims.
 *
 * For each { claim, qualifier } pair, every qualifier snak of that kind on
 * every claim of that property contributes its value's `id`, or failing that
 * its `text`; other values are ignored.
 *
 * @param {object} entity Entity record (its `claims` may be absent).
 * @param {Array<{claim: string, qualifier: string}>} [roleConfig]
 * @returns {string[]} Collected ids/texts in encounter order, duplicates kept.
 */
function collectRoleIds(entity, roleConfig) {
  if (!roleConfig) {
    return [];
  }

  const valuesOf = (snak) => {
    const value = snak.datavalue?.value;
    if (!value || typeof value !== "object") {
      return [];
    }
    if (value.id) {
      return [value.id];
    }
    return value.text ? [value.text] : [];
  };

  return Array.from(roleConfig).flatMap(({ claim, qualifier }) =>
    Array.from(entity.claims?.[claim] || []).flatMap((statement) =>
      Array.from(statement.qualifiers?.[qualifier] || []).flatMap(valuesOf)
    )
  );
}
/**
 * Resolve item ids to human-readable labels.
 *
 * Only strings matching WIKIDATA_ID_PATTERN are requested. Free text that
 * merely starts with "Q" (for instance a role name) must never be sent as an
 * id, because an invalid id would make the API reject the request for its
 * whole chunk.
 *
 * @param {Array<*>} ids Candidate ids; non-ids and duplicates are dropped.
 * @param {string[]} languages Label languages in order of preference.
 * @returns {Promise<Object<string, string>>} Map id -> label, falling back to
 *   the id itself when no label is available.
 */
async function fetchLabels(ids, languages) {
  const uniqueIds = [
    ...new Set(ids.filter((value) => typeof value === "string" && WIKIDATA_ID_PATTERN.test(value))),
  ];
  const labelMap = {};
  // Chunks are requested one after the other on purpose, to stay gentle on the API.
  for (let start = 0; start < uniqueIds.length; start += MAX_WIKIDATA_IDS_PER_REQUEST) {
    const chunk = uniqueIds.slice(start, start + MAX_WIKIDATA_IDS_PER_REQUEST);
    const data = await wikidataApi({
      action: "wbgetentities",
      ids: chunk.join("|"),
      props: "labels",
      languages: languages.join("|"),
      origin: "*",
    });
    for (const [id, entity] of Object.entries(data.entities || {})) {
      labelMap[id] = pickLabel(entity.labels, languages) || id;
    }
  }
  return labelMap;
}
/**
 * Choose the best label from a language-keyed label map.
 *
 * @param {Object<string, {value: string}>} [labels] Labels keyed by language.
 * @param {string[]} languages Languages in order of preference.
 * @returns {string|null} The label in the first preferred language available,
 *   otherwise the first label of the map, otherwise null.
 */
function pickLabel(labels = {}, languages) {
  const preferred = languages.map((code) => labels[code]).find(Boolean);
  if (preferred) {
    return preferred.value;
  }
  const [anyLabel] = Object.values(labels);
  return anyLabel ? anyLabel.value : null;
}
/**
 * Pick a Wikipedia article link from an entity's sitelinks.
 *
 * Languages are tried in WIKIPEDIA_PREFERRED_LANGS order. A sitelink's own
 * `url` is used when present; otherwise the URL is built from its `title`.
 *
 * @param {Object<string, {url?: string, title?: string}>} [sitelinks]
 * @returns {{lang: string, url: string}|null} First usable link, or null.
 */
function pickWikipediaLink(sitelinks = {}) {
  const linkFor = (lang) => {
    const entry = sitelinks[`${lang}wiki`];
    if (!entry) {
      return null;
    }
    if (entry.url) {
      return { lang, url: entry.url };
    }
    if (!entry.title) {
      return null;
    }
    // Article URLs use underscores where titles have spaces.
    const slug = encodeURIComponent(entry.title.split(" ").join("_"));
    return { lang, url: `https://${lang}.wikipedia.org/wiki/${slug}` };
  };

  for (const lang of WIKIPEDIA_PREFERRED_LANGS) {
    const link = linkFor(lang);
    if (link) {
      return link;
    }
  }
  return null;
}
/**
 * Guess the language of a web page from its URL.
 *
 * Country-code hosts (.fr, .de, .es, .it, .pt, .br) decide on their own. For
 * generic or English-speaking hosts (.com, .us, .uk) English is only reported
 * when the path has an explicit locale segment ("en", "en-us", "en_GB"...).
 * Other hosts are reported as French when the path has an "fr" locale segment.
 *
 * Path checks work on whole segments, so "/entertainment" or "/enemy" are no
 * longer mistaken for an "/en" locale prefix.
 *
 * @param {string} rawUrl URL to inspect.
 * @returns {string|null} Language code, or null when nothing can be inferred
 *   or the URL cannot be parsed.
 */
function inferLanguageFromUrl(rawUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch {
    return null;
  }

  const host = parsed.hostname.toLowerCase();
  const segments = parsed.pathname.toLowerCase().split("/");
  // A locale segment is the bare code, optionally followed by a 2-letter region.
  const hasLocaleSegment = (code) => {
    const pattern = new RegExp(`^${code}(?:[-_][a-z]{2})?$`);
    return segments.some((segment) => pattern.test(segment));
  };

  if (host.endsWith(".fr") || host.includes(".fr.")) {
    return "fr";
  }
  const countryHosts = [
    [".de", "de"],
    [".es", "es"],
    [".it", "it"],
    [".pt", "pt"],
    [".br", "pt"],
  ];
  for (const [suffix, lang] of countryHosts) {
    if (host.endsWith(suffix)) {
      return lang;
    }
  }
  if ([".co.uk", ".uk", ".us", ".com"].some((suffix) => host.endsWith(suffix))) {
    // only infer English when explicitly present in the path
    return hasLocaleSegment("en") ? "en" : null;
  }
  return hasLocaleSegment("fr") ? "fr" : null;
}
/**
 * Build the list of external links for an entity.
 *
 * The Wikipedia article (if any) comes first, followed by every official
 * website claim. A URL is listed once; the first occurrence wins.
 *
 * @param {object} entity Entity record with `sitelinks` and `claims`.
 * @returns {Array<{name: string, url: string, lang?: string}>} Link records.
 */
function buildExternalLinks(entity) {
  // A Map keeps insertion order and gives the URL de-duplication for free.
  const byUrl = new Map();
  const register = (candidate) => {
    if (!candidate || !candidate.url || byUrl.has(candidate.url)) {
      return;
    }
    const link = { name: candidate.name || "Lien", url: candidate.url };
    if (candidate.lang) {
      link.lang = candidate.lang;
    }
    byUrl.set(candidate.url, link);
  };

  const wikipedia = pickWikipediaLink(entity.sitelinks);
  if (wikipedia) {
    register({ name: "Page Wikipédia", url: wikipedia.url, lang: wikipedia.lang });
  }

  collectClaimUrls(entity, OFFICIAL_SITE_PROPERTY).forEach((url) => {
    const lang = inferLanguageFromUrl(url);
    register(lang ? { name: "Site officiel", url, lang } : { name: "Site officiel", url });
  });

  return [...byUrl.values()];
}
/**
 * Compute the taxonomy terms an entity contributes.
 *
 * For each taxonomy of typeConfig.taxonomyMap, the items referenced by the
 * listed properties are turned into labels; ids that could not be resolved to
 * a label are dropped. Role qualifiers, when configured, feed the
 * "personnages_de_fiction" taxonomy: item ids are resolved the same way, any
 * other value is used as is.
 *
 * @param {object} entity Entity record.
 * @param {{taxonomyMap: Object<string, string[]>, roleQualifiers?: object[]}} typeConfig
 * @param {(id: string) => string} labelLookup Resolves an id to a label.
 * @returns {Object<string, string[]>} Distinct terms per taxonomy, in
 *   first-seen order; taxonomies without terms are absent.
 */
function buildTaxonomyValues(entity, typeConfig, labelLookup) {
  const collected = new Map();
  const remember = (taxonomy, value) => {
    if (!value) {
      return;
    }
    if (!collected.has(taxonomy)) {
      collected.set(taxonomy, new Set());
    }
    collected.get(taxonomy).add(value);
  };
  // A lookup that echoes the id back means no label was found.
  const isUnresolved = (label) => !label || WIKIDATA_ID_PATTERN.test(label);

  Object.entries(typeConfig.taxonomyMap).forEach(([taxonomy, properties]) => {
    properties.forEach((property) => {
      collectClaimIds(entity, property)
        .map((id) => labelLookup(id))
        .filter((label) => !isUnresolved(label))
        .forEach((label) => remember(taxonomy, label));
    });
  });

  if (typeConfig.roleQualifiers) {
    for (const roleId of collectRoleIds(entity, typeConfig.roleQualifiers)) {
      const isItemId = typeof roleId === "string" && WIKIDATA_ID_PATTERN.test(roleId);
      const value = isItemId ? labelLookup(roleId) : roleId;
      if (isItemId && isUnresolved(value)) {
        continue;
      }
      remember("personnages_de_fiction", value);
    }
  }

  const result = {};
  for (const [taxonomy, values] of collected) {
    result[taxonomy] = Array.from(values);
  }
  return result;
}
/**
 * Add missing taxonomy terms to the frontmatter, in place.
 *
 * Existing terms are never removed. A scalar existing value is promoted to a
 * list. Terms are compared and sorted by their string form, so values that
 * YAML parsed as numbers (e.g. a bare 2001) neither crash the French-collated
 * sort nor get duplicated by an equal string term.
 *
 * @param {object} frontmatter Frontmatter mapping; mutated.
 * @param {Object<string, string[]>} newValues Terms to add, per taxonomy.
 * @returns {boolean} True when at least one term was added.
 */
function mergeFrontmatter(frontmatter, newValues) {
  let updated = false;
  for (const [taxonomy, values] of Object.entries(newValues)) {
    if (!values || values.length === 0) {
      continue;
    }
    const current = frontmatter[taxonomy];
    let list;
    if (Array.isArray(current)) {
      list = [...current];
    } else if (current === undefined || current === null || current === "") {
      list = [];
    } else {
      // Keep any scalar term, including falsy ones such as 0.
      list = [current];
    }

    const known = new Set(list.map(String));
    let added = 0;
    for (const value of values) {
      const key = String(value);
      if (known.has(key)) {
        continue;
      }
      list.push(value);
      known.add(key);
      added += 1;
    }

    // The list is only re-sorted (and written back) when something was added.
    if (added > 0) {
      list.sort((a, b) => String(a).localeCompare(String(b), "fr"));
      frontmatter[taxonomy] = list;
      updated = true;
      console.log(` ↳ Added ${added} value(s) to "${taxonomy}"`);
    }
  }
  return updated;
}
/**
 * Set the frontmatter "links" list when the frontmatter has none yet.
 *
 * An existing "links" key, whatever its value, is left untouched.
 *
 * @param {object} frontmatter Frontmatter mapping; mutated.
 * @param {object[]} [linksToAdd] Link records to store.
 * @returns {boolean} True when "links" was written.
 */
function mergeLinks(frontmatter, linksToAdd) {
  const nothingToAdd = !linksToAdd || linksToAdd.length === 0;
  if (nothingToAdd || Object.hasOwn(frontmatter, "links")) {
    return false;
  }
  // Store a copy so later changes to the caller's array do not leak in.
  frontmatter.links = [...linksToAdd];
  console.log(` ↳ Added ${linksToAdd.length} link(s) to "links"`);
  return true;
}
/**
 * Ask the user which search result is the right entity.
 *
 * A single result is confirmed with a yes/no question (empty answer means
 * yes); several results are listed and chosen by number, 0 cancelling.
 *
 * @param {Array<{id: string, label?: string, description?: string}>} results
 * @param {{question: (prompt: string) => Promise<string>}} rl Prompt interface.
 * @returns {Promise<object|null>} The chosen result, or null when the user
 *   declines or there is nothing to choose from.
 */
async function promptForSelection(results, rl) {
  // Label and description are separated explicitly, and the id stands in for a
  // label the search did not return, so a line never reads "undefined...".
  const describe = (result) =>
    `${result.label || result.id} — ${result.description || "sans description"} [${result.id}]`;

  if (results.length === 0) {
    // Without this guard the numbered prompt below could only be cancelled.
    return null;
  }

  if (results.length === 1) {
    const [only] = results;
    const answer = (await rl.question(`Found a single match: ${describe(only)}. Use it? (Y/n) `)).trim();
    return answer === "" || /^y(es)?$/i.test(answer) ? only : null;
  }

  console.log("Sélectionnez l'œuvre correspondante :");
  results.forEach((result, index) => {
    console.log(` ${index + 1}. ${describe(result)}`);
  });
  console.log(" 0. Annuler");

  // Keep asking until a listed number (or 0) is entered.
  for (;;) {
    const answer = await rl.question("Choix : ");
    const choice = Number.parseInt(answer, 10);
    if (choice === 0) {
      return null;
    }
    if (choice >= 1 && choice <= results.length) {
      return results[choice - 1];
    }
    console.log("Veuillez saisir un numéro valide ou 0 pour annuler.");
  }
}
/**
 * Synchronise one critique bundle with Wikidata.
 *
 * Steps: resolve the bundle and its type, read the frontmatter, pick the
 * entity (stored id, automatic first hit, or interactive choice), resolve the
 * labels of everything the entity references, then add the missing taxonomy
 * terms, the links (only if none exist) and the entity id. The file is
 * rewritten only when something changed.
 *
 * @param {string} target Bundle directory or its index.md.
 * @param {{language: string, limit: number, query: ?string, autoSelect: boolean}} options
 * @param {?object} rl Prompt interface; unused when options.autoSelect is set.
 * @returns {Promise<void>}
 * @throws {Error} On invalid paths, unreadable frontmatter or API failures.
 */
async function processCritique(target, options, rl) {
  const articleDir = resolveArticleDir(target);
  const typeKey = detectCritiqueType(articleDir);
  if (!typeKey) {
    console.log(`⚠️ Impossible de déduire le type pour ${articleDir}. Ignoré.`);
    return;
  }
  // Own-key check: a directory named like an Object.prototype member
  // ("constructor"...) must not be taken for a configured type.
  const typeConfig = Object.hasOwn(TYPE_CONFIG, typeKey) ? TYPE_CONFIG[typeKey] : undefined;
  if (!typeConfig) {
    console.log(`⚠️ Type "${typeKey}" non pris en charge pour ${articleDir}.`);
    return;
  }

  const indexPath = getIndexPath(articleDir);
  const { data: frontmatter, body } = readFrontmatter(indexPath);
  const storedEntityId = frontmatter[WIKIDATA_ID_FIELD];
  const searchTerm = options.query || frontmatter.title;
  if (!storedEntityId && !searchTerm) {
    console.log(`⚠️ Aucun titre trouvé dans ${indexPath}.`);
    return;
  }
  console.log(`\n📄 ${indexPath}`);
  console.log(` Type détecté : ${typeKey}`);

  let entityId = storedEntityId;
  let selection = null;
  if (entityId) {
    console.log(` 🆔 Identifiant Wikidata déjà enregistré : ${entityId}`);
  } else {
    console.log(` Recherche Wikidata : "${searchTerm}"`);
    const results = await searchEntities(searchTerm, typeConfig, options);
    if (!results.length) {
      console.log(" ❌ Aucun résultat Wikidata trouvé.");
      return;
    }
    if (options.autoSelect) {
      selection = results[0];
      // Label and description are separated explicitly; the id stands in for a
      // label the search did not return.
      const summary = `${selection.label || selection.id} — ${selection.description || "sans description"}`;
      console.log(` ⚙️ Mode automatique: sélection du premier résultat (${summary})`);
    } else {
      selection = await promptForSelection(results, rl);
    }
    if (!selection) {
      console.log(" ❎ Sélection annulée.");
      return;
    }
    entityId = selection.id;
  }

  const languages = buildLanguageOrder(options.language);
  const entity = await fetchEntity(entityId, languages);
  const entityLabel = pickLabel(entity.labels, languages) || selection?.label || entityId;
  if (storedEntityId) {
    console.log(` ✔ Entité chargée : ${entityLabel} (${entityId})`);
  } else {
    console.log(` ✔ Entité sélectionnée : ${entityLabel} (${entityId})`);
  }

  // Gather every item id whose label is needed, so they can be fetched in batches.
  const idsToResolve = new Set();
  for (const properties of Object.values(typeConfig.taxonomyMap)) {
    for (const property of properties) {
      collectClaimIds(entity, property).forEach((id) => idsToResolve.add(id));
    }
  }
  if (typeConfig.roleQualifiers) {
    // Role values may be free text; only genuine item ids are looked up, using
    // the same pattern buildTaxonomyValues relies on.
    collectRoleIds(entity, typeConfig.roleQualifiers)
      .filter((id) => typeof id === "string" && WIKIDATA_ID_PATTERN.test(id))
      .forEach((id) => idsToResolve.add(id));
  }
  const labelMap = await fetchLabels([...idsToResolve], languages);
  // Unresolved ids are echoed back; buildTaxonomyValues drops them.
  const lookup = (id) => labelMap[id] || id;

  const taxonomyValues = buildTaxonomyValues(entity, typeConfig, lookup);
  const externalLinks = buildExternalLinks(entity);
  let updated = mergeFrontmatter(frontmatter, taxonomyValues);
  if (mergeLinks(frontmatter, externalLinks)) {
    updated = true;
  }
  if (frontmatter[WIKIDATA_ID_FIELD] !== entityId) {
    frontmatter[WIKIDATA_ID_FIELD] = entityId;
    updated = true;
    console.log(` ↳ Champ ${WIKIDATA_ID_FIELD} ajouté/mis à jour`);
  }
  if (!updated) {
    console.log(" Aucun nouveau terme à ajouter.");
    return;
  }

  // lineWidth: -1 is passed to yaml.dump — presumably to keep long values on
  // one line; confirm against the js-yaml documentation.
  const newFrontmatter = yaml.dump(frontmatter, { lineWidth: -1 });
  fs.writeFileSync(indexPath, `---\n${newFrontmatter}---\n${body}`, "utf8");
  console.log(" 💾 Frontmatter mis à jour.");
}
/**
 * Command-line entry point.
 *
 * Parses the arguments, determines the bundles to process (the given paths,
 * or every bundle under DEFAULT_CRITIQUES_ROOT) and processes them one after
 * the other. Processing stops at the first error, which is reported on stderr
 * and turns the exit code to 1. Argument parsing and bundle discovery run
 * inside the same error handler, so their failures are reported the same way.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let rl = null;
  try {
    const { options, targets: cliTargets } = parseArgs(process.argv);
    if (options.help) {
      showUsage();
      return;
    }

    let targets = [...cliTargets];
    if (targets.length === 0) {
      console.log(`🔄 Recherche de critiques dans ${DEFAULT_CRITIQUES_ROOT}...`);
      targets = collectCritiqueBundles(DEFAULT_CRITIQUES_ROOT);
      if (targets.length === 0) {
        console.log("Aucune critique à traiter. Veuillez fournir un chemin explicite.");
        return;
      }
      console.log(`${targets.length} critique(s) détectée(s).`);
    }

    // No prompt is opened in automatic mode, so stdin is left untouched.
    rl = options.autoSelect ? null : readline.createInterface({ input: stdin, output: stdout });
    for (const target of targets) {
      await processCritique(target, options, rl);
    }
  } catch (error) {
    console.error(`Erreur: ${error?.message ?? error}`);
    process.exitCode = 1;
  } finally {
    rl?.close();
  }
}

// main() handles its own errors; this catch only covers a failure of the
// cleanup itself, so the promise is never left floating.
main().catch((error) => {
  console.error(`Erreur: ${error?.message ?? error}`);
  process.exitCode = 1;
});