Preliminary automation of keyword assignment and linking
tools/link_frontmatter_keywords.js (new file, 425 lines)
@@ -0,0 +1,425 @@
#!/usr/bin/env node

/**
 * Walks every Markdown article under content/ and, for the first
 * occurrence of each term defined in the frontmatter taxonomies,
 * automatically links it to the keyword's page. Occurrences that
 * are already linked are left untouched.
 *
 * Exits with a non-zero code when at least one file was modified.
 */

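// A minimal usage sketch (the CI wiring is an assumption, not part of
// this commit): run `node tools/link_frontmatter_keywords.js` from the
// repository root; a non-zero exit status means files were rewritten
// and the resulting diff should be reviewed before committing.
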
const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");

const PROJECT_ROOT = path.resolve(__dirname, "..");
const CONTENT_ROOT = path.join(PROJECT_ROOT, "content");
const TAXONOMIES_FILE = path.join(PROJECT_ROOT, "config", "_default", "taxonomies.yaml");
// Captures the YAML frontmatter between the "---" fences (group 1) and
// the article body after it (group 2). LF line endings are assumed.
const FRONTMATTER_PATTERN = /^---\n([\s\S]+?)\n---\n?([\s\S]*)$/;
// Any Unicode letter or digit counts as a word character.
const WORD_CHAR = /[\p{L}\p{N}]/u;
// Emphasis markers that may wrap a keyword and should be pulled inside
// the generated link label.
const INLINE_FORMATTING_CHARS = ["*", "_"];

main();

function main() {
  const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_FILE);
  const files = collectMarkdownFiles(CONTENT_ROOT);

  if (files.length === 0) {
    console.log("Aucun article Markdown trouvé sous content/.");
    return;
  }

  const changed = [];
  for (const filePath of files) {
    if (processFile(filePath, taxonomyMapping)) {
      changed.push(filePath);
    }
  }

  if (changed.length > 0) {
    for (const filePath of changed) {
      const rel = path.relative(PROJECT_ROOT, filePath);
      console.log(`✏️ ${rel}`);
    }
    console.log("Des modifications ont été effectuées. Merci de les revoir.");
    process.exit(2);
  } else {
    console.log("Tous les articles sont déjà correctement liés.");
  }
}

function processFile(filePath, taxonomyMapping) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Impossible de lire ${filePath}: ${error.message}`);
    return false;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    return false;
  }

  let frontmatter;
  try {
    frontmatter = yaml.load(match[1]) || {};
  } catch (error) {
    console.warn(`⚠️ Frontmatter invalide dans ${filePath}: ${error.message}`);
    return false;
  }

  const keywords = extractKeywords(frontmatter, match[1], taxonomyMapping.fieldToCanonical);
  if (keywords.length === 0) {
    return false;
  }

  const { body, changed } = linkKeywordsInBody(match[2], keywords);
  if (!changed) {
    return false;
  }

  // Everything before the body (the frontmatter and its fences) is kept
  // byte-for-byte; only the rewritten body is spliced back in.
  const prefixLength = raw.length - match[2].length;
  const updated = raw.slice(0, prefixLength) + body;
  fs.writeFileSync(filePath, updated, "utf8");
  return true;
}

function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    console.error(`Impossible de lire ${configPath}: ${error.message}`);
    process.exit(1);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    console.error(`YAML invalide dans ${configPath}: ${error.message}`);
    process.exit(1);
  }

  if (typeof data !== "object" || data === null) {
    console.error(`Format inattendu dans ${configPath}`);
    process.exit(1);
  }

  // Hugo declares taxonomies as `singular: plural`; both spellings are
  // accepted as frontmatter field names and resolve to the plural
  // (canonical) form used in URLs.
  const fieldToCanonical = new Map();
  for (const [singular, plural] of Object.entries(data)) {
    const canonicalName =
      typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular.trim();
    if (!canonicalName) continue;
    const candidates = new Set([singular, canonicalName].filter(Boolean));
    for (const name of candidates) {
      fieldToCanonical.set(name, canonicalName);
    }
  }

  if (fieldToCanonical.size === 0) {
    console.error("Aucune taxonomie n'est définie dans la configuration.");
    process.exit(1);
  }

  return { fieldToCanonical };
}

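// For reference, a taxonomies.yaml in Hugo's usual singular-to-plural
// shape (illustrative values, not taken from this repository):
//
//   tag: tags
//   category: categories
//
// maps both "tag" and "tags" to the canonical name "tags", so either
// field name in an article's frontmatter is recognized.
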
function collectMarkdownFiles(root) {
  const files = [];
  walk(root, files);
  return files.sort((a, b) => a.localeCompare(b));
}

function walk(dir, bucket) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Impossible de parcourir ${dir}: ${error.message}`);
    return;
  }

  for (const entry of entries) {
    if (entry.name === ".git" || entry.name === "node_modules") {
      continue;
    }
    const absolute = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      walk(absolute, bucket);
    } else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
      bucket.push(absolute);
    }
  }
}

function extractKeywords(frontmatter, frontmatterRaw, fieldToCanonical) {
  const keywords = [];
  const seen = new Set();

  function addKeyword(taxonomy, term) {
    if (!taxonomy || typeof term !== "string") return;
    const normalized = term.trim();
    if (!normalized) return;
    // Dedupe per taxonomy, case-insensitively.
    const key = `${taxonomy}::${normalized.toLowerCase()}`;
    if (seen.has(key)) return;
    const slug = slugify(normalized);
    if (!slug) return;
    seen.add(key);
    keywords.push({
      taxonomy,
      term: normalized,
      // Hugo's default taxonomy term URL: /<taxonomy>/<term-slug>/.
      url: `/${taxonomy}/${slug}/`,
    });
  }

  if (typeof frontmatter === "object" && frontmatter !== null) {
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = fieldToCanonical.get(field);
      if (!canonical) continue;
      const terms = normalizeTerms(value);
      for (const term of terms) {
        addKeyword(canonical, term);
      }
    }
  }

  // Also honour terms that are still commented out in the raw YAML.
  for (const entry of extractCommentedTerms(frontmatterRaw, fieldToCanonical)) {
    addKeyword(entry.taxonomy, entry.term);
  }

  return keywords;
}

function normalizeTerms(value) {
  if (Array.isArray(value)) {
    return value.map((item) => normalizeTerm(item)).filter(Boolean);
  }
  const single = normalizeTerm(value);
  return single ? [single] : [];
}

function normalizeTerm(value) {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}

function extractCommentedTerms(frontmatterRaw, fieldToCanonical) {
  if (typeof frontmatterRaw !== "string" || frontmatterRaw.length === 0) {
    return [];
  }

  const results = [];
  const lines = frontmatterRaw.split(/\r?\n/);
  let currentCanonical = null;
  let currentIndent = 0;

  for (const line of lines) {
    const indent = getIndentation(line);
    // A taxonomy field opens a block: a bare "field:" line with no value.
    const fieldMatch = line.match(/^\s*([A-Za-z0-9_]+):\s*(?:#.*)?$/);
    if (fieldMatch) {
      const fieldName = fieldMatch[1];
      currentCanonical = fieldToCanonical.get(fieldName) || null;
      currentIndent = indent;
      continue;
    }

    if (!currentCanonical) continue;
    // Commented-out list items ("# - term") indented deeper than the
    // field line still count as terms to link.
    const commentMatch = line.match(/^\s*#\s*-\s+(.*)$/);
    if (!commentMatch) continue;
    if (indent <= currentIndent) continue;
    const term = commentMatch[1].trim();
    if (!term) continue;
    results.push({ taxonomy: currentCanonical, term });
  }

  return results;
}

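// Illustrative frontmatter that this parser targets (example values,
// not taken from the repository's articles):
//
//   tags:
//     - hugo
//     # - javascript   <- still commented out, yet extracted here
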
function linkKeywordsInBody(body, keywords) {
  if (typeof body !== "string" || body.length === 0 || keywords.length === 0) {
    return { body, changed: false };
  }

  let updated = body;
  let changed = false;
  let linkRanges = computeLinkRanges(updated);

  for (const keyword of keywords) {
    const occurrence = findKeywordOccurrence(updated, keyword.term, linkRanges);
    if (!occurrence) continue;
    // Pull surrounding emphasis markers into the link label so that
    // "**term**" becomes "[**term**](url)".
    const expanded = includeFormattingCharacters(updated, occurrence.start, occurrence.end);
    const before = updated.slice(0, expanded.start);
    const label = updated.slice(expanded.start, expanded.end);
    const after = updated.slice(expanded.end);
    updated = `${before}[${label}](${keyword.url})${after}`;
    changed = true;
    // Offsets shifted after the insertion, so recompute the link ranges.
    linkRanges = computeLinkRanges(updated);
  }

  return { body: updated, changed };
}

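// Worked example (illustrative): with keyword { term: "Hugo", url: "/tags/hugo/" },
// the body "Je déploie Hugo ici." becomes
// "Je déploie [Hugo](/tags/hugo/) ici.", while an occurrence that is
// already inside "[Hugo](/tags/hugo/)" is skipped.
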
function findKeywordOccurrence(text, keyword, linkRanges) {
  if (!keyword) return null;
  const escaped = escapeRegExp(keyword);
  if (!escaped) return null;
  // Case-insensitive search; the matched casing from the body is kept.
  const regex = new RegExp(escaped, "giu");
  let match;

  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (isInsideExistingLink(start, end, linkRanges)) {
      continue;
    }
    if (!hasWordBoundaries(text, start, end)) {
      continue;
    }
    return { start, end, text: match[0] };
  }
  return null;
}

function computeLinkRanges(text) {
  const ranges = [];
  if (typeof text !== "string" || text.length === 0) {
    return ranges;
  }

  for (let i = 0; i < text.length; i++) {
    // "![" opens an image; treat its label and destination like a link's.
    let isImage = false;
    if (text[i] === "!" && text[i + 1] === "[") {
      isImage = true;
      i += 1;
    }
    if (text[i] !== "[") continue;

    const openBracket = i;
    const closeBracket = findMatchingPair(text, openBracket, "[", "]");
    if (closeBracket === -1) continue;

    let pointer = closeBracket + 1;
    while (pointer < text.length && /\s/.test(text[pointer])) pointer++;
    if (pointer >= text.length || text[pointer] !== "(") {
      i = closeBracket;
      continue;
    }

    const openParen = pointer;
    const closeParen = findMatchingPair(text, openParen, "(", ")");
    if (closeParen === -1) break;

    ranges.push({
      textStart: openBracket + 1,
      textEnd: closeBracket,
      destStart: openParen + 1,
      destEnd: closeParen,
      isImage,
    });
    i = closeParen;
  }

  return ranges;
}

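// For "voir [guide](/docs/) !", computeLinkRanges returns one range whose
// textStart/textEnd bracket "guide" and whose destStart/destEnd bracket
// "/docs/" (illustrative example).
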
function findMatchingPair(text, startIndex, openChar, closeChar) {
  let depth = 0;
  for (let i = startIndex; i < text.length; i++) {
    const ch = text[i];
    // Skip over backslash-escaped characters such as "\[".
    if (ch === "\\") {
      i++;
      continue;
    }
    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}

function isInsideExistingLink(start, end, ranges) {
  return ranges.some((range) => {
    const overlapsText = start < range.textEnd && end > range.textStart;
    const overlapsDest =
      typeof range.destStart === "number" &&
      typeof range.destEnd === "number" &&
      start < range.destEnd &&
      end > range.destStart;
    return overlapsText || overlapsDest;
  });
}

function hasWordBoundaries(text, start, end) {
  const before = start > 0 ? text[start - 1] : "";
  const after = end < text.length ? text[end] : "";
  const startChar = text[start];
  const endChar = text[end - 1];

  if (isWordChar(startChar) && isWordChar(before)) {
    return false;
  }
  if (isWordChar(endChar) && isWordChar(after)) {
    return false;
  }
  return true;
}

function isWordChar(ch) {
  return Boolean(ch && WORD_CHAR.test(ch));
}

function slugify(value) {
  return value
    .normalize("NFD") // split letters from their combining accents
    .replace(/\p{Diacritic}/gu, "") // then drop the accents
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .replace(/-{2,}/g, "-");
}

function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

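// e.g. slugify("Écriture & Style") === "ecriture-style" (illustrative).
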
function includeFormattingCharacters(text, start, end) {
  let newStart = start;
  let newEnd = end;

  for (const marker of INLINE_FORMATTING_CHARS) {
    let prefixCount = 0;
    while (newStart - prefixCount - 1 >= 0 && text[newStart - prefixCount - 1] === marker) {
      prefixCount++;
    }

    let suffixCount = 0;
    while (newEnd + suffixCount < text.length && text[newEnd + suffixCount] === marker) {
      suffixCount++;
    }

    // Only absorb markers that are balanced on both sides of the match.
    const count = Math.min(prefixCount, suffixCount);
    if (count > 0) {
      newStart -= count;
      newEnd += count;
    }
  }

  return { start: newStart, end: newEnd };
}

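// e.g. for "voir **terme** ici" with the match on "terme", the range
// widens by two "*" on each side, so the link label keeps the bold
// markers: "voir [**terme**](...) ici" (illustrative).
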
function getIndentation(line) {
  if (typeof line !== "string" || line.length === 0) return 0;
  const match = line.match(/^\s*/);
  return match ? match[0].length : 0;
}