Preliminary automation of keyword assignment and linking
tools/link_frontmatter_keywords.js (new file, 425 lines)
@@ -0,0 +1,425 @@
#!/usr/bin/env node

/**
 * Walks every Markdown article in the content/ folder and automatically
 * creates a link to the keyword's page for the first occurrence of each term
 * defined in the frontmatter taxonomies. Occurrences that are already linked
 * are ignored.
 *
 * Exits with a non-zero code when at least one file has been modified.
 */

const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");

const PROJECT_ROOT = path.resolve(__dirname, "..");
const CONTENT_ROOT = path.join(PROJECT_ROOT, "content");
const TAXONOMIES_FILE = path.join(PROJECT_ROOT, "config", "_default", "taxonomies.yaml");
const FRONTMATTER_PATTERN = /^---\n([\s\S]+?)\n---\n?([\s\S]*)$/;
const WORD_CHAR = /[\p{L}\p{N}]/u;
const INLINE_FORMATTING_CHARS = ["*", "_"];

main();

function main() {
  const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_FILE);
  const files = collectMarkdownFiles(CONTENT_ROOT);

  if (files.length === 0) {
    console.log("No Markdown articles found under content/.");
    return;
  }

  const changed = [];
  for (const filePath of files) {
    if (processFile(filePath, taxonomyMapping)) {
      changed.push(filePath);
    }
  }

  if (changed.length > 0) {
    for (const filePath of changed) {
      const rel = path.relative(PROJECT_ROOT, filePath);
      console.log(`✏️ ${rel}`);
    }
    console.log("Changes were made. Please review them.");
    process.exit(2);
  } else {
    console.log("All articles are already correctly linked.");
  }
}

function processFile(filePath, taxonomyMapping) {
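  // Reads one article, extracts its taxonomy keywords, links their first
  // occurrence in the body and rewrites the file. Returns true when the file
  // was modified.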
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Unable to read ${filePath}: ${error.message}`);
    return false;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    return false;
  }

  let frontmatter;
  try {
    frontmatter = yaml.load(match[1]) || {};
  } catch (error) {
    console.warn(`⚠️ Invalid frontmatter in ${filePath}: ${error.message}`);
    return false;
  }

  const keywords = extractKeywords(frontmatter, match[1], taxonomyMapping.fieldToCanonical);
  if (keywords.length === 0) {
    return false;
  }

  const { body, changed } = linkKeywordsInBody(match[2], keywords);
  if (!changed) {
    return false;
  }

  const prefixLength = raw.length - match[2].length;
  const updated = raw.slice(0, prefixLength) + body;
  fs.writeFileSync(filePath, updated, "utf8");
  return true;
}

function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    console.error(`Unable to read ${configPath}: ${error.message}`);
    process.exit(1);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    console.error(`Invalid YAML in ${configPath}: ${error.message}`);
    process.exit(1);
  }

  if (typeof data !== "object" || data === null) {
    console.error(`Unexpected format in ${configPath}`);
    process.exit(1);
  }

  const fieldToCanonical = new Map();
  for (const [singular, plural] of Object.entries(data)) {
    const canonicalName =
      typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular.trim();
    if (!canonicalName) continue;
    const candidates = new Set([singular, canonicalName].filter(Boolean));
    for (const name of candidates) {
      fieldToCanonical.set(name, canonicalName);
    }
  }

  if (fieldToCanonical.size === 0) {
    console.error("No taxonomies are defined in the configuration.");
    process.exit(1);
  }

  return { fieldToCanonical };
}

function collectMarkdownFiles(root) {
  const files = [];
  walk(root, files);
  return files.sort((a, b) => a.localeCompare(b));
}

function walk(dir, bucket) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Unable to traverse ${dir}: ${error.message}`);
    return;
  }

  for (const entry of entries) {
    if (entry.name === ".git" || entry.name === "node_modules") {
      continue;
    }
    const absolute = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      walk(absolute, bucket);
    } else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
      bucket.push(absolute);
    }
  }
}

function extractKeywords(frontmatter, frontmatterRaw, fieldToCanonical) {
  const keywords = [];
  const seen = new Set();

  function addKeyword(taxonomy, term) {
    if (!taxonomy || typeof term !== "string") return;
    const normalized = term.trim();
    if (!normalized) return;
    const key = `${taxonomy}::${normalized.toLowerCase()}`;
    if (seen.has(key)) return;
    const slug = slugify(normalized);
    if (!slug) return;
    seen.add(key);
    keywords.push({
      taxonomy,
      term: normalized,
      url: `/${taxonomy}/${slug}/`,
    });
  }

  if (typeof frontmatter === "object" && frontmatter !== null) {
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = fieldToCanonical.get(field);
      if (!canonical) continue;
      const terms = normalizeTerms(value);
      for (const term of terms) {
        addKeyword(canonical, term);
      }
    }
  }

  for (const entry of extractCommentedTerms(frontmatterRaw, fieldToCanonical)) {
    addKeyword(entry.taxonomy, entry.term);
  }

  return keywords;
}

function normalizeTerms(value) {
  if (Array.isArray(value)) {
    return value.map((item) => normalizeTerm(item)).filter(Boolean);
  }
  const single = normalizeTerm(value);
  return single ? [single] : [];
}

function normalizeTerm(value) {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}

function extractCommentedTerms(frontmatterRaw, fieldToCanonical) {
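  // Terms commented out as "# - term" under a taxonomy field are still picked
  // up here, so they get linked in the body even though they are not part of
  // the parsed frontmatter data.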
  if (typeof frontmatterRaw !== "string" || frontmatterRaw.length === 0) {
    return [];
  }

  const results = [];
  const lines = frontmatterRaw.split(/\r?\n/);
  let currentCanonical = null;
  let currentIndent = 0;

  for (const line of lines) {
    const indent = getIndentation(line);
    const fieldMatch = line.match(/^\s*([A-Za-z0-9_]+):\s*(?:#.*)?$/);
    if (fieldMatch) {
      const fieldName = fieldMatch[1];
      currentCanonical = fieldToCanonical.get(fieldName) || null;
      currentIndent = indent;
      continue;
    }

    if (!currentCanonical) continue;
    const commentMatch = line.match(/^\s*#\s*-\s+(.*)$/);
    if (!commentMatch) continue;
    if (indent <= currentIndent) continue;
    const term = commentMatch[1].trim();
    if (!term) continue;
    results.push({ taxonomy: currentCanonical, term });
  }

  return results;
}

function linkKeywordsInBody(body, keywords) {
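  // Wraps the first "free" occurrence of each keyword in a Markdown link.
  // Link ranges are recomputed after every insertion so that later keywords
  // cannot land inside a link that was just added.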
  if (typeof body !== "string" || body.length === 0 || keywords.length === 0) {
    return { body, changed: false };
  }

  let updated = body;
  let changed = false;
  let linkRanges = computeLinkRanges(updated);

  for (const keyword of keywords) {
    const occurrence = findKeywordOccurrence(updated, keyword.term, linkRanges);
    if (!occurrence) continue;
    const expanded = includeFormattingCharacters(updated, occurrence.start, occurrence.end);
    const before = updated.slice(0, expanded.start);
    const label = updated.slice(expanded.start, expanded.end);
    const after = updated.slice(expanded.end);
    updated = `${before}[${label}](${keyword.url})${after}`;
    changed = true;
    linkRanges = computeLinkRanges(updated);
  }

  return { body: updated, changed };
}

function findKeywordOccurrence(text, keyword, linkRanges) {
  if (!keyword) return null;
  const escaped = escapeRegExp(keyword);
  if (!escaped) return null;
  const regex = new RegExp(escaped, "giu");
  let match;

  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (isInsideExistingLink(start, end, linkRanges)) {
      continue;
    }
    if (!hasWordBoundaries(text, start, end)) {
      continue;
    }
    return { start, end, text: match[0] };
  }
  return null;
}

function computeLinkRanges(text) {
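  // Scans the Markdown for [label](destination) spans, including image links,
  // and records the label and destination ranges so that keyword matches
  // inside them can be skipped.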
  const ranges = [];
  if (typeof text !== "string" || text.length === 0) {
    return ranges;
  }

  for (let i = 0; i < text.length; i++) {
    let isImage = false;
    if (text[i] === "!" && text[i + 1] === "[") {
      isImage = true;
      i += 1;
    }
    if (text[i] !== "[") continue;

    const openBracket = i;
    const closeBracket = findMatchingPair(text, openBracket, "[", "]");
    if (closeBracket === -1) continue;

    let pointer = closeBracket + 1;
    while (pointer < text.length && /\s/.test(text[pointer])) pointer++;
    if (pointer >= text.length || text[pointer] !== "(") {
      i = closeBracket;
      continue;
    }

    const openParen = pointer;
    const closeParen = findMatchingPair(text, openParen, "(", ")");
    if (closeParen === -1) break;

    ranges.push({
      textStart: openBracket + 1,
      textEnd: closeBracket,
      destStart: openParen + 1,
      destEnd: closeParen,
      isImage,
    });
    i = closeParen;
  }

  return ranges;
}

function findMatchingPair(text, startIndex, openChar, closeChar) {
  let depth = 0;
  for (let i = startIndex; i < text.length; i++) {
    const ch = text[i];
    if (ch === "\\") {
      i++;
      continue;
    }
    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}

function isInsideExistingLink(start, end, ranges) {
  return ranges.some((range) => {
    const overlapsText = start < range.textEnd && end > range.textStart;
    const overlapsDest =
      typeof range.destStart === "number" &&
      typeof range.destEnd === "number" &&
      start < range.destEnd &&
      end > range.destStart;
    return overlapsText || overlapsDest;
  });
}

function hasWordBoundaries(text, start, end) {
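  // Unicode-aware boundary check: \b in JavaScript regexes does not handle
  // accented letters, so the characters just before and after the match are
  // inspected directly.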
  const before = start > 0 ? text[start - 1] : "";
  const after = end < text.length ? text[end] : "";
  const startChar = text[start];
  const endChar = text[end - 1];

  if (isWordChar(startChar) && isWordChar(before)) {
    return false;
  }
  if (isWordChar(endChar) && isWordChar(after)) {
    return false;
  }
  return true;
}

function isWordChar(ch) {
  return Boolean(ch && WORD_CHAR.test(ch));
}

function slugify(value) {
  return value
    .normalize("NFD")
    .replace(/\p{Diacritic}/gu, "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .replace(/-{2,}/g, "-");
}

function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

function includeFormattingCharacters(text, start, end) {
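  // Expands the matched range over surrounding emphasis markers (* or _) when
  // they appear in equal numbers on both sides, so the link wraps the whole
  // emphasized span instead of splitting it.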
  let newStart = start;
  let newEnd = end;

  for (const marker of INLINE_FORMATTING_CHARS) {
    let prefixCount = 0;
    while (newStart - prefixCount - 1 >= 0 && text[newStart - prefixCount - 1] === marker) {
      prefixCount++;
    }

    let suffixCount = 0;
    while (newEnd + suffixCount < text.length && text[newEnd + suffixCount] === marker) {
      suffixCount++;
    }

    const count = Math.min(prefixCount, suffixCount);
    if (count > 0) {
      newStart -= count;
      newEnd += count;
    }
  }

  return { start: newStart, end: newEnd };
}

function getIndentation(line) {
  if (typeof line !== "string" || line.length === 0) return 0;
  const match = line.match(/^\s*/);
  return match ? match[0].length : 0;
}
tools/link_taxonomy_terms.js (new file, 646 lines)
@@ -0,0 +1,646 @@
#!/usr/bin/env node

/**
 * Automatically attaches taxonomy terms to Hugo articles by scanning the body
 * of each Markdown file for known keywords that already exist in frontmatter.
 *
 * Usage:
 *   node tools/link_taxonomy_terms.js [--dry-run] [paths...]
 *
 * Without arguments every Markdown file under content/ is processed.
 */

const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");

const PROJECT_ROOT = path.resolve(__dirname, "..");
const CONTENT_ROOT = path.join(PROJECT_ROOT, "content");
const TAXONOMIES_PATH = path.join(PROJECT_ROOT, "config", "_default", "taxonomies.yaml");
const FRONTMATTER_PATTERN = /^---\n([\s\S]+?)\n---\n?([\s\S]*)$/;
const collator = new Intl.Collator("fr", { sensitivity: "base", usage: "sort" });

function main() {
  const { options, targets } = parseArgs(process.argv.slice(2));

  const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_PATH);
  if (taxonomyMapping.canonicalNames.length === 0) {
    console.error("❌ No taxonomies found in config/_default/taxonomies.yaml");
    process.exit(1);
  }

  const files = collectMarkdownFiles(targets);
  if (files.length === 0) {
    console.log("No Markdown content found to analyse.");
    return;
  }

  const articles = files
    .map((filePath) => parseArticle(filePath))
    .filter((article) => article !== null);

  if (articles.length === 0) {
    console.log("No articles with valid YAML frontmatter were found.");
    return;
  }

  const { catalog, totalKeywords } = buildKeywordCatalog(articles, taxonomyMapping);
  if (totalKeywords === 0) {
    console.log("No taxonomy keywords available to propagate.");
    return;
  }

  console.log(
    `Catalogued ${totalKeywords} keyword${totalKeywords > 1 ? "s" : ""} across ${
      catalog.size
    } taxonom${catalog.size > 1 ? "ies" : "y"}.`
  );

  const modifications = applyTaxonomies(articles, catalog, taxonomyMapping, options);
  if (modifications.length === 0) {
    console.log("No taxonomy updates required.");
    if (options.dryRun) {
      console.log("Dry-run only: no files would be modified.");
    }
    return;
  }

  for (const change of modifications) {
    const relPath = path.relative(PROJECT_ROOT, change.path);
    console.log(`✏️ ${relPath}`);
    for (const [taxonomy, values] of change.additions.entries()) {
      console.log(`  ${taxonomy}: ${values.join(", ")}`);
    }
  }

  if (options.dryRun) {
    console.log(`Dry-run complete. ${modifications.length} article(s) would be updated.`);
  } else {
    console.log(`Updated ${modifications.length} article(s).`);
    console.log("Please review the changes.");
    process.exit(2);
  }
}

function parseArgs(argv) {
  const options = { dryRun: false };
  const targets = [];

  for (const arg of argv) {
    if (arg === "--dry-run" || arg === "--check") {
      options.dryRun = true;
    } else if (arg === "--help" || arg === "-h") {
      showUsage();
      process.exit(0);
    } else if (arg.startsWith("-")) {
      console.error(`Unknown option: ${arg}`);
      showUsage();
      process.exit(1);
    } else {
      targets.push(arg);
    }
  }

  return { options, targets };
}

function showUsage() {
  console.log(`Usage: node tools/link_taxonomy_terms.js [--dry-run] [path...]

Options
  --dry-run   Analyse files but do not rewrite anything
  --help      Show this message

Examples
  node tools/link_taxonomy_terms.js --dry-run
  node tools/link_taxonomy_terms.js content/interets/paleontologie`);
}

function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    throw new Error(`Unable to read ${configPath}: ${error.message}`);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    throw new Error(`Invalid YAML in ${configPath}: ${error.message}`);
  }

  if (typeof data !== "object" || Array.isArray(data)) {
    throw new Error(`Unexpected taxonomies format in ${configPath}`);
  }

  const fieldToCanonical = new Map();
  const canonicalToFields = new Map();

  for (const [singular, plural] of Object.entries(data)) {
    const canonicalName = typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular;
    if (!canonicalName) continue;
    const candidateNames = new Set([singular, canonicalName].filter(Boolean));
    for (const name of candidateNames) {
      fieldToCanonical.set(name, canonicalName);
      if (!canonicalToFields.has(canonicalName)) {
        canonicalToFields.set(canonicalName, new Set());
      }
      canonicalToFields.get(canonicalName).add(name);
    }
  }

  return {
    fieldToCanonical,
    canonicalToFields,
    canonicalNames: Array.from(canonicalToFields.keys()),
  };
}

function collectMarkdownFiles(targets) {
  const files = new Set();

  if (targets.length === 0) {
    walkContentTree(CONTENT_ROOT, files);
    return Array.from(files).sort();
  }

  for (const target of targets) {
    const absolute = path.resolve(PROJECT_ROOT, target);
    if (!fs.existsSync(absolute)) {
      console.warn(`⚠️ Skipping missing path: ${target}`);
      continue;
    }
    const stats = fs.statSync(absolute);
    if (stats.isDirectory()) {
      walkContentTree(absolute, files);
    } else if (stats.isFile() && absolute.toLowerCase().endsWith(".md")) {
      files.add(absolute);
    }
  }

  return Array.from(files).sort();
}

function walkContentTree(dir, fileSet) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Cannot read ${dir}: ${error.message}`);
    return;
  }

  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (entry.name === ".git" || entry.name === "node_modules") continue;
      walkContentTree(fullPath, fileSet);
    } else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
      fileSet.add(fullPath);
    }
  }
}

function parseArticle(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Unable to read ${filePath}: ${error.message}`);
    return null;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    console.warn(`⚠️ ${path.relative(PROJECT_ROOT, filePath)} is missing YAML frontmatter. Skipping.`);
    return null;
  }

  let data = {};
  try {
    data = yaml.load(match[1]) || {};
  } catch (error) {
    console.warn(`⚠️ Failed to parse frontmatter in ${filePath}: ${error.message}`);
    return null;
  }

  if (typeof data !== "object" || Array.isArray(data)) {
    console.warn(`⚠️ Unexpected frontmatter structure in ${filePath}. Skipping.`);
    return null;
  }

  return {
    path: filePath,
    frontmatter: data,
    frontmatterRaw: match[1],
    body: match[2] || "",
  };
}

function buildKeywordCatalog(articles, taxonomyMapping) {
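  // Builds, per taxonomy, the list of unique keywords seen anywhere in the
  // site's frontmatter, each paired with a compiled search pattern.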
  const keywordMaps = new Map();

  for (const canonical of taxonomyMapping.canonicalNames) {
    keywordMaps.set(canonical, new Map());
  }

  for (const article of articles) {
    const frontmatter = article.frontmatter;
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = taxonomyMapping.fieldToCanonical.get(field);
      if (!canonical) continue;

      const strings = toStringArray(value);
      if (strings.length === 0) continue;

      const lookup = keywordMaps.get(canonical);
      for (const entry of strings) {
        const normalized = normalizeTerm(entry);
        if (!normalized || lookup.has(normalized)) continue;
        lookup.set(normalized, entry);
      }
    }
  }

  const catalog = new Map();
  let totalKeywords = 0;

  for (const [canonical, map] of keywordMaps.entries()) {
    if (map.size === 0) continue;
    const sortedValues = Array.from(map.values()).sort(compareKeywords);
    const entries = [];
    for (const value of sortedValues) {
      const pattern = buildKeywordPattern(value);
      if (!pattern) continue;
      entries.push({ value, pattern });
    }
    if (entries.length === 0) continue;
    totalKeywords += entries.length;
    catalog.set(canonical, entries);
  }

  return { catalog, totalKeywords };
}

function applyTaxonomies(articles, catalog, taxonomyMapping, options) {
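  // For each article, finds the first unclaimed occurrence of every catalogued
  // keyword in the body and, unless the term is already present or explicitly
  // ignored in the frontmatter, appends it to the matching taxonomy field.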
  const changes = [];

  for (const article of articles) {
    const additions = new Map();
    let mutated = false;
    const occupiedRanges = [];
    const taxonomyStates = new Map();
    const keywordTasks = [];
    const ignoredKeywords = extractIgnoredKeywords(article.frontmatterRaw, taxonomyMapping);

    for (const [canonical, keywordEntries] of catalog.entries()) {
      if (keywordEntries.length === 0) continue;
      const fieldName = resolveFieldName(article.frontmatter, canonical, taxonomyMapping);
      const currentValues = toStringArray(article.frontmatter[fieldName]);
      const normalizedExisting = new Set(currentValues.map((value) => normalizeTerm(value)));
      const state = {
        canonical,
        fieldName,
        currentValues,
        normalizedExisting,
      };
      taxonomyStates.set(canonical, state);

      for (const entry of keywordEntries) {
        keywordTasks.push({
          canonical,
          value: entry.value,
          pattern: entry.pattern,
          state,
        });
      }
    }

    keywordTasks.sort((a, b) => compareKeywords(a.value, b.value));

    const urlRanges = collectMarkdownUrlRanges(article.body);
    const searchableBody = normalizeTypographyForSearch(article.body);

    for (const task of keywordTasks) {
      const { state, canonical, value, pattern } = task;
      const regex = new RegExp(pattern, "gu");
      const matchRange = findAvailableMatchRange(regex, searchableBody, occupiedRanges, urlRanges);
      if (!matchRange) {
        continue;
      }

      if (shouldSkipSingleWordMatch(value, article.body, matchRange)) {
        occupiedRanges.push(matchRange);
        continue;
      }

      occupiedRanges.push(matchRange);

      const normalized = normalizeTerm(value);
      if (state.normalizedExisting.has(normalized)) {
        continue;
      }

      if (isIgnoredKeyword(canonical, normalized, ignoredKeywords)) {
        continue;
      }

      state.currentValues.push(value);
      state.normalizedExisting.add(normalized);
      mutated = true;
      article.frontmatter[state.fieldName] = state.currentValues;

      if (!additions.has(canonical)) {
        additions.set(canonical, []);
      }
      additions.get(canonical).push(value);
    }

    if (mutated) {
      if (!options.dryRun) {
        writeArticle(article);
      }
      changes.push({ path: article.path, additions });
    }
  }

  return changes;
}

function resolveFieldName(frontmatter, canonicalName, taxonomyMapping) {
  const candidateSet = taxonomyMapping.canonicalToFields.get(canonicalName);
  if (candidateSet) {
    for (const key of Object.keys(frontmatter)) {
      if (candidateSet.has(key)) {
        return key;
      }
    }
  }
  return canonicalName;
}

function writeArticle(article) {
  const yamlContent = yaml.dump(article.frontmatter, { lineWidth: 120, sortKeys: false });
  const finalBody = article.body || "";
  const next = `---\n${yamlContent}---\n${finalBody}`;
  fs.writeFileSync(article.path, next, "utf8");
}

function toStringArray(value) {
  if (Array.isArray(value)) {
    return value
      .map((entry) => transformToString(entry))
      .filter((entry) => entry.length > 0);
  }
  const single = transformToString(value);
  return single.length > 0 ? [single] : [];
}

function transformToString(value) {
  if (value === null || value === undefined) {
    return "";
  }
  if (typeof value === "string") {
    return value.trim();
  }
  if (typeof value === "number") {
    return String(value);
  }
  return "";
}

function normalizeTerm(value) {
  return transformToString(value).normalize("NFKC").toLocaleLowerCase("fr");
}

function compareKeywords(a, b) {
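  // Longer keywords sort first so multi-word terms are matched before their
  // substrings; ties are broken with a French-aware collation.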
  const diff = b.length - a.length;
  if (diff !== 0) {
    return diff;
  }
  return collator.compare(a, b);
}

function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

function buildKeywordPattern(value) {
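  // Compiles a keyword into a regex source string: Unicode lookarounds replace
  // \b (which ignores accented letters), the first letter matches either case,
  // and runs of whitespace inside the keyword match any whitespace.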
  const keyword = transformToString(value);
  if (!keyword) {
    return null;
  }

  const characters = Array.from(keyword);
  if (characters.length === 0) {
    return null;
  }

  const firstChar = characters[0];
  const restChars = characters.slice(1);

  const firstPattern = buildFirstCharacterPattern(firstChar);
  const restPattern = buildRemainingPattern(restChars);
  return `(?<![\\p{L}\\p{N}_])${firstPattern}${restPattern}(?![\\p{L}\\p{N}_])`;
}

function buildFirstCharacterPattern(char) {
  if (!/\p{L}/u.test(char)) {
    return escapeRegExp(char);
  }

  const variants = new Set([char, char.toLocaleLowerCase("fr"), char.toLocaleUpperCase("fr")]);
  const entries = Array.from(variants)
    .filter((variant) => variant.length > 0)
    .map((variant) => ({
      raw: variant,
      escaped: escapeRegExp(variant),
      runeLength: Array.from(variant).length,
    }));

  if (entries.length === 1) {
    return entries[0].escaped;
  }

  if (entries.every((entry) => entry.runeLength === 1)) {
    return `[${entries.map((entry) => entry.escaped).join("")}]`;
  }

  return `(?:${entries.map((entry) => entry.escaped).join("|")})`;
}

function buildRemainingPattern(characters) {
  if (characters.length === 0) {
    return "";
  }

  let pattern = "";
  let previousWasWhitespace = false;

  for (const char of characters) {
    if (/\s/u.test(char)) {
      if (!previousWasWhitespace) {
        pattern += "\\s+";
        previousWasWhitespace = true;
      }
      continue;
    }
    pattern += escapeRegExp(char);
    previousWasWhitespace = false;
  }

  return pattern;
}

function findAvailableMatchRange(regex, text, occupiedRanges, urlRanges) {
  regex.lastIndex = 0;
  let match;
  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (rangeOverlaps(urlRanges, start, end)) {
      continue;
    }
    if (!overlapsExistingRange(occupiedRanges, start, end)) {
      return [start, end];
    }
  }
  return null;
}

function overlapsExistingRange(ranges, start, end) {
  for (const [existingStart, existingEnd] of ranges) {
    if (start === existingStart && end === existingEnd) {
      continue;
    }
    if (start < existingEnd && end > existingStart) {
      return true;
    }
  }
  return false;
}

function collectMarkdownUrlRanges(markdown) {
  const ranges = [];
  if (!markdown) {
    return ranges;
  }

  const linkPattern = /\[[^\]]*\]\(([^)]+)\)/g;
  let match;
  while ((match = linkPattern.exec(markdown)) !== null) {
    const relativeParen = match[0].indexOf("(");
    if (relativeParen === -1) {
      continue;
    }
    const urlStart = match.index + relativeParen + 1;
    const urlEnd = urlStart + (match[1] ? match[1].length : 0);
    ranges.push([urlStart, urlEnd]);
  }
  return ranges;
}

function rangeOverlaps(ranges, start, end) {
  for (const [rangeStart, rangeEnd] of ranges) {
    if (start < rangeEnd && end > rangeStart) {
      return true;
    }
  }
  return false;
}

function normalizeTypographyForSearch(text) {
  if (!text) {
    return "";
  }
  return text.replace(/[*_]/g, " ");
}

function shouldSkipSingleWordMatch(keyword, body, range) {
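  // Skips a single-word match that is directly followed by a capitalized
  // initial and a period, which suggests the match is part of a longer proper
  // name or abbreviation rather than a standalone mention of the keyword.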
  if (!keyword || /\s/.test(keyword)) {
    return false;
  }
  const [, end] = range;
  const lookahead = body.slice(end);
  return /^\s+[A-Z\u00C0-\u017F]\./u.test(lookahead);
}

function extractIgnoredKeywords(rawFrontmatter, taxonomyMapping) {
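  // Commented-out entries of the form "# - term" under a taxonomy field act as
  // an explicit ignore list: those terms are never re-added to the article.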
  const ignoreMap = new Map();
  if (!rawFrontmatter) {
    return ignoreMap;
  }

  const lines = rawFrontmatter.split(/\r?\n/);
  let currentField = null;

  for (const line of lines) {
    const trimmed = line.trim();
    if (trimmed.length === 0) {
      continue;
    }

    const fieldMatch = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    if (fieldMatch && !line.trimStart().startsWith("#")) {
      const fieldName = fieldMatch[1];
      const remainder = fieldMatch[2];
      if (remainder.trim().length === 0) {
        currentField = fieldName;
      } else {
        currentField = null;
      }
      continue;
    }

    if (!currentField) {
      continue;
    }

    const commentMatch = line.match(/^\s*#\s*-\s*(.+?)\s*$/);
    if (!commentMatch) {
      continue;
    }

    let value = commentMatch[1].trim();
    if (!value) {
      continue;
    }
    if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
      value = value.slice(1, -1).trim();
    }
    if (!value) {
      continue;
    }

    const canonical = taxonomyMapping.fieldToCanonical.get(currentField);
    if (!canonical) {
      continue;
    }

    const normalized = normalizeTerm(value);
    if (!normalized) {
      continue;
    }
    if (!ignoreMap.has(canonical)) {
      ignoreMap.set(canonical, new Set());
    }
    ignoreMap.get(canonical).add(normalized);
  }

  return ignoreMap;
}

function isIgnoredKeyword(canonical, normalizedValue, ignoreMap) {
  if (!canonical || !normalizedValue) {
    return false;
  }
  const values = ignoreMap.get(canonical);
  if (!values) {
    return false;
  }
  return values.has(normalizedValue);
}

main();