From afc4f45ad73134955011c2d8b6e36f3e87ba1f06 Mon Sep 17 00:00:00 2001 From: Richard Dern Date: Sun, 16 Nov 2025 15:48:42 +0100 Subject: [PATCH] =?UTF-8?q?Automatisation=20pr=C3=A9liminaire=20de=20l'att?= =?UTF-8?q?ribution=20et=20du=20linking=20des=20mots-cl=C3=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deploy.sh | 6 + tools/link_frontmatter_keywords.js | 425 +++++++++++++++++++ tools/link_taxonomy_terms.js | 646 +++++++++++++++++++++++++++++ 3 files changed, 1077 insertions(+) create mode 100644 tools/link_frontmatter_keywords.js create mode 100644 tools/link_taxonomy_terms.js diff --git a/deploy.sh b/deploy.sh index 99d71da6..ec8cc81a 100755 --- a/deploy.sh +++ b/deploy.sh @@ -16,6 +16,12 @@ node "$SCRIPT_DIR/tools/check_external_links.js" echo "==> Vérification des liens internes" node "$SCRIPT_DIR/tools/check_internal_links.js" +echo "==> Application des taxonomies et mots-clés" +node "$SCRIPT_DIR/tools/link_taxonomy_terms.js" + +echo "==> Ajout des liens vers les mots-clés du frontmatter" +node "$SCRIPT_DIR/tools/link_frontmatter_keywords.js" + echo "==> Génération du site Hugo pour l'environnement $HUGO_ENV (avec nettoyage de destination)" hugo --environment "$HUGO_ENV" --cleanDestinationDir diff --git a/tools/link_frontmatter_keywords.js b/tools/link_frontmatter_keywords.js new file mode 100644 index 00000000..a0e7d84e --- /dev/null +++ b/tools/link_frontmatter_keywords.js @@ -0,0 +1,425 @@ +#!/usr/bin/env node + +/** + * Parcourt tous les articles Markdown du dossier content/ et + * crée automatiquement un lien vers la page du mot-clé pour la + * première occurrence de chaque terme défini dans les taxonomies + * du frontmatter. Les occurrences déjà liées sont ignorées. + * + * Sort avec un code différent de 0 lorsqu'au moins un fichier est modifié. 
/**
 * Entry point of the frontmatter keyword linker.
 *
 * Loads the taxonomy configuration, processes every Markdown file found under
 * the content root and prints the files that were rewritten.
 * Terminates the process with exit status 2 when at least one file changed.
 */
function main() {
  const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_FILE);
  const markdownFiles = collectMarkdownFiles(CONTENT_ROOT);

  if (markdownFiles.length === 0) {
    console.log("Aucun article Markdown trouvé sous content/.");
    return;
  }

  // processFile() rewrites the file as a side effect and returns true when it did so.
  const modified = markdownFiles.filter((candidate) => processFile(candidate, taxonomyMapping));

  if (modified.length === 0) {
    console.log("Tous les articles sont déjà correctement liés.");
    return;
  }

  modified.forEach((candidate) => {
    console.log(`✏️ ${path.relative(PROJECT_ROOT, candidate)}`);
  });
  console.log("Des modifications ont été effectuées. Merci de les revoir.");
  // Non-zero status so that a calling script stops and the changes get reviewed.
  process.exit(2);
}
/**
 * Links the frontmatter keywords of one Markdown file inside its body.
 *
 * The frontmatter itself is never rewritten: only the part of the file that
 * follows the closing delimiter is replaced.
 *
 * @param {string} filePath - Path of the article to process.
 * @param {{fieldToCanonical: Map<string, string>}} taxonomyMapping - Mapping returned by loadTaxonomyMapping().
 * @returns {boolean} True when the file was rewritten; false when it could not
 *   be read or parsed, has no frontmatter, has no keywords, or needed no change.
 */
function processFile(filePath, taxonomyMapping) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Impossible de lire ${filePath}: ${error.message}`);
    return false;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    return false;
  }
  const frontmatterRaw = match[1];
  const originalBody = match[2];

  let frontmatter;
  try {
    frontmatter = yaml.load(frontmatterRaw) || {};
  } catch (error) {
    console.warn(`⚠️ Frontmatter invalide dans ${filePath}: ${error.message}`);
    return false;
  }

  const keywords = extractKeywords(frontmatter, frontmatterRaw, taxonomyMapping.fieldToCanonical);
  if (keywords.length === 0) {
    return false;
  }

  const { body, changed } = linkKeywordsInBody(originalBody, keywords);
  if (!changed) {
    return false;
  }

  // Everything before the body (delimiters + frontmatter) is kept byte-for-byte.
  const prefixLength = raw.length - originalBody.length;
  fs.writeFileSync(filePath, raw.slice(0, prefixLength) + body, "utf8");
  return true;
}

/**
 * Reads the taxonomy configuration and builds the lookup from frontmatter
 * field names to canonical taxonomy names.
 *
 * Each entry of the YAML mapping is read as `singular: plural`; the plural
 * (or the singular when the plural is not a non-empty string) is the canonical
 * name, and both spellings are registered as accepted field names.
 *
 * Terminates the process with exit status 1 when the file cannot be read, is
 * not valid YAML, is not a mapping, or defines no taxonomy.
 *
 * @param {string} configPath - Path of the taxonomies YAML file.
 * @returns {{fieldToCanonical: Map<string, string>}} Field name to canonical name lookup.
 */
function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    console.error(`Impossible de lire ${configPath}: ${error.message}`);
    process.exit(1);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    console.error(`YAML invalide dans ${configPath}: ${error.message}`);
    process.exit(1);
  }

  // A YAML list is also typeof "object"; its entries would be keyed "0", "1", …
  // and silently registered as field names, so it is rejected like any other
  // non-mapping document.
  if (typeof data !== "object" || data === null || Array.isArray(data)) {
    console.error(`Format inattendu dans ${configPath}`);
    process.exit(1);
  }

  const fieldToCanonical = new Map();
  for (const [singular, plural] of Object.entries(data)) {
    const hasPlural = typeof plural === "string" && plural.trim().length > 0;
    const canonicalName = hasPlural ? plural.trim() : singular.trim();
    if (!canonicalName) continue;
    // Both the singular and the canonical spelling may be used as frontmatter keys.
    const candidates = new Set([singular, canonicalName].filter(Boolean));
    for (const name of candidates) {
      fieldToCanonical.set(name, canonicalName);
    }
  }

  if (fieldToCanonical.size === 0) {
    console.error("Aucune taxonomie n'est définie dans la configuration.");
    process.exit(1);
  }

  return { fieldToCanonical };
}
/**
 * Lists every Markdown file found below a directory.
 *
 * @param {string} root - Directory to scan recursively.
 * @returns {string[]} Paths of the `.md` files, ordered with String#localeCompare.
 */
function collectMarkdownFiles(root) {
  const found = [];
  walk(root, found);
  found.sort((left, right) => left.localeCompare(right));
  return found;
}

/**
 * Recursively appends the Markdown files of a directory to `bucket`.
 *
 * Entries named `.git` or `node_modules` are skipped. A directory that cannot
 * be listed is reported with a warning and ignored.
 *
 * @param {string} dir - Directory to list.
 * @param {string[]} bucket - Array receiving the paths that were found.
 */
function walk(dir, bucket) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Impossible de parcourir ${dir}: ${error.message}`);
    return;
  }

  const skipped = [".git", "node_modules"];
  entries
    .filter((entry) => !skipped.includes(entry.name))
    .forEach((entry) => {
      const absolute = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        walk(absolute, bucket);
        return;
      }
      const isMarkdown = entry.name.toLowerCase().endsWith(".md");
      if (entry.isFile() && isMarkdown) {
        bucket.push(absolute);
      }
    });
}

/**
 * Builds the list of keywords to link for one article.
 *
 * Terms come from the parsed frontmatter fields that map to a taxonomy, then
 * from the commented list items returned by extractCommentedTerms(). A term is
 * kept once per taxonomy (case-insensitive) and only when it has a non-empty slug.
 *
 * @param {object} frontmatter - Parsed frontmatter.
 * @param {string} frontmatterRaw - Frontmatter source text.
 * @param {Map<string, string>} fieldToCanonical - Field name to canonical taxonomy name.
 * @returns {{taxonomy: string, term: string, url: string}[]} Keywords in discovery order.
 */
function extractKeywords(frontmatter, frontmatterRaw, fieldToCanonical) {
  const keywords = [];
  const seen = new Set();

  const register = (taxonomy, term) => {
    if (!taxonomy || typeof term !== "string") return;
    const label = term.trim();
    if (!label) return;
    const dedupeKey = `${taxonomy}::${label.toLowerCase()}`;
    if (seen.has(dedupeKey)) return;
    const slug = slugify(label);
    if (!slug) return;
    seen.add(dedupeKey);
    keywords.push({ taxonomy, term: label, url: `/${taxonomy}/${slug}/` });
  };

  if (frontmatter !== null && typeof frontmatter === "object") {
    Object.entries(frontmatter).forEach(([field, value]) => {
      const canonical = fieldToCanonical.get(field);
      if (!canonical) return;
      normalizeTerms(value).forEach((term) => register(canonical, term));
    });
  }

  extractCommentedTerms(frontmatterRaw, fieldToCanonical).forEach((entry) => {
    register(entry.taxonomy, entry.term);
  });

  return keywords;
}
/**
 * Normalises a frontmatter value into a list of non-empty trimmed strings.
 *
 * @param {*} value - A string, an array, or anything else.
 * @returns {string[]} Trimmed strings; non-string items are dropped.
 */
function normalizeTerms(value) {
  if (Array.isArray(value)) {
    return value.map((item) => normalizeTerm(item)).filter(Boolean);
  }
  const single = normalizeTerm(value);
  return single ? [single] : [];
}

/**
 * Trims a single term.
 *
 * @param {*} value - Candidate term.
 * @returns {string|null} The trimmed string, or null for non-strings and blank strings.
 */
function normalizeTerm(value) {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}

/**
 * Extracts the commented-out list items (`# - term`) written below a taxonomy
 * field in the raw frontmatter.
 *
 * A comment is attributed to the most recent `field:` line, provided that the
 * field maps to a taxonomy and the comment is indented deeper than the field.
 * Surrounding single or double quotes are removed from the term, and field
 * names may contain hyphens.
 *
 * @param {string} frontmatterRaw - Frontmatter source text.
 * @param {Map<string, string>} fieldToCanonical - Field name to canonical taxonomy name.
 * @returns {{taxonomy: string, term: string}[]} Terms in source order.
 */
function extractCommentedTerms(frontmatterRaw, fieldToCanonical) {
  if (typeof frontmatterRaw !== "string" || frontmatterRaw.length === 0) {
    return [];
  }

  const results = [];
  let currentCanonical = null;
  let currentIndent = 0;

  for (const line of frontmatterRaw.split(/\r?\n/)) {
    // /^\s*/ always matches, possibly with an empty string.
    const indent = line.match(/^\s*/)[0].length;

    // A key with no inline value (an optional trailing comment is allowed).
    const fieldMatch = line.match(/^\s*([A-Za-z0-9_-]+):\s*(?:#.*)?$/);
    if (fieldMatch) {
      currentCanonical = fieldToCanonical.get(fieldMatch[1]) || null;
      currentIndent = indent;
      continue;
    }

    if (!currentCanonical) continue;
    const commentMatch = line.match(/^\s*#\s*-\s+(.*)$/);
    if (!commentMatch) continue;
    if (indent <= currentIndent) continue;

    let term = commentMatch[1].trim();
    // `# - "Some term"` must yield `Some term`: the quotes are YAML syntax,
    // not part of the keyword that is searched for in the body.
    const quote = term[0];
    if (term.length >= 2 && (quote === '"' || quote === "'") && term.endsWith(quote)) {
      term = term.slice(1, -1).trim();
    }
    if (!term) continue;
    results.push({ taxonomy: currentCanonical, term });
  }

  return results;
}

/**
 * Wraps the first linkable occurrence of each keyword in a Markdown link.
 *
 * Keywords are handled in the given order. The ranges of existing links are
 * recomputed after every insertion so that a freshly created link is never
 * linked again by a later keyword.
 *
 * @param {string} body - Markdown body of the article.
 * @param {{term: string, url: string}[]} keywords - Keywords to link.
 * @returns {{body: string, changed: boolean}} The resulting body and whether it differs.
 */
function linkKeywordsInBody(body, keywords) {
  if (typeof body !== "string" || body.length === 0 || keywords.length === 0) {
    return { body, changed: false };
  }

  let updated = body;
  let changed = false;
  let linkRanges = computeLinkRanges(updated);

  for (const keyword of keywords) {
    const occurrence = findKeywordOccurrence(updated, keyword.term, linkRanges);
    if (!occurrence) continue;

    // Pull surrounding emphasis markers into the label so that `**term**`
    // becomes `[**term**](url)`.
    const expanded = includeFormattingCharacters(updated, occurrence.start, occurrence.end);
    const before = updated.slice(0, expanded.start);
    const label = updated.slice(expanded.start, expanded.end);
    const after = updated.slice(expanded.end);

    updated = `${before}[${label}](${keyword.url})${after}`;
    changed = true;
    linkRanges = computeLinkRanges(updated);
  }

  return { body: updated, changed };
}
/**
 * Finds the first occurrence of a keyword that may be turned into a link.
 *
 * The search is case-insensitive. An occurrence is rejected when it overlaps
 * an existing link (text or destination) or when it is glued to a neighbouring
 * letter or digit.
 *
 * @param {string} text - Text to search.
 * @param {string} keyword - Literal keyword (escaped before being compiled).
 * @param {object[]} linkRanges - Ranges returned by computeLinkRanges().
 * @returns {{start: number, end: number, text: string}|null} Match position, or null.
 */
function findKeywordOccurrence(text, keyword, linkRanges) {
  if (!keyword) return null;
  const escaped = escapeRegExp(keyword);
  if (!escaped) return null;

  const regex = new RegExp(escaped, "giu");
  let match;
  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (isInsideExistingLink(start, end, linkRanges)) {
      continue;
    }
    if (!hasWordBoundaries(text, start, end)) {
      continue;
    }
    return { start, end, text: match[0] };
  }
  return null;
}

/**
 * Locates every `[text](destination)` and `![alt](destination)` construct.
 *
 * Whitespace between `]` and `(` is tolerated. Offsets are half-open:
 * `textStart`/`destStart` point just after the opening delimiter and
 * `textEnd`/`destEnd` point at the closing delimiter.
 *
 * @param {string} text - Markdown text.
 * @returns {{textStart: number, textEnd: number, destStart: number, destEnd: number, isImage: boolean}[]}
 *   Ranges in document order.
 */
function computeLinkRanges(text) {
  const ranges = [];
  if (typeof text !== "string" || text.length === 0) {
    return ranges;
  }

  for (let i = 0; i < text.length; i++) {
    let isImage = false;
    if (text[i] === "!" && text[i + 1] === "[") {
      isImage = true;
      i += 1;
    }
    if (text[i] !== "[") continue;

    const openBracket = i;
    const closeBracket = findMatchingPair(text, openBracket, "[", "]");
    if (closeBracket === -1) continue;

    let pointer = closeBracket + 1;
    while (pointer < text.length && /\s/.test(text[pointer])) pointer++;
    if (pointer >= text.length || text[pointer] !== "(") {
      // Bracketed text that is not followed by a destination: not a link.
      i = closeBracket;
      continue;
    }

    const openParen = pointer;
    const closeParen = findMatchingPair(text, openParen, "(", ")");
    if (closeParen === -1) {
      // An unbalanced "(" only disqualifies this candidate. Scanning resumes
      // after the bracket so that the links that follow are still recorded.
      i = closeBracket;
      continue;
    }

    ranges.push({
      textStart: openBracket + 1,
      textEnd: closeBracket,
      destStart: openParen + 1,
      destEnd: closeParen,
      isImage,
    });
    i = closeParen;
  }

  return ranges;
}

/**
 * Returns the index of the delimiter closing the one found at `startIndex`.
 *
 * Nested pairs are balanced and a backslash escapes the character after it.
 *
 * @param {string} text - Text to scan.
 * @param {number} startIndex - Index of the opening delimiter.
 * @param {string} openChar - Opening delimiter.
 * @param {string} closeChar - Closing delimiter.
 * @returns {number} Index of the matching closing delimiter, or -1 when unbalanced.
 */
function findMatchingPair(text, startIndex, openChar, closeChar) {
  let depth = 0;
  for (let i = startIndex; i < text.length; i++) {
    const ch = text[i];
    if (ch === "\\") {
      i++; // skip the escaped character
      continue;
    }
    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}
/**
 * Tells whether the span [start, end) overlaps the text or the destination of
 * any recorded link.
 *
 * @param {number} start - Inclusive start offset.
 * @param {number} end - Exclusive end offset.
 * @param {object[]} ranges - Ranges produced by computeLinkRanges().
 * @returns {boolean} True when the span touches an existing link.
 */
function isInsideExistingLink(start, end, ranges) {
  for (const range of ranges) {
    if (start < range.textEnd && end > range.textStart) {
      return true;
    }
    const hasDestination =
      typeof range.destStart === "number" && typeof range.destEnd === "number";
    if (hasDestination && start < range.destEnd && end > range.destStart) {
      return true;
    }
  }
  return false;
}

/**
 * Checks that the span [start, end) is not glued to an adjacent word character.
 *
 * An edge is only constrained when the span itself begins (or ends) with a
 * word character.
 *
 * @param {string} text - Full text.
 * @param {number} start - Inclusive start offset.
 * @param {number} end - Exclusive end offset.
 * @returns {boolean} True when both edges are acceptable.
 */
function hasWordBoundaries(text, start, end) {
  const preceding = start > 0 ? text[start - 1] : "";
  const following = end < text.length ? text[end] : "";

  const gluedBefore = isWordChar(text[start]) && isWordChar(preceding);
  const gluedAfter = isWordChar(text[end - 1]) && isWordChar(following);
  return !(gluedBefore || gluedAfter);
}

/**
 * @param {string|undefined} ch - A single character, possibly empty or undefined.
 * @returns {boolean} True when `ch` matches WORD_CHAR.
 */
function isWordChar(ch) {
  if (!ch) {
    return false;
  }
  return WORD_CHAR.test(ch);
}

/**
 * Turns a term into a URL slug: diacritics removed, lower-cased, every run of
 * characters outside a-z / 0-9 replaced by a single hyphen, no leading or
 * trailing hyphen.
 *
 * @param {string} value - Term to convert.
 * @returns {string} The slug (empty when nothing usable remains).
 */
function slugify(value) {
  const unaccented = value.normalize("NFD").replace(/\p{Diacritic}/gu, "");
  const hyphenated = unaccented.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  return hyphenated.replace(/^-+|-+$/g, "").replace(/-{2,}/g, "-");
}

/**
 * Escapes the regular-expression metacharacters of a literal string.
 *
 * @param {string} value - Literal text.
 * @returns {string} Text that can be embedded safely in a RegExp source.
 */
function escapeRegExp(value) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metaCharacters, "\\$&");
}

/**
 * Widens the span [start, end) over the emphasis markers that surround it on
 * both sides, so that the markers end up inside the link label.
 *
 * Markers are processed in the order of INLINE_FORMATTING_CHARS; for each one
 * the span grows by the number of markers present on BOTH sides.
 *
 * @param {string} text - Full text.
 * @param {number} start - Inclusive start offset.
 * @param {number} end - Exclusive end offset.
 * @returns {{start: number, end: number}} The widened span.
 */
function includeFormattingCharacters(text, start, end) {
  let newStart = start;
  let newEnd = end;

  for (const marker of INLINE_FORMATTING_CHARS) {
    let left = newStart;
    while (left > 0 && text[left - 1] === marker) {
      left--;
    }

    let right = newEnd;
    while (right < text.length && text[right] === marker) {
      right++;
    }

    const pairs = Math.min(newStart - left, right - newEnd);
    newStart -= pairs;
    newEnd += pairs;
  }

  return { start: newStart, end: newEnd };
}

/**
 * @param {*} line - A line of text.
 * @returns {number} Number of leading whitespace characters (0 for non-strings).
 */
function getIndentation(line) {
  if (typeof line !== "string" || line.length === 0) {
    return 0;
  }
  const leading = line.match(/^\s*/);
  return leading ? leading[0].length : 0;
}
/**
 * Entry point of the taxonomy propagation script.
 *
 * Parses the command line, catalogues the taxonomy terms already present in
 * the frontmatter of the selected articles, then adds to each article the
 * catalogued terms that appear in its body.
 *
 * Exit status: 1 on a configuration error, 2 when files were rewritten,
 * 0 otherwise (including dry runs).
 */
function main() {
  const { options, targets } = parseArgs(process.argv.slice(2));

  let taxonomyMapping;
  try {
    taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_PATH);
  } catch (error) {
    // loadTaxonomyMapping() throws on unreadable or malformed configuration;
    // report it like the other fatal errors instead of dumping a stack trace.
    console.error(`❌ ${error.message}`);
    process.exit(1);
  }

  if (taxonomyMapping.canonicalNames.length === 0) {
    console.error("❌ No taxonomies found in config/_default/taxonomies.yaml");
    process.exit(1);
  }

  const files = collectMarkdownFiles(targets);
  if (files.length === 0) {
    console.log("No Markdown content found to analyse.");
    return;
  }

  const articles = files
    .map((filePath) => parseArticle(filePath))
    .filter((article) => article !== null);
  if (articles.length === 0) {
    console.log("No articles with valid YAML frontmatter were found.");
    return;
  }

  const { catalog, totalKeywords } = buildKeywordCatalog(articles, taxonomyMapping);
  if (totalKeywords === 0) {
    console.log("No taxonomy keywords available to propagate.");
    return;
  }

  const keywordLabel = totalKeywords > 1 ? "keywords" : "keyword";
  const taxonomyLabel = catalog.size > 1 ? "taxonomies" : "taxonomy";
  console.log(`Catalogued ${totalKeywords} ${keywordLabel} across ${catalog.size} ${taxonomyLabel}.`);

  const modifications = applyTaxonomies(articles, catalog, taxonomyMapping, options);
  if (modifications.length === 0) {
    console.log("No taxonomy updates required.");
    if (options.dryRun) {
      console.log("Dry-run only: no files would be modified.");
    }
    return;
  }

  for (const change of modifications) {
    const relPath = path.relative(PROJECT_ROOT, change.path);
    console.log(`✏️ ${relPath}`);
    for (const [taxonomy, values] of change.additions.entries()) {
      console.log(` ${taxonomy}: ${values.join(", ")}`);
    }
  }

  if (options.dryRun) {
    console.log(`Dry-run complete. ${modifications.length} article(s) would be updated.`);
  } else {
    console.log(`Updated ${modifications.length} article(s).`);
    console.log(`Vérifier les modifications.`);
    // Non-zero status so that a calling script stops and the changes get reviewed.
    process.exit(2);
  }
}

/**
 * Parses the command-line arguments.
 *
 * `--dry-run` / `--check` enable dry-run mode; `--help` / `-h` print the usage
 * and exit with status 0; any other argument starting with "-" is an error
 * (usage printed, exit status 1); everything else is a target path.
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {{options: {dryRun: boolean}, targets: string[]}} Parsed options and targets.
 */
function parseArgs(argv) {
  const options = { dryRun: false };
  const targets = [];

  for (const arg of argv) {
    switch (arg) {
      case "--dry-run":
      case "--check":
        options.dryRun = true;
        break;
      case "--help":
      case "-h":
        showUsage();
        process.exit(0);
        break;
      default:
        if (arg.startsWith("-")) {
          console.error(`Unknown option: ${arg}`);
          showUsage();
          process.exit(1);
        } else {
          targets.push(arg);
        }
    }
  }

  return { options, targets };
}

/**
 * Prints the command-line help, including the option aliases accepted by
 * parseArgs().
 */
function showUsage() {
  console.log(`Usage: node tools/link_taxonomy_terms.js [--dry-run] [path...]

Options
  --dry-run, --check   Analyse files but do not rewrite anything
  --help, -h           Show this message

Examples
  node tools/link_taxonomy_terms.js --dry-run
  node tools/link_taxonomy_terms.js content/interets/paleontologie`);
}
/**
 * Reads the taxonomy configuration and builds the lookups between frontmatter
 * field names and canonical taxonomy names.
 *
 * Each entry of the YAML mapping is read as `singular: plural`; the trimmed
 * plural (or the singular when the plural is not a non-empty string) is the
 * canonical name, and both spellings are accepted as field names.
 *
 * @param {string} configPath - Path of the taxonomies YAML file.
 * @returns {{fieldToCanonical: Map<string, string>, canonicalToFields: Map<string, Set<string>>, canonicalNames: string[]}}
 *   The lookups and the list of canonical names.
 * @throws {Error} When the file cannot be read, is not valid YAML, or is not a mapping.
 */
function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    throw new Error(`Unable to read ${configPath}: ${error.message}`);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    throw new Error(`Invalid YAML in ${configPath}: ${error.message}`);
  }

  const isMapping = typeof data === "object" && !Array.isArray(data);
  if (!isMapping) {
    throw new Error(`Unexpected taxonomies format in ${configPath}`);
  }

  const fieldToCanonical = new Map();
  const canonicalToFields = new Map();

  Object.entries(data).forEach(([singular, plural]) => {
    const hasPlural = typeof plural === "string" && plural.trim().length > 0;
    const canonicalName = hasPlural ? plural.trim() : singular;
    if (!canonicalName) return;

    const aliases = new Set([singular, canonicalName].filter(Boolean));
    aliases.forEach((alias) => {
      fieldToCanonical.set(alias, canonicalName);
      let fields = canonicalToFields.get(canonicalName);
      if (!fields) {
        fields = new Set();
        canonicalToFields.set(canonicalName, fields);
      }
      fields.add(alias);
    });
  });

  return {
    fieldToCanonical,
    canonicalToFields,
    canonicalNames: Array.from(canonicalToFields.keys()),
  };
}

/**
 * Resolves the command-line targets into a sorted, de-duplicated list of
 * Markdown files.
 *
 * Without targets the whole content root is scanned. Targets are resolved
 * against the project root; missing paths are reported and skipped,
 * directories are scanned recursively, and files are kept only when their
 * name ends with `.md` (case-insensitive).
 *
 * @param {string[]} targets - Paths given on the command line.
 * @returns {string[]} Absolute paths in default Array#sort order.
 */
function collectMarkdownFiles(targets) {
  const files = new Set();

  if (targets.length === 0) {
    walkContentTree(CONTENT_ROOT, files);
    return Array.from(files).sort();
  }

  targets.forEach((target) => {
    const absolute = path.resolve(PROJECT_ROOT, target);
    if (!fs.existsSync(absolute)) {
      console.warn(`⚠️ Skipping missing path: ${target}`);
      return;
    }

    const stats = fs.statSync(absolute);
    if (stats.isDirectory()) {
      walkContentTree(absolute, files);
      return;
    }
    if (stats.isFile() && absolute.toLowerCase().endsWith(".md")) {
      files.add(absolute);
    }
  });

  return Array.from(files).sort();
}
/**
 * Recursively adds the Markdown files of a directory to `fileSet`.
 *
 * Directories named `.git` or `node_modules` are not entered. A directory
 * that cannot be listed is reported with a warning and ignored.
 *
 * @param {string} dir - Directory to list.
 * @param {Set<string>} fileSet - Set receiving the paths that were found.
 */
function walkContentTree(dir, fileSet) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Cannot read ${dir}: ${error.message}`);
    return;
  }

  const skippedDirectories = [".git", "node_modules"];
  entries.forEach((entry) => {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (!skippedDirectories.includes(entry.name)) {
        walkContentTree(fullPath, fileSet);
      }
      return;
    }
    if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
      fileSet.add(fullPath);
    }
  });
}

/**
 * Reads a Markdown file and splits it into frontmatter and body.
 *
 * @param {string} filePath - Path of the article.
 * @returns {{path: string, frontmatter: object, frontmatterRaw: string, body: string}|null}
 *   The parsed article, or null (after a warning) when the file cannot be
 *   read, has no frontmatter block, or its frontmatter is not a YAML mapping.
 */
function parseArticle(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Unable to read ${filePath}: ${error.message}`);
    return null;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    console.warn(`⚠️ ${path.relative(PROJECT_ROOT, filePath)} is missing YAML frontmatter. Skipping.`);
    return null;
  }
  const frontmatterRaw = match[1];

  let data = {};
  try {
    data = yaml.load(frontmatterRaw) || {};
  } catch (error) {
    console.warn(`⚠️ Failed to parse frontmatter in ${filePath}: ${error.message}`);
    return null;
  }

  const isMapping = typeof data === "object" && !Array.isArray(data);
  if (!isMapping) {
    console.warn(`⚠️ Unexpected frontmatter structure in ${filePath}. Skipping.`);
    return null;
  }

  return {
    path: filePath,
    frontmatter: data,
    frontmatterRaw,
    body: match[2] || "",
  };
}
/**
 * Collects, per taxonomy, the distinct terms used in the frontmatter of the
 * given articles together with the search pattern of each term.
 *
 * Terms are de-duplicated on their normalised form; the first spelling met is
 * kept. Entries are ordered with compareKeywords(). Terms for which
 * buildKeywordPattern() returns nothing are dropped, as are taxonomies left
 * without any entry.
 *
 * @param {{frontmatter: object}[]} articles - Parsed articles.
 * @param {{canonicalNames: string[], fieldToCanonical: Map<string, string>}} taxonomyMapping - Taxonomy lookups.
 * @returns {{catalog: Map<string, {value: string, pattern: string}[]>, totalKeywords: number}}
 *   The catalogue and the total number of entries it holds.
 */
function buildKeywordCatalog(articles, taxonomyMapping) {
  // canonical taxonomy -> (normalised term -> first spelling encountered)
  const spellingsByTaxonomy = new Map(
    taxonomyMapping.canonicalNames.map((canonical) => [canonical, new Map()])
  );

  for (const { frontmatter } of articles) {
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = taxonomyMapping.fieldToCanonical.get(field);
      if (!canonical) continue;

      const spellings = spellingsByTaxonomy.get(canonical);
      for (const spelling of toStringArray(value)) {
        const key = normalizeTerm(spelling);
        if (key && !spellings.has(key)) {
          spellings.set(key, spelling);
        }
      }
    }
  }

  const catalog = new Map();
  let totalKeywords = 0;

  for (const [canonical, spellings] of spellingsByTaxonomy) {
    const entries = Array.from(spellings.values())
      .sort(compareKeywords)
      .map((value) => ({ value, pattern: buildKeywordPattern(value) }))
      .filter((entry) => Boolean(entry.pattern));
    if (entries.length === 0) continue;

    totalKeywords += entries.length;
    catalog.set(canonical, entries);
  }

  return { catalog, totalKeywords };
}

/**
 * Adds to each article the catalogued terms found in its body.
 *
 * For every article, all catalogue entries are tried in compareKeywords()
 * order. A term is added to the article's taxonomy field when its pattern
 * matches a free span of the body, the term is not already listed, and it is
 * not listed as ignored in the raw frontmatter. The frontmatter object of the
 * article is mutated in memory; the file is rewritten unless
 * `options.dryRun` is set.
 *
 * @param {object[]} articles - Articles returned by parseArticle().
 * @param {Map<string, {value: string, pattern: string}[]>} catalog - Catalogue from buildKeywordCatalog().
 * @param {object} taxonomyMapping - Taxonomy lookups.
 * @param {{dryRun: boolean}} options - Run options.
 * @returns {{path: string, additions: Map<string, string[]>}[]} One record per modified article.
 */
function applyTaxonomies(articles, catalog, taxonomyMapping, options) {
  const changes = [];

  for (const article of articles) {
    const additions = new Map();
    const occupiedRanges = [];
    const ignoredKeywords = extractIgnoredKeywords(article.frontmatterRaw, taxonomyMapping);

    // One task per catalogue entry; tasks of a taxonomy share the same state object.
    const keywordTasks = [];
    for (const [canonical, keywordEntries] of catalog.entries()) {
      if (keywordEntries.length === 0) continue;

      const fieldName = resolveFieldName(article.frontmatter, canonical, taxonomyMapping);
      const currentValues = toStringArray(article.frontmatter[fieldName]);
      const state = {
        canonical,
        fieldName,
        currentValues,
        normalizedExisting: new Set(currentValues.map((value) => normalizeTerm(value))),
      };

      keywordEntries.forEach(({ value, pattern }) => {
        keywordTasks.push({ canonical, value, pattern, state });
      });
    }
    keywordTasks.sort((left, right) => compareKeywords(left.value, right.value));

    const urlRanges = collectMarkdownUrlRanges(article.body);
    const searchableBody = normalizeTypographyForSearch(article.body);

    for (const { state, canonical, value, pattern } of keywordTasks) {
      const matchRange = findAvailableMatchRange(
        new RegExp(pattern, "gu"),
        searchableBody,
        occupiedRanges,
        urlRanges
      );
      if (!matchRange) continue;

      // The span is reserved whether or not the keyword ends up being added.
      const skipped = shouldSkipSingleWordMatch(value, article.body, matchRange);
      occupiedRanges.push(matchRange);
      if (skipped) continue;

      const normalized = normalizeTerm(value);
      if (state.normalizedExisting.has(normalized)) continue;
      if (isIgnoredKeyword(canonical, normalized, ignoredKeywords)) continue;

      state.currentValues.push(value);
      state.normalizedExisting.add(normalized);
      article.frontmatter[state.fieldName] = state.currentValues;

      const added = additions.get(canonical);
      if (added) {
        added.push(value);
      } else {
        additions.set(canonical, [value]);
      }
    }

    // `additions` receives an entry exactly when the frontmatter was changed.
    if (additions.size > 0) {
      if (!options.dryRun) {
        writeArticle(article);
      }
      changes.push({ path: article.path, additions });
    }
  }

  return changes;
}
/**
 * Picks the frontmatter key under which a taxonomy is stored in an article.
 *
 * @param {object} frontmatter - Parsed frontmatter of the article.
 * @param {string} canonicalName - Canonical taxonomy name.
 * @param {{canonicalToFields: Map<string, Set<string>>}} taxonomyMapping - Taxonomy lookups.
 * @returns {string} The first existing frontmatter key that is an accepted
 *   spelling of the taxonomy, or `canonicalName` when there is none.
 */
function resolveFieldName(frontmatter, canonicalName, taxonomyMapping) {
  const acceptedNames = taxonomyMapping.canonicalToFields.get(canonicalName);
  if (!acceptedNames) {
    return canonicalName;
  }
  const existingKey = Object.keys(frontmatter).find((key) => acceptedNames.has(key));
  return existingKey === undefined ? canonicalName : existingKey;
}
/**
 * Rewrites an article with its updated frontmatter.
 *
 * The frontmatter is regenerated from the parsed object, which carries no
 * comments. Commented list items (`# - term`) are the opt-out markers read by
 * the ignore-list scan of the raw frontmatter, so they are copied back from
 * `article.frontmatterRaw`; otherwise the first rewrite would erase the ignore
 * list and the next run would add the ignored terms.
 * NOTE(review): other comments and the original YAML formatting are still not
 * preserved by this regeneration.
 *
 * @param {{path: string, frontmatter: object, frontmatterRaw: string, body: string}} article - Article to write.
 */
function writeArticle(article) {
  const dumped = yaml.dump(article.frontmatter, { lineWidth: 120, sortKeys: false });
  const yamlContent = restoreCommentedListItems(article.frontmatterRaw, dumped);
  const finalBody = article.body || "";
  const next = `---\n${yamlContent}---\n${finalBody}`;
  fs.writeFileSync(article.path, next, "utf8");
}

/**
 * Copies the commented list items of the original frontmatter into a freshly
 * generated YAML document.
 *
 * Only comments that follow a column-0 `field:` line with no inline value are
 * considered. Each one is re-inserted, unchanged, at the end of the block of
 * the same top-level field in `dumpedYaml`. A regenerated `field: null` line
 * is written back as `field:` so that the comments below it are still
 * attached to a field without inline value.
 *
 * @param {string} rawFrontmatter - Original frontmatter text.
 * @param {string} dumpedYaml - Regenerated YAML text.
 * @returns {string} `dumpedYaml` with the preserved comment lines inserted.
 */
function restoreCommentedListItems(rawFrontmatter, dumpedYaml) {
  if (typeof rawFrontmatter !== "string" || rawFrontmatter.length === 0) {
    return dumpedYaml;
  }

  const topLevelKey = /^([A-Za-z0-9_-]+):\s*(.*)$/;
  const commentedItem = /^\s*#\s*-\s*\S/;

  // Pass 1: collect the commented items of each top-level field.
  const preserved = new Map();
  let currentField = null;
  for (const line of rawFrontmatter.split(/\r?\n/)) {
    const keyMatch = line.match(topLevelKey);
    if (keyMatch) {
      currentField = keyMatch[2].trim().length === 0 ? keyMatch[1] : null;
      continue;
    }
    if (currentField && commentedItem.test(line)) {
      if (!preserved.has(currentField)) {
        preserved.set(currentField, []);
      }
      preserved.get(currentField).push(line);
    }
  }
  if (preserved.size === 0) {
    return dumpedYaml;
  }

  // Pass 2: append them at the end of the matching block of the new document.
  const lines = dumpedYaml.split("\n");
  const endsWithNewline = lines.length > 0 && lines[lines.length - 1] === "";
  if (endsWithNewline) {
    lines.pop();
  }

  const output = [];
  let pending = null;
  const flush = () => {
    if (pending) {
      output.push(...pending);
      pending = null;
    }
  };

  for (const line of lines) {
    // A non-empty line starting at column 0 opens a new top-level entry.
    const opensEntry = line.length > 0 && !/^\s/.test(line);
    if (opensEntry) {
      flush();
    }
    const keyMatch = opensEntry ? line.match(topLevelKey) : null;
    if (keyMatch && preserved.has(keyMatch[1])) {
      pending = preserved.get(keyMatch[1]);
      output.push(keyMatch[2].trim() === "null" ? `${keyMatch[1]}:` : line);
      continue;
    }
    output.push(line);
  }
  flush();

  return output.join("\n") + (endsWithNewline ? "\n" : "");
}

/**
 * Converts a frontmatter value into a list of non-empty strings.
 *
 * @param {*} value - A scalar or an array of scalars.
 * @returns {string[]} Strings obtained with transformToString(), empty ones removed.
 */
function toStringArray(value) {
  const candidates = Array.isArray(value) ? value : [value];
  return candidates
    .map((entry) => transformToString(entry))
    .filter((entry) => entry.length > 0);
}

/**
 * Converts a scalar to a string.
 *
 * @param {*} value - Value to convert.
 * @returns {string} The trimmed string, the decimal form of a number, or ""
 *   for null, undefined and every other type.
 */
function transformToString(value) {
  if (typeof value === "string") {
    return value.trim();
  }
  if (typeof value === "number") {
    return String(value);
  }
  return "";
}

/**
 * Produces the comparison key of a term: string form, NFKC-normalised and
 * lower-cased with French locale rules.
 *
 * @param {*} value - Term to normalise.
 * @returns {string} The comparison key ("" when the value is not usable).
 */
function normalizeTerm(value) {
  return transformToString(value).normalize("NFKC").toLocaleLowerCase("fr");
}

/**
 * Sort comparator placing longer keywords first; equal lengths fall back to
 * the module-level collator.
 *
 * @param {string} a - First keyword.
 * @param {string} b - Second keyword.
 * @returns {number} Negative, zero or positive, as expected by Array#sort.
 */
function compareKeywords(a, b) {
  const lengthDifference = b.length - a.length;
  return lengthDifference !== 0 ? lengthDifference : collator.compare(a, b);
}

/**
 * Escapes the regular-expression metacharacters of a literal string.
 *
 * @param {string} value - Literal text.
 * @returns {string} Text that can be embedded safely in a RegExp source.
 */
function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
/**
 * Builds the regular-expression source matching the characters that follow
 * the first one of a keyword.
 *
 * Every run of whitespace becomes `\s+`; any other character is escaped and
 * matched literally.
 *
 * @param {string[]} characters - Characters of the keyword after the first one.
 * @returns {string} Regular-expression source ("" for an empty list).
 */
function buildRemainingPattern(characters) {
  if (characters.length === 0) {
    return "";
  }

  let pattern = "";
  let previousWasWhitespace = false;

  for (const char of characters) {
    if (/\s/u.test(char)) {
      if (!previousWasWhitespace) {
        pattern += "\\s+";
        previousWasWhitespace = true;
      }
      continue;
    }
    pattern += escapeRegExp(char);
    previousWasWhitespace = false;
  }

  return pattern;
}

/**
 * Returns the first match of `regex` that neither falls inside a link
 * destination nor partially overlaps a span that is already taken.
 *
 * @param {RegExp} regex - Global regular expression; its lastIndex is reset.
 * @param {string} text - Text to search.
 * @param {number[][]} occupiedRanges - `[start, end)` spans already claimed.
 * @param {number[][]} urlRanges - `[start, end)` spans of link destinations.
 * @returns {number[]|null} The `[start, end)` span of the match, or null.
 */
function findAvailableMatchRange(regex, text, occupiedRanges, urlRanges) {
  regex.lastIndex = 0;
  let match;
  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (rangeOverlaps(urlRanges, start, end)) {
      continue;
    }
    if (!overlapsExistingRange(occupiedRanges, start, end)) {
      return [start, end];
    }
  }
  return null;
}

/**
 * Tells whether `[start, end)` partially overlaps one of the given spans.
 * A span with exactly the same bounds is not counted as an overlap.
 *
 * @param {number[][]} ranges - `[start, end)` spans.
 * @param {number} start - Inclusive start offset.
 * @param {number} end - Exclusive end offset.
 * @returns {boolean} True on a partial overlap.
 */
function overlapsExistingRange(ranges, start, end) {
  for (const [existingStart, existingEnd] of ranges) {
    if (start === existingStart && end === existingEnd) {
      continue;
    }
    if (start < existingEnd && end > existingStart) {
      return true;
    }
  }
  return false;
}

/**
 * Locates the destination part of every inline Markdown link or image
 * (`[text](destination)`).
 *
 * @param {string} markdown - Markdown text.
 * @returns {number[][]} `[start, end)` spans covering each destination.
 */
function collectMarkdownUrlRanges(markdown) {
  const ranges = [];
  if (!markdown) {
    return ranges;
  }

  const linkPattern = /\[[^\]]*\]\(([^)]+)\)/g;
  let match;
  while ((match = linkPattern.exec(markdown)) !== null) {
    // The match always ends with ")" and the destination sits right before it.
    // Measuring from the end stays correct when the link text itself contains
    // "(", which a search for the first "(" of the match would pick instead.
    const urlEnd = match.index + match[0].length - 1;
    const urlStart = urlEnd - match[1].length;
    ranges.push([urlStart, urlEnd]);
  }
  return ranges;
}
/**
 * Tells whether `[start, end)` overlaps any of the given spans.
 *
 * @param {number[][]} ranges - `[start, end)` spans.
 * @param {number} start - Inclusive start offset.
 * @param {number} end - Exclusive end offset.
 * @returns {boolean} True when at least one span overlaps.
 */
function rangeOverlaps(ranges, start, end) {
  return ranges.some(([rangeStart, rangeEnd]) => start < rangeEnd && end > rangeStart);
}

/**
 * Replaces the `*` and `_` characters with spaces, one for one, so that
 * offsets in the result equal offsets in the input.
 *
 * @param {string} text - Markdown text.
 * @returns {string} The text without emphasis markers ("" for a falsy input).
 */
function normalizeTypographyForSearch(text) {
  return text ? text.replace(/[*_]/g, " ") : "";
}

/**
 * Tells whether the match of a single-word keyword is immediately followed by
 * whitespace, one character of the class `[A-Z\u00C0-\u017F]` and a period.
 * NOTE(review): this looks like a guard against matching the first word of a
 * name followed by an initial ("Word X. ...") — confirm the intent.
 *
 * @param {string} keyword - Keyword that matched.
 * @param {string} body - Original article body.
 * @param {number[]} range - `[start, end)` span of the match.
 * @returns {boolean} True when the match has to be discarded; always false
 *   for empty keywords and keywords containing whitespace.
 */
function shouldSkipSingleWordMatch(keyword, body, range) {
  const isSingleWord = Boolean(keyword) && !/\s/.test(keyword);
  if (!isSingleWord) {
    return false;
  }
  const tail = body.slice(range[1]);
  return /^\s+[A-Z\u00C0-\u017F]\./u.test(tail);
}

/**
 * Reads the opt-out list of an article: commented list items (`# - term`)
 * placed below a taxonomy field that has no inline value.
 *
 * Surrounding single or double quotes are removed, and each term is stored in
 * its normalised form under the canonical name of the taxonomy.
 *
 * @param {string} rawFrontmatter - Frontmatter source text.
 * @param {{fieldToCanonical: Map<string, string>}} taxonomyMapping - Taxonomy lookups.
 * @returns {Map<string, Set<string>>} Canonical taxonomy name to ignored normalised terms.
 */
function extractIgnoredKeywords(rawFrontmatter, taxonomyMapping) {
  const ignoreMap = new Map();
  if (!rawFrontmatter) {
    return ignoreMap;
  }

  let currentField = null;

  rawFrontmatter.split(/\r?\n/).forEach((line) => {
    if (line.trim().length === 0) {
      return;
    }

    // A column-0 key: it becomes the current field only when it has no inline value.
    const fieldMatch = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    if (fieldMatch && !line.trimStart().startsWith("#")) {
      const hasInlineValue = fieldMatch[2].trim().length > 0;
      currentField = hasInlineValue ? null : fieldMatch[1];
      return;
    }

    if (!currentField) {
      return;
    }

    const commentMatch = line.match(/^\s*#\s*-\s*(.+?)\s*$/);
    if (!commentMatch) {
      return;
    }

    let value = commentMatch[1].trim();
    if (!value) {
      return;
    }
    const doubleQuoted = value.startsWith('"') && value.endsWith('"');
    const singleQuoted = value.startsWith("'") && value.endsWith("'");
    if (doubleQuoted || singleQuoted) {
      value = value.slice(1, -1).trim();
    }
    if (!value) {
      return;
    }

    const canonical = taxonomyMapping.fieldToCanonical.get(currentField);
    if (!canonical) {
      return;
    }

    const normalized = normalizeTerm(value);
    if (!normalized) {
      return;
    }

    const ignored = ignoreMap.get(canonical);
    if (ignored) {
      ignored.add(normalized);
    } else {
      ignoreMap.set(canonical, new Set([normalized]));
    }
  });

  return ignoreMap;
}
/**
 * Tells whether a term is on the opt-out list of a taxonomy.
 *
 * @param {string} canonical - Canonical taxonomy name.
 * @param {string} normalizedValue - Normalised term.
 * @param {Map<string, Set<string>>} ignoreMap - Map returned by extractIgnoredKeywords().
 * @returns {boolean} True when the term is ignored for that taxonomy.
 */
function isIgnoredKeyword(canonical, normalizedValue, ignoreMap) {
  if (!canonical || !normalizedValue) {
    return false;
  }
  const ignored = ignoreMap.get(canonical);
  return ignored ? ignored.has(normalizedValue) : false;
}