#!/usr/bin/env node
/**
 * Automatically attaches taxonomy terms to Hugo articles by scanning the body
 * of each Markdown file for known keywords that already exist in frontmatter.
 *
 * Usage:
 *   node tools/link_taxonomy_terms.js [--dry-run] [paths...]
 *
 * Without arguments every Markdown file under content/ is processed.
 */

const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");

const PROJECT_ROOT = path.resolve(__dirname, "..");
const CONTENT_ROOT = path.join(PROJECT_ROOT, "content");
const TAXONOMIES_PATH = path.join(PROJECT_ROOT, "config", "_default", "taxonomies.yaml");

// Group 1 is the raw YAML frontmatter, group 2 is everything after the closing `---`.
const FRONTMATTER_PATTERN = /^---\n([\s\S]+?)\n---\n?([\s\S]*)$/;

// French collation, used as the tie-breaker when ordering keywords of equal length.
const collator = new Intl.Collator("fr", { sensitivity: "base", usage: "sort" });

/**
 * CLI entry point: loads the taxonomy configuration, parses the selected
 * Markdown files, builds the keyword catalog and applies (or, in dry-run
 * mode, only reports) the taxonomy additions.
 *
 * Exits with status 1 when no taxonomy is configured and with status 2 after
 * files have actually been rewritten.
 * NOTE(review): status 2 presumably lets a calling hook detect that files
 * changed — confirm against the caller.
 */
function main() {
  const { options, targets } = parseArgs(process.argv.slice(2));

  const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_PATH);
  if (taxonomyMapping.canonicalNames.length === 0) {
    console.error("❌ No taxonomies found in config/_default/taxonomies.yaml");
    process.exit(1);
  }

  const files = collectMarkdownFiles(targets);
  if (files.length === 0) {
    console.log("No Markdown content found to analyse.");
    return;
  }

  const articles = files
    .map((filePath) => parseArticle(filePath))
    .filter((article) => article !== null);
  if (articles.length === 0) {
    console.log("No articles with valid YAML frontmatter were found.");
    return;
  }

  const { catalog, totalKeywords } = buildKeywordCatalog(articles, taxonomyMapping);
  if (totalKeywords === 0) {
    console.log("No taxonomy keywords available to propagate.");
    return;
  }

  // "taxonomy" has an irregular plural, so the whole word is selected rather
  // than appending an "s" suffix.
  console.log(
    `Catalogued ${totalKeywords} keyword${totalKeywords > 1 ? "s" : ""} across ${
      catalog.size
    } ${catalog.size > 1 ? "taxonomies" : "taxonomy"}.`
  );

  const modifications = applyTaxonomies(articles, catalog, taxonomyMapping, options);
  if (modifications.length === 0) {
    console.log("No taxonomy updates required.");
    if (options.dryRun) {
      console.log("Dry-run only: no files would be modified.");
    }
    return;
  }

  for (const change of modifications) {
    const relPath = path.relative(PROJECT_ROOT, change.path);
    console.log(`✏️ ${relPath}`);
    for (const [taxonomy, values] of change.additions.entries()) {
      console.log(` ${taxonomy}: ${values.join(", ")}`);
    }
  }

  if (options.dryRun) {
    console.log(`Dry-run complete. ${modifications.length} article(s) would be updated.`);
  } else {
    console.log(`Updated ${modifications.length} article(s).`);
    console.log(`Vérifier les modifications.`);
    process.exit(2);
  }
}

/**
 * Parses command-line arguments.
 *
 * `--dry-run` and `--check` both enable dry-run mode, `--help`/`-h` prints the
 * usage and exits with status 0, any other dash-prefixed argument is rejected
 * with status 1, and everything else is collected as a target path.
 *
 * @param {string[]} argv Arguments following the script name.
 * @returns {{ options: { dryRun: boolean }, targets: string[] }}
 */
function parseArgs(argv) {
  const options = { dryRun: false };
  const targets = [];

  for (const arg of argv) {
    if (arg === "--dry-run" || arg === "--check") {
      options.dryRun = true;
    } else if (arg === "--help" || arg === "-h") {
      showUsage();
      process.exit(0);
    } else if (arg.startsWith("-")) {
      console.error(`Unknown option: ${arg}`);
      showUsage();
      process.exit(1);
    } else {
      targets.push(arg);
    }
  }

  return { options, targets };
}

/**
 * Prints the command-line help to stdout.
 */
function showUsage() {
  console.log(`Usage: node tools/link_taxonomy_terms.js [--dry-run] [path...]

Options
  --dry-run   Analyse files but do not rewrite anything
  --help      Show this message

Examples
  node tools/link_taxonomy_terms.js --dry-run
  node tools/link_taxonomy_terms.js content/interets/paleontologie`);
}

/**
 * Reads the taxonomies YAML file and builds lookup tables between frontmatter
 * field names and canonical taxonomy names.
 *
 * Each config entry maps a key to a value; the trimmed value is the canonical
 * name when it is a non-empty string, otherwise the key itself is used. Both
 * the key and the canonical name are accepted as frontmatter field names.
 *
 * @param {string} configPath Absolute path of the taxonomies YAML file.
 * @returns {{
 *   fieldToCanonical: Map<string, string>,
 *   canonicalToFields: Map<string, Set<string>>,
 *   canonicalNames: string[],
 * }}
 * @throws {Error} When the file cannot be read, is not valid YAML, or does not
 *   contain a mapping at its top level.
 */
function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    throw new Error(`Unable to read ${configPath}: ${error.message}`);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    throw new Error(`Invalid YAML in ${configPath}: ${error.message}`);
  }

  if (typeof data !== "object" || Array.isArray(data)) {
    throw new Error(`Unexpected taxonomies format in ${configPath}`);
  }

  const fieldToCanonical = new Map();
  const canonicalToFields = new Map();

  for (const [singular, plural] of Object.entries(data)) {
    const canonicalName =
      typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular;
    if (!canonicalName) continue;

    const candidateNames = new Set([singular, canonicalName].filter(Boolean));
    for (const name of candidateNames) {
      fieldToCanonical.set(name, canonicalName);
      if (!canonicalToFields.has(canonicalName)) {
        canonicalToFields.set(canonicalName, new Set());
      }
      canonicalToFields.get(canonicalName).add(name);
    }
  }

  return {
    fieldToCanonical,
    canonicalToFields,
    canonicalNames: Array.from(canonicalToFields.keys()),
  };
}

/**
 * Resolves the list of Markdown files to process.
 *
 * With no targets, the whole content tree is walked. Otherwise each target is
 * resolved against the project root: directories are walked recursively and
 * single files are kept only when they end in `.md` (case-insensitive).
 *
 * @param {string[]} targets Paths given on the command line.
 * @returns {string[]} Sorted, de-duplicated absolute file paths.
 */
function collectMarkdownFiles(targets) {
  const files = new Set();

  if (targets.length === 0) {
    walkContentTree(CONTENT_ROOT, files);
    return Array.from(files).sort();
  }

  for (const target of targets) {
    const absolute = path.resolve(PROJECT_ROOT, target);

    // A single guarded stat avoids the existsSync/statSync race where the
    // path disappears between the two calls and the script crashes.
    let stats;
    try {
      stats = fs.statSync(absolute);
    } catch {
      console.warn(`⚠️ Skipping missing path: ${target}`);
      continue;
    }

    if (stats.isDirectory()) {
      walkContentTree(absolute, files);
    } else if (stats.isFile() && absolute.toLowerCase().endsWith(".md")) {
      files.add(absolute);
    }
  }

  return Array.from(files).sort();
}

/**
 * Recursively adds every `.md` file found under `dir` to `fileSet`.
 * `.git` and `node_modules` directories are skipped; unreadable directories
 * produce a warning and are ignored.
 *
 * @param {string} dir Directory to scan.
 * @param {Set<string>} fileSet Accumulator receiving the file paths (mutated).
 */
function walkContentTree(dir, fileSet) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Cannot read ${dir}: ${error.message}`);
    return;
  }

  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (entry.name === ".git" || entry.name === "node_modules") continue;
      walkContentTree(fullPath, fileSet);
    } else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
      fileSet.add(fullPath);
    }
  }
}

/**
 * Reads a Markdown file and splits it into frontmatter and body.
 *
 * @param {string} filePath Absolute path of the article.
 * @returns {{ path: string, frontmatter: object, frontmatterRaw: string, body: string } | null}
 *   The parsed article, or `null` (after a warning) when the file cannot be
 *   read, has no frontmatter matching FRONTMATTER_PATTERN, or its frontmatter
 *   is not a YAML mapping.
 */
function parseArticle(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Unable to read ${filePath}: ${error.message}`);
    return null;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    console.warn(`⚠️ ${path.relative(PROJECT_ROOT, filePath)} is missing YAML frontmatter. Skipping.`);
    return null;
  }

  let data = {};
  try {
    data = yaml.load(match[1]) || {};
  } catch (error) {
    console.warn(`⚠️ Failed to parse frontmatter in ${filePath}: ${error.message}`);
    return null;
  }

  if (typeof data !== "object" || Array.isArray(data)) {
    console.warn(`⚠️ Unexpected frontmatter structure in ${filePath}. Skipping.`);
    return null;
  }

  return {
    path: filePath,
    frontmatter: data,
    frontmatterRaw: match[1],
    body: match[2] || "",
  };
}

/**
 * Collects every taxonomy term already used in the articles' frontmatter and
 * prepares a search pattern for each of them.
 *
 * Terms are de-duplicated per taxonomy on their normalized form (the first
 * spelling encountered wins) and ordered with `compareKeywords`.
 *
 * @param {Array<{ frontmatter: object }>} articles Parsed articles.
 * @param {ReturnType<typeof loadTaxonomyMapping>} taxonomyMapping
 * @returns {{
 *   catalog: Map<string, Array<{ value: string, pattern: string, regex: RegExp }>>,
 *   totalKeywords: number,
 * }}
 */
function buildKeywordCatalog(articles, taxonomyMapping) {
  const keywordMaps = new Map();
  for (const canonical of taxonomyMapping.canonicalNames) {
    keywordMaps.set(canonical, new Map());
  }

  for (const article of articles) {
    const frontmatter = article.frontmatter;
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = taxonomyMapping.fieldToCanonical.get(field);
      if (!canonical) continue;

      const strings = toStringArray(value);
      if (strings.length === 0) continue;

      const lookup = keywordMaps.get(canonical);
      for (const entry of strings) {
        const normalized = normalizeTerm(entry);
        if (!normalized || lookup.has(normalized)) continue;
        lookup.set(normalized, entry);
      }
    }
  }

  const catalog = new Map();
  let totalKeywords = 0;

  for (const [canonical, map] of keywordMaps.entries()) {
    if (map.size === 0) continue;

    const sortedValues = Array.from(map.values()).sort(compareKeywords);
    const entries = [];
    for (const value of sortedValues) {
      const pattern = buildKeywordPattern(value);
      if (!pattern) continue;
      // Compile once here instead of once per (article, keyword) pair.
      entries.push({ value, pattern, regex: new RegExp(pattern, "gu") });
    }
    if (entries.length === 0) continue;

    totalKeywords += entries.length;
    catalog.set(canonical, entries);
  }

  return { catalog, totalKeywords };
}

/**
 * Searches every article body for catalogued keywords and appends the ones
 * that are found, not already present and not explicitly ignored to the
 * matching frontmatter field.
 *
 * Keywords from all taxonomies are tried in `compareKeywords` order (longest
 * first). Each accepted match reserves its text range so that a later keyword
 * cannot match text overlapping it; an identical range may be reused (see
 * `overlapsExistingRange`). Matches inside Markdown link URLs are ignored.
 *
 * Unless `options.dryRun` is set, modified articles are written back to disk.
 * The `frontmatter` object of each modified article is mutated in both modes.
 *
 * @param {Array<object>} articles Articles returned by `parseArticle`.
 * @param {Map<string, Array<{ value: string, pattern: string, regex?: RegExp }>>} catalog
 * @param {ReturnType<typeof loadTaxonomyMapping>} taxonomyMapping
 * @param {{ dryRun: boolean }} options
 * @returns {Array<{ path: string, additions: Map<string, string[]> }>} One entry per modified article.
 */
function applyTaxonomies(articles, catalog, taxonomyMapping, options) {
  const changes = [];

  for (const article of articles) {
    const additions = new Map();
    let mutated = false;
    const occupiedRanges = [];
    const taxonomyStates = new Map();
    const keywordTasks = [];
    const ignoredKeywords = extractIgnoredKeywords(article.frontmatterRaw, taxonomyMapping);

    for (const [canonical, keywordEntries] of catalog.entries()) {
      if (keywordEntries.length === 0) continue;

      const fieldName = resolveFieldName(article.frontmatter, canonical, taxonomyMapping);
      const currentValues = toStringArray(article.frontmatter[fieldName]);
      const normalizedExisting = new Set(currentValues.map((value) => normalizeTerm(value)));
      const state = {
        canonical,
        fieldName,
        currentValues,
        normalizedExisting,
      };
      taxonomyStates.set(canonical, state);

      for (const entry of keywordEntries) {
        keywordTasks.push({
          canonical,
          value: entry.value,
          pattern: entry.pattern,
          regex: entry.regex,
          state,
        });
      }
    }

    keywordTasks.sort((a, b) => compareKeywords(a.value, b.value));

    // URL ranges are computed on the original body; they stay valid for the
    // searchable copy because normalizeTypographyForSearch swaps single
    // characters for single spaces and therefore preserves every offset.
    const urlRanges = collectMarkdownUrlRanges(article.body);
    const searchableBody = normalizeTypographyForSearch(article.body);

    for (const task of keywordTasks) {
      const { state, canonical, value, pattern } = task;
      // Reusing the shared global regex is safe: findAvailableMatchRange
      // resets lastIndex before searching. The fallback covers catalogs whose
      // entries only carry the pattern string.
      const regex = task.regex || new RegExp(pattern, "gu");
      const matchRange = findAvailableMatchRange(regex, searchableBody, occupiedRanges, urlRanges);
      if (!matchRange) {
        continue;
      }

      if (shouldSkipSingleWordMatch(value, article.body, matchRange)) {
        // The range is still reserved so later keywords cannot match text overlapping it.
        occupiedRanges.push(matchRange);
        continue;
      }

      occupiedRanges.push(matchRange);

      const normalized = normalizeTerm(value);
      if (state.normalizedExisting.has(normalized)) {
        continue;
      }
      if (isIgnoredKeyword(canonical, normalized, ignoredKeywords)) {
        continue;
      }

      state.currentValues.push(value);
      state.normalizedExisting.add(normalized);
      mutated = true;
      article.frontmatter[state.fieldName] = state.currentValues;

      if (!additions.has(canonical)) {
        additions.set(canonical, []);
      }
      additions.get(canonical).push(value);
    }

    if (mutated) {
      if (!options.dryRun) {
        writeArticle(article);
      }
      changes.push({ path: article.path, additions });
    }
  }

  return changes;
}

/**
 * Picks the frontmatter key under which a taxonomy is stored in an article.
 *
 * @param {object} frontmatter Parsed frontmatter of the article.
 * @param {string} canonicalName Canonical taxonomy name.
 * @param {ReturnType<typeof loadTaxonomyMapping>} taxonomyMapping
 * @returns {string} The first existing frontmatter key that is an accepted
 *   field name for the taxonomy, or `canonicalName` when none exists yet.
 */
function resolveFieldName(frontmatter, canonicalName, taxonomyMapping) {
  const candidateSet = taxonomyMapping.canonicalToFields.get(canonicalName);
  if (candidateSet) {
    for (const key of Object.keys(frontmatter)) {
      if (candidateSet.has(key)) {
        return key;
      }
    }
  }
  return canonicalName;
}

/**
 * Serialises the article's frontmatter and body back to its file.
 *
 * NOTE(review): the frontmatter is regenerated with `yaml.dump` from the
 * parsed object, which carries no comments. The `# - term` markers that
 * `extractIgnoredKeywords` reads therefore appear to be lost on rewrite, so a
 * later run could re-add terms the author opted out of. Confirm, and consider
 * patching the raw frontmatter text instead.
 *
 * @param {{ path: string, frontmatter: object, body: string }} article
 */
function writeArticle(article) {
  const yamlContent = yaml.dump(article.frontmatter, { lineWidth: 120, sortKeys: false });
  const finalBody = article.body || "";
  const next = `---\n${yamlContent}---\n${finalBody}`;
  fs.writeFileSync(article.path, next, "utf8");
}

/**
 * Converts a frontmatter value (scalar or array) to an array of non-empty strings.
 *
 * @param {unknown} value
 * @returns {string[]}
 */
function toStringArray(value) {
  if (Array.isArray(value)) {
    return value
      .map((entry) => transformToString(entry))
      .filter((entry) => entry.length > 0);
  }
  const single = transformToString(value);
  return single.length > 0 ? [single] : [];
}

/**
 * Converts a scalar to a string: strings are trimmed, numbers are
 * stringified, everything else (including null/undefined) becomes "".
 *
 * @param {unknown} value
 * @returns {string}
 */
function transformToString(value) {
  if (value === null || value === undefined) {
    return "";
  }
  if (typeof value === "string") {
    return value.trim();
  }
  if (typeof value === "number") {
    return String(value);
  }
  return "";
}

/**
 * Produces the comparison key of a term: trimmed, NFKC-normalized and
 * lower-cased with French locale rules.
 *
 * @param {unknown} value
 * @returns {string}
 */
function normalizeTerm(value) {
  return transformToString(value).normalize("NFKC").toLocaleLowerCase("fr");
}

/**
 * Sort comparator placing longer keywords first; keywords of equal length are
 * ordered with the French collator.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function compareKeywords(a, b) {
  const diff = b.length - a.length;
  if (diff !== 0) {
    return diff;
  }
  return collator.compare(a, b);
}

/**
 * Escapes regular-expression metacharacters so `value` matches literally.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Builds the regular-expression source used to find a keyword in article text.
 *
 * The first character goes through `buildFirstCharacterPattern`, the
 * remaining characters through `buildRemainingPattern` (literal match, with
 * runs of whitespace matching `\s+`).
 *
 * NOTE(review): the return expression was truncated in the reviewed source
 * right after the opening "(?" of a lookbehind. The guards below (no letter,
 * digit or underscore immediately before or after the match) are a
 * reconstruction — verify the exact character classes against version control.
 *
 * @param {unknown} value Keyword to search for.
 * @returns {string | null} Pattern source intended for the `u` flag, or
 *   `null` when the keyword is empty.
 */
function buildKeywordPattern(value) {
  const keyword = transformToString(value);
  if (!keyword) {
    return null;
  }

  // Array.from splits on code points, keeping astral characters intact.
  const characters = Array.from(keyword);
  if (characters.length === 0) {
    return null;
  }

  const firstChar = characters[0];
  const restChars = characters.slice(1);
  const firstPattern = buildFirstCharacterPattern(firstChar);
  const restPattern = buildRemainingPattern(restChars);

  return `(?<![\\p{L}\\p{N}_])${firstPattern}${restPattern}(?![\\p{L}\\p{N}_])`;
}

/**
 * Builds the pattern fragment for a keyword's first character, accepting its
 * case variants.
 *
 * NOTE(review): the beginning of this function (its signature and how the
 * variants are produced) was missing from the reviewed source. Deriving them
 * from the character plus its French lower/upper-case forms is a
 * reconstruction; only the code from `.filter(...)` onward is original.
 *
 * @param {string} char First character (one code point) of the keyword.
 * @returns {string} A literal, a character class, or a non-capturing
 *   alternation when some variant spans several code points.
 */
function buildFirstCharacterPattern(char) {
  const variants = new Set([
    char,
    char.toLocaleLowerCase("fr"),
    char.toLocaleUpperCase("fr"),
  ]);

  const entries = Array.from(variants)
    .filter((variant) => variant.length > 0)
    .map((variant) => ({
      raw: variant,
      escaped: escapeRegExp(variant),
      runeLength: Array.from(variant).length,
    }));

  if (entries.length === 1) {
    return entries[0].escaped;
  }

  // A character class only works when every variant is a single code point.
  if (entries.every((entry) => entry.runeLength === 1)) {
    return `[${entries.map((entry) => entry.escaped).join("")}]`;
  }

  return `(?:${entries.map((entry) => entry.escaped).join("|")})`;
}

/**
 * Builds the pattern fragment for the characters following the first one:
 * each run of whitespace becomes `\s+`, every other character is matched
 * literally.
 *
 * @param {string[]} characters Code points after the first character.
 * @returns {string}
 */
function buildRemainingPattern(characters) {
  if (characters.length === 0) {
    return "";
  }

  let pattern = "";
  let previousWasWhitespace = false;

  for (const char of characters) {
    if (/\s/u.test(char)) {
      if (!previousWasWhitespace) {
        pattern += "\\s+";
        previousWasWhitespace = true;
      }
      continue;
    }
    pattern += escapeRegExp(char);
    previousWasWhitespace = false;
  }

  return pattern;
}

/**
 * Finds the first match of `regex` in `text` that neither touches a URL range
 * nor partially overlaps an already occupied range.
 *
 * @param {RegExp} regex Global regex; its `lastIndex` is reset before searching.
 * @param {string} text Text to search.
 * @param {Array<[number, number]>} occupiedRanges Ranges claimed by earlier keywords.
 * @param {Array<[number, number]>} urlRanges Ranges covering Markdown link URLs.
 * @returns {[number, number] | null} `[start, end)` of the match, or `null`.
 */
function findAvailableMatchRange(regex, text, occupiedRanges, urlRanges) {
  regex.lastIndex = 0;
  let match;
  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (rangeOverlaps(urlRanges, start, end)) {
      continue;
    }
    if (!overlapsExistingRange(occupiedRanges, start, end)) {
      return [start, end];
    }
  }
  return null;
}

/**
 * Tells whether `[start, end)` partially overlaps one of `ranges`.
 * A range identical to an existing one is not reported as overlapping, so the
 * exact same span of text can be reused.
 *
 * @param {Array<[number, number]>} ranges
 * @param {number} start
 * @param {number} end
 * @returns {boolean}
 */
function overlapsExistingRange(ranges, start, end) {
  for (const [existingStart, existingEnd] of ranges) {
    if (start === existingStart && end === existingEnd) {
      continue;
    }
    if (start < existingEnd && end > existingStart) {
      return true;
    }
  }
  return false;
}

/**
 * Locates the URL part of every inline Markdown link `[text](url)`.
 *
 * @param {string} markdown
 * @returns {Array<[number, number]>} `[start, end)` offsets of each URL.
 */
function collectMarkdownUrlRanges(markdown) {
  const ranges = [];
  if (!markdown) {
    return ranges;
  }

  const linkPattern = /\[[^\]]*\]\(([^)]+)\)/g;
  let match;
  while ((match = linkPattern.exec(markdown)) !== null) {
    const relativeParen = match[0].indexOf("(");
    if (relativeParen === -1) {
      continue;
    }
    const urlStart = match.index + relativeParen + 1;
    const urlEnd = urlStart + (match[1] ? match[1].length : 0);
    ranges.push([urlStart, urlEnd]);
  }

  return ranges;
}

/**
 * Tells whether `[start, end)` intersects any of `ranges`.
 *
 * @param {Array<[number, number]>} ranges
 * @param {number} start
 * @param {number} end
 * @returns {boolean}
 */
function rangeOverlaps(ranges, start, end) {
  for (const [rangeStart, rangeEnd] of ranges) {
    if (start < rangeEnd && end > rangeStart) {
      return true;
    }
  }
  return false;
}

/**
 * Replaces `*` and `_` with spaces so emphasis markers do not get in the way
 * of keyword matching. The result has the same length as the input.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeTypographyForSearch(text) {
  if (!text) {
    return "";
  }
  return text.replace(/[*_]/g, " ");
}

/**
 * Tells whether a single-word keyword match should be discarded because the
 * text right after it is whitespace, one capital letter (A-Z or
 * U+00C0-U+017F) and a period.
 * NOTE(review): presumably this avoids tagging a word that is part of a name
 * followed by an initial — confirm the intent.
 *
 * @param {string} keyword Keyword that matched.
 * @param {string} body Article body the range refers to.
 * @param {[number, number]} range `[start, end)` of the match.
 * @returns {boolean} Always `false` for keywords containing whitespace.
 */
function shouldSkipSingleWordMatch(keyword, body, range) {
  if (!keyword || /\s/.test(keyword)) {
    return false;
  }
  const [, end] = range;
  const lookahead = body.slice(end);
  return /^\s+[A-Z\u00C0-\u017F]\./u.test(lookahead);
}

/**
 * Reads the raw frontmatter text and collects terms the author commented out
 * as list items (`# - term`) under a taxonomy field, so they are not added
 * again automatically.
 *
 * Only comments following a `field:` line with an empty remainder are
 * considered; a field with an inline value ends the current list context.
 * Surrounding single or double quotes are removed from the term.
 *
 * @param {string} rawFrontmatter Frontmatter text without the `---` fences.
 * @param {ReturnType<typeof loadTaxonomyMapping>} taxonomyMapping
 * @returns {Map<string, Set<string>>} Canonical taxonomy name → normalized ignored terms.
 */
function extractIgnoredKeywords(rawFrontmatter, taxonomyMapping) {
  const ignoreMap = new Map();
  if (!rawFrontmatter) {
    return ignoreMap;
  }

  const lines = rawFrontmatter.split(/\r?\n/);
  let currentField = null;

  for (const line of lines) {
    const trimmed = line.trim();
    if (trimmed.length === 0) {
      continue;
    }

    const fieldMatch = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    if (fieldMatch && !line.trimStart().startsWith("#")) {
      const fieldName = fieldMatch[1];
      const remainder = fieldMatch[2];
      if (remainder.trim().length === 0) {
        currentField = fieldName;
      } else {
        currentField = null;
      }
      continue;
    }

    if (!currentField) {
      continue;
    }

    const commentMatch = line.match(/^\s*#\s*-\s*(.+?)\s*$/);
    if (!commentMatch) {
      continue;
    }

    let value = commentMatch[1].trim();
    if (!value) {
      continue;
    }

    if (
      (value.startsWith('"') && value.endsWith('"')) ||
      (value.startsWith("'") && value.endsWith("'"))
    ) {
      value = value.slice(1, -1).trim();
    }
    if (!value) {
      continue;
    }

    const canonical = taxonomyMapping.fieldToCanonical.get(currentField);
    if (!canonical) {
      continue;
    }

    const normalized = normalizeTerm(value);
    if (!normalized) {
      continue;
    }

    if (!ignoreMap.has(canonical)) {
      ignoreMap.set(canonical, new Set());
    }
    ignoreMap.get(canonical).add(normalized);
  }

  return ignoreMap;
}

/**
 * Tells whether a normalized term is listed as ignored for a taxonomy.
 *
 * @param {string} canonical Canonical taxonomy name.
 * @param {string} normalizedValue Term as returned by `normalizeTerm`.
 * @param {Map<string, Set<string>>} ignoreMap Result of `extractIgnoredKeywords`.
 * @returns {boolean}
 */
function isIgnoredKeyword(canonical, normalizedValue, ignoreMap) {
  if (!canonical || !normalizedValue) {
    return false;
  }
  const values = ignoreMap.get(canonical);
  if (!values) {
    return false;
  }
  return values.has(normalizedValue);
}

// Report configuration/IO failures as a one-line message instead of a raw
// stack trace; the exit status stays 1, as for an uncaught exception.
try {
  main();
} catch (error) {
  console.error(`❌ ${error instanceof Error ? error.message : String(error)}`);
  process.exit(1);
}