Preliminary automation of keyword assignment and linking
tools/link_frontmatter_keywords.js (new file, 425 lines)
@@ -0,0 +1,425 @@
#!/usr/bin/env node

/**
 * Walks every Markdown article in the content/ folder and automatically
 * creates a link to the keyword's page for the first occurrence of each term
 * defined in the frontmatter taxonomies. Occurrences that are already linked
 * are ignored.
 *
 * Exits with a non-zero code when at least one file has been modified.
 */

const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");

const PROJECT_ROOT = path.resolve(__dirname, "..");
const CONTENT_ROOT = path.join(PROJECT_ROOT, "content");
const TAXONOMIES_FILE = path.join(PROJECT_ROOT, "config", "_default", "taxonomies.yaml");
const FRONTMATTER_PATTERN = /^---\n([\s\S]+?)\n---\n?([\s\S]*)$/;
const WORD_CHAR = /[\p{L}\p{N}]/u;
const INLINE_FORMATTING_CHARS = ["*", "_"];

main();

function main() {
  const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_FILE);
  const files = collectMarkdownFiles(CONTENT_ROOT);

  if (files.length === 0) {
    console.log("No Markdown articles found under content/.");
    return;
  }

  const changed = [];
  for (const filePath of files) {
    if (processFile(filePath, taxonomyMapping)) {
      changed.push(filePath);
    }
  }

  if (changed.length > 0) {
    for (const filePath of changed) {
      const rel = path.relative(PROJECT_ROOT, filePath);
      console.log(`✏️ ${rel}`);
    }
    console.log("Changes were made. Please review them.");
    process.exit(2);
  } else {
    console.log("All articles are already correctly linked.");
  }
}

function processFile(filePath, taxonomyMapping) {
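  // Reads one article, extracts its taxonomy keywords, links their first
  // occurrence in the body and rewrites the file. Returns true when the file
  // was modified.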
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Unable to read ${filePath}: ${error.message}`);
    return false;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    return false;
  }

  let frontmatter;
  try {
    frontmatter = yaml.load(match[1]) || {};
  } catch (error) {
    console.warn(`⚠️ Invalid frontmatter in ${filePath}: ${error.message}`);
    return false;
  }

  const keywords = extractKeywords(frontmatter, match[1], taxonomyMapping.fieldToCanonical);
  if (keywords.length === 0) {
    return false;
  }

  const { body, changed } = linkKeywordsInBody(match[2], keywords);
  if (!changed) {
    return false;
  }

  const prefixLength = raw.length - match[2].length;
  const updated = raw.slice(0, prefixLength) + body;
  fs.writeFileSync(filePath, updated, "utf8");
  return true;
}

function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    console.error(`Unable to read ${configPath}: ${error.message}`);
    process.exit(1);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    console.error(`Invalid YAML in ${configPath}: ${error.message}`);
    process.exit(1);
  }

  if (typeof data !== "object" || data === null) {
    console.error(`Unexpected format in ${configPath}`);
    process.exit(1);
  }

  const fieldToCanonical = new Map();
  for (const [singular, plural] of Object.entries(data)) {
    const canonicalName =
      typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular.trim();
    if (!canonicalName) continue;
    const candidates = new Set([singular, canonicalName].filter(Boolean));
    for (const name of candidates) {
      fieldToCanonical.set(name, canonicalName);
    }
  }

  if (fieldToCanonical.size === 0) {
    console.error("No taxonomies are defined in the configuration.");
    process.exit(1);
  }

  return { fieldToCanonical };
}

function collectMarkdownFiles(root) {
  const files = [];
  walk(root, files);
  return files.sort((a, b) => a.localeCompare(b));
}

function walk(dir, bucket) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Unable to traverse ${dir}: ${error.message}`);
    return;
  }

  for (const entry of entries) {
    if (entry.name === ".git" || entry.name === "node_modules") {
      continue;
    }
    const absolute = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      walk(absolute, bucket);
    } else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
      bucket.push(absolute);
    }
  }
}

function extractKeywords(frontmatter, frontmatterRaw, fieldToCanonical) {
  const keywords = [];
  const seen = new Set();

  function addKeyword(taxonomy, term) {
    if (!taxonomy || typeof term !== "string") return;
    const normalized = term.trim();
    if (!normalized) return;
    const key = `${taxonomy}::${normalized.toLowerCase()}`;
    if (seen.has(key)) return;
    const slug = slugify(normalized);
    if (!slug) return;
    seen.add(key);
    keywords.push({
      taxonomy,
      term: normalized,
      url: `/${taxonomy}/${slug}/`,
    });
  }

  if (typeof frontmatter === "object" && frontmatter !== null) {
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = fieldToCanonical.get(field);
      if (!canonical) continue;
      const terms = normalizeTerms(value);
      for (const term of terms) {
        addKeyword(canonical, term);
      }
    }
  }

  for (const entry of extractCommentedTerms(frontmatterRaw, fieldToCanonical)) {
    addKeyword(entry.taxonomy, entry.term);
  }

  return keywords;
}

function normalizeTerms(value) {
  if (Array.isArray(value)) {
    return value.map((item) => normalizeTerm(item)).filter(Boolean);
  }
  const single = normalizeTerm(value);
  return single ? [single] : [];
}

function normalizeTerm(value) {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}

function extractCommentedTerms(frontmatterRaw, fieldToCanonical) {
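  // Terms commented out as "# - term" under a taxonomy field are still picked
  // up here, so they get linked in the body even though they are not part of
  // the parsed frontmatter data.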
  if (typeof frontmatterRaw !== "string" || frontmatterRaw.length === 0) {
    return [];
  }

  const results = [];
  const lines = frontmatterRaw.split(/\r?\n/);
  let currentCanonical = null;
  let currentIndent = 0;

  for (const line of lines) {
    const indent = getIndentation(line);
    const fieldMatch = line.match(/^\s*([A-Za-z0-9_]+):\s*(?:#.*)?$/);
    if (fieldMatch) {
      const fieldName = fieldMatch[1];
      currentCanonical = fieldToCanonical.get(fieldName) || null;
      currentIndent = indent;
      continue;
    }

    if (!currentCanonical) continue;
    const commentMatch = line.match(/^\s*#\s*-\s+(.*)$/);
    if (!commentMatch) continue;
    if (indent <= currentIndent) continue;
    const term = commentMatch[1].trim();
    if (!term) continue;
    results.push({ taxonomy: currentCanonical, term });
  }

  return results;
}

function linkKeywordsInBody(body, keywords) {
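  // Wraps the first "free" occurrence of each keyword in a Markdown link.
  // Link ranges are recomputed after every insertion so that later keywords
  // cannot land inside a link that was just added.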
  if (typeof body !== "string" || body.length === 0 || keywords.length === 0) {
    return { body, changed: false };
  }

  let updated = body;
  let changed = false;
  let linkRanges = computeLinkRanges(updated);

  for (const keyword of keywords) {
    const occurrence = findKeywordOccurrence(updated, keyword.term, linkRanges);
    if (!occurrence) continue;
    const expanded = includeFormattingCharacters(updated, occurrence.start, occurrence.end);
    const before = updated.slice(0, expanded.start);
    const label = updated.slice(expanded.start, expanded.end);
    const after = updated.slice(expanded.end);
    updated = `${before}[${label}](${keyword.url})${after}`;
    changed = true;
    linkRanges = computeLinkRanges(updated);
  }

  return { body: updated, changed };
}

function findKeywordOccurrence(text, keyword, linkRanges) {
  if (!keyword) return null;
  const escaped = escapeRegExp(keyword);
  if (!escaped) return null;
  const regex = new RegExp(escaped, "giu");
  let match;

  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (isInsideExistingLink(start, end, linkRanges)) {
      continue;
    }
    if (!hasWordBoundaries(text, start, end)) {
      continue;
    }
    return { start, end, text: match[0] };
  }
  return null;
}

function computeLinkRanges(text) {
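  // Scans the Markdown for [label](destination) spans, including image links,
  // and records the label and destination ranges so that keyword matches
  // inside them can be skipped.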
  const ranges = [];
  if (typeof text !== "string" || text.length === 0) {
    return ranges;
  }

  for (let i = 0; i < text.length; i++) {
    let isImage = false;
    if (text[i] === "!" && text[i + 1] === "[") {
      isImage = true;
      i += 1;
    }
    if (text[i] !== "[") continue;

    const openBracket = i;
    const closeBracket = findMatchingPair(text, openBracket, "[", "]");
    if (closeBracket === -1) continue;

    let pointer = closeBracket + 1;
    while (pointer < text.length && /\s/.test(text[pointer])) pointer++;
    if (pointer >= text.length || text[pointer] !== "(") {
      i = closeBracket;
      continue;
    }

    const openParen = pointer;
    const closeParen = findMatchingPair(text, openParen, "(", ")");
    if (closeParen === -1) break;

    ranges.push({
      textStart: openBracket + 1,
      textEnd: closeBracket,
      destStart: openParen + 1,
      destEnd: closeParen,
      isImage,
    });
    i = closeParen;
  }

  return ranges;
}

function findMatchingPair(text, startIndex, openChar, closeChar) {
  let depth = 0;
  for (let i = startIndex; i < text.length; i++) {
    const ch = text[i];
    if (ch === "\\") {
      i++;
      continue;
    }
    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}

function isInsideExistingLink(start, end, ranges) {
  return ranges.some((range) => {
    const overlapsText = start < range.textEnd && end > range.textStart;
    const overlapsDest =
      typeof range.destStart === "number" &&
      typeof range.destEnd === "number" &&
      start < range.destEnd &&
      end > range.destStart;
    return overlapsText || overlapsDest;
  });
}

function hasWordBoundaries(text, start, end) {
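  // Unicode-aware boundary check: \b in JavaScript regexes does not handle
  // accented letters, so the characters just before and after the match are
  // inspected directly.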
  const before = start > 0 ? text[start - 1] : "";
  const after = end < text.length ? text[end] : "";
  const startChar = text[start];
  const endChar = text[end - 1];

  if (isWordChar(startChar) && isWordChar(before)) {
    return false;
  }
  if (isWordChar(endChar) && isWordChar(after)) {
    return false;
  }
  return true;
}

function isWordChar(ch) {
  return Boolean(ch && WORD_CHAR.test(ch));
}

function slugify(value) {
  return value
    .normalize("NFD")
    .replace(/\p{Diacritic}/gu, "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .replace(/-{2,}/g, "-");
}

function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

function includeFormattingCharacters(text, start, end) {
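  // Expands the matched range over surrounding emphasis markers (* or _) when
  // they appear in equal numbers on both sides, so the link wraps the whole
  // emphasized span instead of splitting it.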
  let newStart = start;
  let newEnd = end;

  for (const marker of INLINE_FORMATTING_CHARS) {
    let prefixCount = 0;
    while (newStart - prefixCount - 1 >= 0 && text[newStart - prefixCount - 1] === marker) {
      prefixCount++;
    }

    let suffixCount = 0;
    while (newEnd + suffixCount < text.length && text[newEnd + suffixCount] === marker) {
      suffixCount++;
    }

    const count = Math.min(prefixCount, suffixCount);
    if (count > 0) {
      newStart -= count;
      newEnd += count;
    }
  }

  return { start: newStart, end: newEnd };
}

function getIndentation(line) {
  if (typeof line !== "string" || line.length === 0) return 0;
  const match = line.match(/^\s*/);
  return match ? match[0].length : 0;
}
tools/link_taxonomy_terms.js (new file, 646 lines)
@@ -0,0 +1,646 @@
#!/usr/bin/env node

/**
 * Automatically attaches taxonomy terms to Hugo articles by scanning the body
 * of each Markdown file for known keywords that already exist in frontmatter.
 *
 * Usage:
 *   node tools/link_taxonomy_terms.js [--dry-run] [paths...]
 *
 * Without arguments every Markdown file under content/ is processed.
 */

const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");

const PROJECT_ROOT = path.resolve(__dirname, "..");
const CONTENT_ROOT = path.join(PROJECT_ROOT, "content");
const TAXONOMIES_PATH = path.join(PROJECT_ROOT, "config", "_default", "taxonomies.yaml");
const FRONTMATTER_PATTERN = /^---\n([\s\S]+?)\n---\n?([\s\S]*)$/;
const collator = new Intl.Collator("fr", { sensitivity: "base", usage: "sort" });

function main() {
  const { options, targets } = parseArgs(process.argv.slice(2));

  const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_PATH);
  if (taxonomyMapping.canonicalNames.length === 0) {
    console.error("❌ No taxonomies found in config/_default/taxonomies.yaml");
    process.exit(1);
  }

  const files = collectMarkdownFiles(targets);
  if (files.length === 0) {
    console.log("No Markdown content found to analyse.");
    return;
  }

  const articles = files
    .map((filePath) => parseArticle(filePath))
    .filter((article) => article !== null);

  if (articles.length === 0) {
    console.log("No articles with valid YAML frontmatter were found.");
    return;
  }

  const { catalog, totalKeywords } = buildKeywordCatalog(articles, taxonomyMapping);
  if (totalKeywords === 0) {
    console.log("No taxonomy keywords available to propagate.");
    return;
  }

  console.log(
    `Catalogued ${totalKeywords} keyword${totalKeywords > 1 ? "s" : ""} across ${
      catalog.size
    } taxonom${catalog.size > 1 ? "ies" : "y"}.`
  );

  const modifications = applyTaxonomies(articles, catalog, taxonomyMapping, options);
  if (modifications.length === 0) {
    console.log("No taxonomy updates required.");
    if (options.dryRun) {
      console.log("Dry-run only: no files would be modified.");
    }
    return;
  }

  for (const change of modifications) {
    const relPath = path.relative(PROJECT_ROOT, change.path);
    console.log(`✏️ ${relPath}`);
    for (const [taxonomy, values] of change.additions.entries()) {
      console.log(`  ${taxonomy}: ${values.join(", ")}`);
    }
  }

  if (options.dryRun) {
    console.log(`Dry-run complete. ${modifications.length} article(s) would be updated.`);
  } else {
    console.log(`Updated ${modifications.length} article(s).`);
    console.log("Please review the changes.");
    process.exit(2);
  }
}

function parseArgs(argv) {
  const options = { dryRun: false };
  const targets = [];

  for (const arg of argv) {
    if (arg === "--dry-run" || arg === "--check") {
      options.dryRun = true;
    } else if (arg === "--help" || arg === "-h") {
      showUsage();
      process.exit(0);
    } else if (arg.startsWith("-")) {
      console.error(`Unknown option: ${arg}`);
      showUsage();
      process.exit(1);
    } else {
      targets.push(arg);
    }
  }

  return { options, targets };
}

function showUsage() {
  console.log(`Usage: node tools/link_taxonomy_terms.js [--dry-run] [path...]

Options
  --dry-run   Analyse files but do not rewrite anything
  --help      Show this message

Examples
  node tools/link_taxonomy_terms.js --dry-run
  node tools/link_taxonomy_terms.js content/interets/paleontologie`);
}

function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    throw new Error(`Unable to read ${configPath}: ${error.message}`);
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    throw new Error(`Invalid YAML in ${configPath}: ${error.message}`);
  }

  if (typeof data !== "object" || Array.isArray(data)) {
    throw new Error(`Unexpected taxonomies format in ${configPath}`);
  }

  const fieldToCanonical = new Map();
  const canonicalToFields = new Map();

  for (const [singular, plural] of Object.entries(data)) {
    const canonicalName = typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular;
    if (!canonicalName) continue;
    const candidateNames = new Set([singular, canonicalName].filter(Boolean));
    for (const name of candidateNames) {
      fieldToCanonical.set(name, canonicalName);
      if (!canonicalToFields.has(canonicalName)) {
        canonicalToFields.set(canonicalName, new Set());
      }
      canonicalToFields.get(canonicalName).add(name);
    }
  }

  return {
    fieldToCanonical,
    canonicalToFields,
    canonicalNames: Array.from(canonicalToFields.keys()),
  };
}

function collectMarkdownFiles(targets) {
  const files = new Set();

  if (targets.length === 0) {
    walkContentTree(CONTENT_ROOT, files);
    return Array.from(files).sort();
  }

  for (const target of targets) {
    const absolute = path.resolve(PROJECT_ROOT, target);
    if (!fs.existsSync(absolute)) {
      console.warn(`⚠️ Skipping missing path: ${target}`);
      continue;
    }
    const stats = fs.statSync(absolute);
    if (stats.isDirectory()) {
      walkContentTree(absolute, files);
    } else if (stats.isFile() && absolute.toLowerCase().endsWith(".md")) {
      files.add(absolute);
    }
  }

  return Array.from(files).sort();
}

function walkContentTree(dir, fileSet) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (error) {
    console.warn(`⚠️ Cannot read ${dir}: ${error.message}`);
    return;
  }

  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (entry.name === ".git" || entry.name === "node_modules") continue;
      walkContentTree(fullPath, fileSet);
    } else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
      fileSet.add(fullPath);
    }
  }
}

function parseArticle(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    console.warn(`⚠️ Unable to read ${filePath}: ${error.message}`);
    return null;
  }

  const match = raw.match(FRONTMATTER_PATTERN);
  if (!match) {
    console.warn(`⚠️ ${path.relative(PROJECT_ROOT, filePath)} is missing YAML frontmatter. Skipping.`);
    return null;
  }

  let data = {};
  try {
    data = yaml.load(match[1]) || {};
  } catch (error) {
    console.warn(`⚠️ Failed to parse frontmatter in ${filePath}: ${error.message}`);
    return null;
  }

  if (typeof data !== "object" || Array.isArray(data)) {
    console.warn(`⚠️ Unexpected frontmatter structure in ${filePath}. Skipping.`);
    return null;
  }

  return {
    path: filePath,
    frontmatter: data,
    frontmatterRaw: match[1],
    body: match[2] || "",
  };
}

function buildKeywordCatalog(articles, taxonomyMapping) {
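  // Builds, per taxonomy, the list of unique keywords seen anywhere in the
  // site's frontmatter, each paired with a compiled search pattern.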
  const keywordMaps = new Map();

  for (const canonical of taxonomyMapping.canonicalNames) {
    keywordMaps.set(canonical, new Map());
  }

  for (const article of articles) {
    const frontmatter = article.frontmatter;
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = taxonomyMapping.fieldToCanonical.get(field);
      if (!canonical) continue;

      const strings = toStringArray(value);
      if (strings.length === 0) continue;

      const lookup = keywordMaps.get(canonical);
      for (const entry of strings) {
        const normalized = normalizeTerm(entry);
        if (!normalized || lookup.has(normalized)) continue;
        lookup.set(normalized, entry);
      }
    }
  }

  const catalog = new Map();
  let totalKeywords = 0;

  for (const [canonical, map] of keywordMaps.entries()) {
    if (map.size === 0) continue;
    const sortedValues = Array.from(map.values()).sort(compareKeywords);
    const entries = [];
    for (const value of sortedValues) {
      const pattern = buildKeywordPattern(value);
      if (!pattern) continue;
      entries.push({ value, pattern });
    }
    if (entries.length === 0) continue;
    totalKeywords += entries.length;
    catalog.set(canonical, entries);
  }

  return { catalog, totalKeywords };
}

function applyTaxonomies(articles, catalog, taxonomyMapping, options) {
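  // For each article, finds the first unclaimed occurrence of every catalogued
  // keyword in the body and, unless the term is already present or explicitly
  // ignored in the frontmatter, appends it to the matching taxonomy field.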
  const changes = [];

  for (const article of articles) {
    const additions = new Map();
    let mutated = false;
    const occupiedRanges = [];
    const taxonomyStates = new Map();
    const keywordTasks = [];
    const ignoredKeywords = extractIgnoredKeywords(article.frontmatterRaw, taxonomyMapping);

    for (const [canonical, keywordEntries] of catalog.entries()) {
      if (keywordEntries.length === 0) continue;
      const fieldName = resolveFieldName(article.frontmatter, canonical, taxonomyMapping);
      const currentValues = toStringArray(article.frontmatter[fieldName]);
      const normalizedExisting = new Set(currentValues.map((value) => normalizeTerm(value)));
      const state = {
        canonical,
        fieldName,
        currentValues,
        normalizedExisting,
      };
      taxonomyStates.set(canonical, state);

      for (const entry of keywordEntries) {
        keywordTasks.push({
          canonical,
          value: entry.value,
          pattern: entry.pattern,
          state,
        });
      }
    }

    keywordTasks.sort((a, b) => compareKeywords(a.value, b.value));

    const urlRanges = collectMarkdownUrlRanges(article.body);
    const searchableBody = normalizeTypographyForSearch(article.body);

    for (const task of keywordTasks) {
      const { state, canonical, value, pattern } = task;
      const regex = new RegExp(pattern, "gu");
      const matchRange = findAvailableMatchRange(regex, searchableBody, occupiedRanges, urlRanges);
      if (!matchRange) {
        continue;
      }

      if (shouldSkipSingleWordMatch(value, article.body, matchRange)) {
        occupiedRanges.push(matchRange);
        continue;
      }

      occupiedRanges.push(matchRange);

      const normalized = normalizeTerm(value);
      if (state.normalizedExisting.has(normalized)) {
        continue;
      }

      if (isIgnoredKeyword(canonical, normalized, ignoredKeywords)) {
        continue;
      }

      state.currentValues.push(value);
      state.normalizedExisting.add(normalized);
      mutated = true;
      article.frontmatter[state.fieldName] = state.currentValues;

      if (!additions.has(canonical)) {
        additions.set(canonical, []);
      }
      additions.get(canonical).push(value);
    }

    if (mutated) {
      if (!options.dryRun) {
        writeArticle(article);
      }
      changes.push({ path: article.path, additions });
    }
  }

  return changes;
}

function resolveFieldName(frontmatter, canonicalName, taxonomyMapping) {
  const candidateSet = taxonomyMapping.canonicalToFields.get(canonicalName);
  if (candidateSet) {
    for (const key of Object.keys(frontmatter)) {
      if (candidateSet.has(key)) {
        return key;
      }
    }
  }
  return canonicalName;
}

function writeArticle(article) {
  const yamlContent = yaml.dump(article.frontmatter, { lineWidth: 120, sortKeys: false });
  const finalBody = article.body || "";
  const next = `---\n${yamlContent}---\n${finalBody}`;
  fs.writeFileSync(article.path, next, "utf8");
}

function toStringArray(value) {
  if (Array.isArray(value)) {
    return value
      .map((entry) => transformToString(entry))
      .filter((entry) => entry.length > 0);
  }
  const single = transformToString(value);
  return single.length > 0 ? [single] : [];
}

function transformToString(value) {
  if (value === null || value === undefined) {
    return "";
  }
  if (typeof value === "string") {
    return value.trim();
  }
  if (typeof value === "number") {
    return String(value);
  }
  return "";
}

function normalizeTerm(value) {
  return transformToString(value).normalize("NFKC").toLocaleLowerCase("fr");
}

function compareKeywords(a, b) {
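  // Longer keywords sort first so multi-word terms are matched before their
  // substrings; ties are broken with a French-aware collation.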
  const diff = b.length - a.length;
  if (diff !== 0) {
    return diff;
  }
  return collator.compare(a, b);
}

function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

function buildKeywordPattern(value) {
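  // Compiles a keyword into a regex source string: Unicode lookarounds replace
  // \b (which ignores accented letters), the first letter matches either case,
  // and runs of whitespace inside the keyword match any whitespace.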
  const keyword = transformToString(value);
  if (!keyword) {
    return null;
  }

  const characters = Array.from(keyword);
  if (characters.length === 0) {
    return null;
  }

  const firstChar = characters[0];
  const restChars = characters.slice(1);

  const firstPattern = buildFirstCharacterPattern(firstChar);
  const restPattern = buildRemainingPattern(restChars);
  return `(?<![\\p{L}\\p{N}_])${firstPattern}${restPattern}(?![\\p{L}\\p{N}_])`;
}

function buildFirstCharacterPattern(char) {
  if (!/\p{L}/u.test(char)) {
    return escapeRegExp(char);
  }

  const variants = new Set([char, char.toLocaleLowerCase("fr"), char.toLocaleUpperCase("fr")]);
  const entries = Array.from(variants)
    .filter((variant) => variant.length > 0)
    .map((variant) => ({
      raw: variant,
      escaped: escapeRegExp(variant),
      runeLength: Array.from(variant).length,
    }));

  if (entries.length === 1) {
    return entries[0].escaped;
  }

  if (entries.every((entry) => entry.runeLength === 1)) {
    return `[${entries.map((entry) => entry.escaped).join("")}]`;
  }

  return `(?:${entries.map((entry) => entry.escaped).join("|")})`;
}

function buildRemainingPattern(characters) {
  if (characters.length === 0) {
    return "";
  }

  let pattern = "";
  let previousWasWhitespace = false;

  for (const char of characters) {
    if (/\s/u.test(char)) {
      if (!previousWasWhitespace) {
        pattern += "\\s+";
        previousWasWhitespace = true;
      }
      continue;
    }
    pattern += escapeRegExp(char);
    previousWasWhitespace = false;
  }

  return pattern;
}

function findAvailableMatchRange(regex, text, occupiedRanges, urlRanges) {
  regex.lastIndex = 0;
  let match;
  while ((match = regex.exec(text)) !== null) {
    const start = match.index;
    const end = start + match[0].length;
    if (rangeOverlaps(urlRanges, start, end)) {
      continue;
    }
    if (!overlapsExistingRange(occupiedRanges, start, end)) {
      return [start, end];
    }
  }
  return null;
}

function overlapsExistingRange(ranges, start, end) {
  for (const [existingStart, existingEnd] of ranges) {
    if (start === existingStart && end === existingEnd) {
      continue;
    }
    if (start < existingEnd && end > existingStart) {
      return true;
    }
  }
  return false;
}

function collectMarkdownUrlRanges(markdown) {
  const ranges = [];
  if (!markdown) {
    return ranges;
  }

  const linkPattern = /\[[^\]]*\]\(([^)]+)\)/g;
  let match;
  while ((match = linkPattern.exec(markdown)) !== null) {
    const relativeParen = match[0].indexOf("(");
    if (relativeParen === -1) {
      continue;
    }
    const urlStart = match.index + relativeParen + 1;
    const urlEnd = urlStart + (match[1] ? match[1].length : 0);
    ranges.push([urlStart, urlEnd]);
  }
  return ranges;
}

function rangeOverlaps(ranges, start, end) {
  for (const [rangeStart, rangeEnd] of ranges) {
    if (start < rangeEnd && end > rangeStart) {
      return true;
    }
  }
  return false;
}

function normalizeTypographyForSearch(text) {
  if (!text) {
    return "";
  }
  return text.replace(/[*_]/g, " ");
}

function shouldSkipSingleWordMatch(keyword, body, range) {
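  // Skips a single-word match that is directly followed by a capitalized
  // initial and a period, which suggests the match is part of a longer proper
  // name or abbreviation rather than a standalone mention of the keyword.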
  if (!keyword || /\s/.test(keyword)) {
    return false;
  }
  const [, end] = range;
  const lookahead = body.slice(end);
  return /^\s+[A-Z\u00C0-\u017F]\./u.test(lookahead);
}

function extractIgnoredKeywords(rawFrontmatter, taxonomyMapping) {
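  // Commented-out entries of the form "# - term" under a taxonomy field act as
  // an explicit ignore list: those terms are never re-added to the article.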
  const ignoreMap = new Map();
  if (!rawFrontmatter) {
    return ignoreMap;
  }

  const lines = rawFrontmatter.split(/\r?\n/);
  let currentField = null;

  for (const line of lines) {
    const trimmed = line.trim();
    if (trimmed.length === 0) {
      continue;
    }

    const fieldMatch = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    if (fieldMatch && !line.trimStart().startsWith("#")) {
      const fieldName = fieldMatch[1];
      const remainder = fieldMatch[2];
      if (remainder.trim().length === 0) {
        currentField = fieldName;
      } else {
        currentField = null;
      }
      continue;
    }

    if (!currentField) {
      continue;
    }

    const commentMatch = line.match(/^\s*#\s*-\s*(.+?)\s*$/);
    if (!commentMatch) {
      continue;
    }

    let value = commentMatch[1].trim();
    if (!value) {
      continue;
    }
    if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
      value = value.slice(1, -1).trim();
    }
    if (!value) {
      continue;
    }

    const canonical = taxonomyMapping.fieldToCanonical.get(currentField);
    if (!canonical) {
      continue;
    }

    const normalized = normalizeTerm(value);
    if (!normalized) {
      continue;
    }
    if (!ignoreMap.has(canonical)) {
      ignoreMap.set(canonical, new Set());
    }
    ignoreMap.get(canonical).add(normalized);
  }

  return ignoreMap;
}

function isIgnoredKeyword(canonical, normalizedValue, ignoreMap) {
  if (!canonical || !normalizedValue) {
    return false;
  }
  const values = ignoreMap.get(canonical);
  if (!values) {
    return false;
  }
  return values.has(normalizedValue);
}

main();