
Preliminary automation of keyword attribution and linking

2025-11-16 15:48:42 +01:00
parent 0f5f27e40a
commit afc4f45ad7
3 changed files with 1077 additions and 0 deletions

tools/link_taxonomy_terms.js

@@ -0,0 +1,646 @@
#!/usr/bin/env node
/**
* Automatically attaches taxonomy terms to Hugo articles by scanning the body
* of each Markdown file for known keywords that already exist in frontmatter.
*
* Usage:
* node tools/link_taxonomy_terms.js [--dry-run] [paths...]
*
* Without arguments, every Markdown file under content/ is processed.
*/
const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");
const PROJECT_ROOT = path.resolve(__dirname, "..");
const CONTENT_ROOT = path.join(PROJECT_ROOT, "content");
const TAXONOMIES_PATH = path.join(PROJECT_ROOT, "config", "_default", "taxonomies.yaml");
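// Splits a file into its YAML frontmatter (capture 1) and Markdown body (capture 2).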
const FRONTMATTER_PATTERN = /^---\n([\s\S]+?)\n---\n?([\s\S]*)$/;
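// French-aware collator used as a deterministic tie-breaker when ordering keywords.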
const collator = new Intl.Collator("fr", { sensitivity: "base", usage: "sort" });
function main() {
const { options, targets } = parseArgs(process.argv.slice(2));
const taxonomyMapping = loadTaxonomyMapping(TAXONOMIES_PATH);
if (taxonomyMapping.canonicalNames.length === 0) {
console.error("❌ No taxonomies found in config/_default/taxonomies.yaml");
process.exit(1);
}
const files = collectMarkdownFiles(targets);
if (files.length === 0) {
console.log("No Markdown content found to analyse.");
return;
}
const articles = files
.map((filePath) => parseArticle(filePath))
.filter((article) => article !== null);
if (articles.length === 0) {
console.log("No articles with valid YAML frontmatter were found.");
return;
}
const { catalog, totalKeywords } = buildKeywordCatalog(articles, taxonomyMapping);
if (totalKeywords === 0) {
console.log("No taxonomy keywords available to propagate.");
return;
}
console.log(
`Catalogued ${totalKeywords} keyword${totalKeywords > 1 ? "s" : ""} across ${
catalog.size
} taxonom${catalog.size > 1 ? "ies" : "y"}.`
);
const modifications = applyTaxonomies(articles, catalog, taxonomyMapping, options);
if (modifications.length === 0) {
console.log("No taxonomy updates required.");
if (options.dryRun) {
console.log("Dry-run only: no files would be modified.");
}
return;
}
for (const change of modifications) {
const relPath = path.relative(PROJECT_ROOT, change.path);
console.log(`✏️ ${relPath}`);
for (const [taxonomy, values] of change.additions.entries()) {
console.log(` ${taxonomy}: ${values.join(", ")}`);
}
}
if (options.dryRun) {
console.log(`Dry-run complete. ${modifications.length} article(s) would be updated.`);
} else {
console.log(`Updated ${modifications.length} article(s).`);
console.log("Review the changes.");
// Exit code 2 tells callers that files were rewritten and deserve a review.
process.exit(2);
}
}
function parseArgs(argv) {
const options = { dryRun: false };
const targets = [];
for (const arg of argv) {
if (arg === "--dry-run" || arg === "--check") {
options.dryRun = true;
} else if (arg === "--help" || arg === "-h") {
showUsage();
process.exit(0);
} else if (arg.startsWith("-")) {
console.error(`Unknown option: ${arg}`);
showUsage();
process.exit(1);
} else {
targets.push(arg);
}
}
return { options, targets };
}
function showUsage() {
console.log(`Usage: node tools/link_taxonomy_terms.js [--dry-run] [path...]
Options
--dry-run Analyse files but do not rewrite anything (alias: --check)
--help Show this message
Examples
node tools/link_taxonomy_terms.js --dry-run
node tools/link_taxonomy_terms.js content/interets/paleontologie`);
}
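/**
* Reads Hugo's taxonomies.yaml, which maps singular taxonomy names to plural
* ones. Both spellings are accepted as frontmatter fields and resolved to the
* plural (canonical) name.
*/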
function loadTaxonomyMapping(configPath) {
let raw;
try {
raw = fs.readFileSync(configPath, "utf8");
} catch (error) {
throw new Error(`Unable to read ${configPath}: ${error.message}`);
}
let data;
try {
data = yaml.load(raw) || {};
} catch (error) {
throw new Error(`Invalid YAML in ${configPath}: ${error.message}`);
}
if (typeof data !== "object" || Array.isArray(data)) {
throw new Error(`Unexpected taxonomies format in ${configPath}`);
}
const fieldToCanonical = new Map();
const canonicalToFields = new Map();
for (const [singular, plural] of Object.entries(data)) {
const canonicalName = typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular;
if (!canonicalName) continue;
const candidateNames = new Set([singular, canonicalName].filter(Boolean));
for (const name of candidateNames) {
fieldToCanonical.set(name, canonicalName);
if (!canonicalToFields.has(canonicalName)) {
canonicalToFields.set(canonicalName, new Set());
}
canonicalToFields.get(canonicalName).add(name);
}
}
return {
fieldToCanonical,
canonicalToFields,
canonicalNames: Array.from(canonicalToFields.keys()),
};
}
function collectMarkdownFiles(targets) {
const files = new Set();
if (targets.length === 0) {
walkContentTree(CONTENT_ROOT, files);
return Array.from(files).sort();
}
for (const target of targets) {
const absolute = path.resolve(PROJECT_ROOT, target);
if (!fs.existsSync(absolute)) {
console.warn(`⚠️ Skipping missing path: ${target}`);
continue;
}
const stats = fs.statSync(absolute);
if (stats.isDirectory()) {
walkContentTree(absolute, files);
} else if (stats.isFile() && absolute.toLowerCase().endsWith(".md")) {
files.add(absolute);
}
}
return Array.from(files).sort();
}
function walkContentTree(dir, fileSet) {
let entries;
try {
entries = fs.readdirSync(dir, { withFileTypes: true });
} catch (error) {
console.warn(`⚠️ Cannot read ${dir}: ${error.message}`);
return;
}
for (const entry of entries) {
const fullPath = path.join(dir, entry.name);
if (entry.isDirectory()) {
if (entry.name === ".git" || entry.name === "node_modules") continue;
walkContentTree(fullPath, fileSet);
} else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
fileSet.add(fullPath);
}
}
}
function parseArticle(filePath) {
let raw;
try {
raw = fs.readFileSync(filePath, "utf8");
} catch (error) {
console.warn(`⚠️ Unable to read ${filePath}: ${error.message}`);
return null;
}
const match = raw.match(FRONTMATTER_PATTERN);
if (!match) {
console.warn(`⚠️ ${path.relative(PROJECT_ROOT, filePath)} is missing YAML frontmatter. Skipping.`);
return null;
}
let data = {};
try {
data = yaml.load(match[1]) || {};
} catch (error) {
console.warn(`⚠️ Failed to parse frontmatter in ${filePath}: ${error.message}`);
return null;
}
if (typeof data !== "object" || Array.isArray(data)) {
console.warn(`⚠️ Unexpected frontmatter structure in ${filePath}. Skipping.`);
return null;
}
return {
path: filePath,
frontmatter: data,
frontmatterRaw: match[1],
body: match[2] || "",
};
}
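/**
* Harvests every keyword already declared in some article's frontmatter,
* grouped by canonical taxonomy, and precompiles a match pattern for each.
* Keywords are ordered longest-first so multi-word terms are tried before
* any keyword they contain.
*/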
function buildKeywordCatalog(articles, taxonomyMapping) {
const keywordMaps = new Map();
for (const canonical of taxonomyMapping.canonicalNames) {
keywordMaps.set(canonical, new Map());
}
for (const article of articles) {
const frontmatter = article.frontmatter;
for (const [field, value] of Object.entries(frontmatter)) {
const canonical = taxonomyMapping.fieldToCanonical.get(field);
if (!canonical) continue;
const strings = toStringArray(value);
if (strings.length === 0) continue;
const lookup = keywordMaps.get(canonical);
for (const entry of strings) {
const normalized = normalizeTerm(entry);
if (!normalized || lookup.has(normalized)) continue;
lookup.set(normalized, entry);
}
}
}
const catalog = new Map();
let totalKeywords = 0;
for (const [canonical, map] of keywordMaps.entries()) {
if (map.size === 0) continue;
const sortedValues = Array.from(map.values()).sort(compareKeywords);
const entries = [];
for (const value of sortedValues) {
const pattern = buildKeywordPattern(value);
if (!pattern) continue;
entries.push({ value, pattern });
}
if (entries.length === 0) continue;
totalKeywords += entries.length;
catalog.set(canonical, entries);
}
return { catalog, totalKeywords };
}
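/**
* Scans each article body for catalogued keywords and appends every match to
* the corresponding frontmatter field, skipping matches inside link URLs,
* spans already claimed by a longer keyword, and explicitly ignored terms.
*/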
function applyTaxonomies(articles, catalog, taxonomyMapping, options) {
const changes = [];
for (const article of articles) {
const additions = new Map();
let mutated = false;
const occupiedRanges = [];
const taxonomyStates = new Map();
const keywordTasks = [];
const ignoredKeywords = extractIgnoredKeywords(article.frontmatterRaw, taxonomyMapping);
for (const [canonical, keywordEntries] of catalog.entries()) {
if (keywordEntries.length === 0) continue;
const fieldName = resolveFieldName(article.frontmatter, canonical, taxonomyMapping);
const currentValues = toStringArray(article.frontmatter[fieldName]);
const normalizedExisting = new Set(currentValues.map((value) => normalizeTerm(value)));
const state = {
canonical,
fieldName,
currentValues,
normalizedExisting,
};
taxonomyStates.set(canonical, state);
for (const entry of keywordEntries) {
keywordTasks.push({
canonical,
value: entry.value,
pattern: entry.pattern,
state,
});
}
}
keywordTasks.sort((a, b) => compareKeywords(a.value, b.value));
const urlRanges = collectMarkdownUrlRanges(article.body);
const searchableBody = normalizeTypographyForSearch(article.body);
for (const task of keywordTasks) {
const { state, canonical, value, pattern } = task;
const regex = new RegExp(pattern, "gu");
const matchRange = findAvailableMatchRange(regex, searchableBody, occupiedRanges, urlRanges);
if (!matchRange) {
continue;
}
if (shouldSkipSingleWordMatch(value, article.body, matchRange)) {
occupiedRanges.push(matchRange);
continue;
}
occupiedRanges.push(matchRange);
const normalized = normalizeTerm(value);
if (state.normalizedExisting.has(normalized)) {
continue;
}
if (isIgnoredKeyword(canonical, normalized, ignoredKeywords)) {
continue;
}
state.currentValues.push(value);
state.normalizedExisting.add(normalized);
mutated = true;
article.frontmatter[state.fieldName] = state.currentValues;
if (!additions.has(canonical)) {
additions.set(canonical, []);
}
additions.get(canonical).push(value);
}
if (mutated) {
if (!options.dryRun) {
writeArticle(article);
}
changes.push({ path: article.path, additions });
}
}
return changes;
}
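// Prefers whichever field name (singular or plural) the article already uses, falling back to the canonical name.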
function resolveFieldName(frontmatter, canonicalName, taxonomyMapping) {
const candidateSet = taxonomyMapping.canonicalToFields.get(canonicalName);
if (candidateSet) {
for (const key of Object.keys(frontmatter)) {
if (candidateSet.has(key)) {
return key;
}
}
}
return canonicalName;
}
function writeArticle(article) {
const yamlContent = yaml.dump(article.frontmatter, { lineWidth: 120, sortKeys: false });
const finalBody = article.body || "";
const next = `---\n${yamlContent}---\n${finalBody}`;
fs.writeFileSync(article.path, next, "utf8");
}
function toStringArray(value) {
if (Array.isArray(value)) {
return value
.map((entry) => transformToString(entry))
.filter((entry) => entry.length > 0);
}
const single = transformToString(value);
return single.length > 0 ? [single] : [];
}
function transformToString(value) {
if (value === null || value === undefined) {
return "";
}
if (typeof value === "string") {
return value.trim();
}
if (typeof value === "number") {
return String(value);
}
return "";
}
function normalizeTerm(value) {
return transformToString(value).normalize("NFKC").toLocaleLowerCase("fr");
}
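// Longest keywords first so longer phrases win over their own substrings; the collator breaks ties.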
function compareKeywords(a, b) {
const diff = b.length - a.length;
if (diff !== 0) {
return diff;
}
return collator.compare(a, b);
}
function escapeRegExp(value) {
return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
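/**
* Builds a whole-word pattern for the keyword. JavaScript's \b is ASCII-only,
* so Unicode word boundaries are emulated with letter/digit lookarounds.
*/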
function buildKeywordPattern(value) {
const keyword = transformToString(value);
if (!keyword) {
return null;
}
const characters = Array.from(keyword);
if (characters.length === 0) {
return null;
}
const firstChar = characters[0];
const restChars = characters.slice(1);
const firstPattern = buildFirstCharacterPattern(firstChar);
const restPattern = buildRemainingPattern(restChars);
return `(?<![\\p{L}\\p{N}_])${firstPattern}${restPattern}(?![\\p{L}\\p{N}_])`;
}
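// Accepts both cases of the leading letter so a keyword still matches at the start of a sentence.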
function buildFirstCharacterPattern(char) {
if (!/\p{L}/u.test(char)) {
return escapeRegExp(char);
}
const variants = new Set([char, char.toLocaleLowerCase("fr"), char.toLocaleUpperCase("fr")]);
const entries = Array.from(variants)
.filter((variant) => variant.length > 0)
.map((variant) => ({
raw: variant,
escaped: escapeRegExp(variant),
runeLength: Array.from(variant).length,
}));
if (entries.length === 1) {
return entries[0].escaped;
}
if (entries.every((entry) => entry.runeLength === 1)) {
return `[${entries.map((entry) => entry.escaped).join("")}]`;
}
return `(?:${entries.map((entry) => entry.escaped).join("|")})`;
}
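// Collapses any whitespace run inside the keyword into \s+ so phrases match across line wraps.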
function buildRemainingPattern(characters) {
if (characters.length === 0) {
return "";
}
let pattern = "";
let previousWasWhitespace = false;
for (const char of characters) {
if (/\s/u.test(char)) {
if (!previousWasWhitespace) {
pattern += "\\s+";
previousWasWhitespace = true;
}
continue;
}
pattern += escapeRegExp(char);
previousWasWhitespace = false;
}
return pattern;
}
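// Returns the first match that is neither inside a link URL nor overlapping a span already claimed by another keyword.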
function findAvailableMatchRange(regex, text, occupiedRanges, urlRanges) {
regex.lastIndex = 0;
let match;
while ((match = regex.exec(text)) !== null) {
const start = match.index;
const end = start + match[0].length;
if (rangeOverlaps(urlRanges, start, end)) {
continue;
}
if (!overlapsExistingRange(occupiedRanges, start, end)) {
return [start, end];
}
}
return null;
}
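// Exactly identical ranges are tolerated so two taxonomies sharing the same keyword text can both claim one occurrence.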
function overlapsExistingRange(ranges, start, end) {
for (const [existingStart, existingEnd] of ranges) {
if (start === existingStart && end === existingEnd) {
continue;
}
if (start < existingEnd && end > existingStart) {
return true;
}
}
return false;
}
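// Records the index ranges of inline-link URLs so keywords are never matched inside a link target.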
function collectMarkdownUrlRanges(markdown) {
const ranges = [];
if (!markdown) {
return ranges;
}
const linkPattern = /\[[^\]]*\]\(([^)]+)\)/g;
let match;
while ((match = linkPattern.exec(markdown)) !== null) {
// The link text cannot contain "]", so the first "](" always separates it from the URL;
// searching for "(" alone would break on link text that itself contains parentheses.
const separator = match[0].indexOf("](");
if (separator === -1) {
continue;
}
const urlStart = match.index + separator + 2;
const urlEnd = urlStart + match[1].length;
ranges.push([urlStart, urlEnd]);
}
return ranges;
}
function rangeOverlaps(ranges, start, end) {
for (const [rangeStart, rangeEnd] of ranges) {
if (start < rangeEnd && end > rangeStart) {
return true;
}
}
return false;
}
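// Replaces emphasis markers with spaces one for one, keeping every index aligned with the original body.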
function normalizeTypographyForSearch(text) {
if (!text) {
return "";
}
return text.replace(/[*_]/g, " ");
}
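// A single-word match directly followed by an abbreviated initial (e.g. " T.") is likely part of a longer proper name and is skipped.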
function shouldSkipSingleWordMatch(keyword, body, range) {
if (!keyword || /\s/.test(keyword)) {
return false;
}
const [, end] = range;
const lookahead = body.slice(end);
return /^\s+\p{Lu}\./u.test(lookahead);
}
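/**
* Collects the opt-out list: a commented list item kept under a taxonomy
* field in the raw frontmatter names a term the script must never re-add,
* e.g.
*
* tags:
*   # - requin
*/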
function extractIgnoredKeywords(rawFrontmatter, taxonomyMapping) {
const ignoreMap = new Map();
if (!rawFrontmatter) {
return ignoreMap;
}
const lines = rawFrontmatter.split(/\r?\n/);
let currentField = null;
for (const line of lines) {
const trimmed = line.trim();
if (trimmed.length === 0) {
continue;
}
const fieldMatch = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
if (fieldMatch && !line.trimStart().startsWith("#")) {
const fieldName = fieldMatch[1];
const remainder = fieldMatch[2];
if (remainder.trim().length === 0) {
currentField = fieldName;
} else {
currentField = null;
}
continue;
}
if (!currentField) {
continue;
}
const commentMatch = line.match(/^\s*#\s*-\s*(.+?)\s*$/);
if (!commentMatch) {
continue;
}
let value = commentMatch[1].trim();
if (!value) {
continue;
}
if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
value = value.slice(1, -1).trim();
}
if (!value) {
continue;
}
const canonical = taxonomyMapping.fieldToCanonical.get(currentField);
if (!canonical) {
continue;
}
const normalized = normalizeTerm(value);
if (!normalized) {
continue;
}
if (!ignoreMap.has(canonical)) {
ignoreMap.set(canonical, new Set());
}
ignoreMap.get(canonical).add(normalized);
}
return ignoreMap;
}
function isIgnoredKeyword(canonical, normalizedValue, ignoreMap) {
if (!canonical || !normalizedValue) {
return false;
}
const values = ignoreMap.get(canonical);
if (!values) {
return false;
}
return values.has(normalizedValue);
}
main();