diff --git a/tools/check_internal_links.js b/tools/check_internal_links.js
index 7c5d7ad6..3b8508c4 100644
--- a/tools/check_internal_links.js
+++ b/tools/check_internal_links.js
@@ -2,15 +2,18 @@
 const fs = require("fs");
 const path = require("path");
+const yaml = require("js-yaml");
 const { sanitizeUrlCandidate } = require("./lib/markdown_links");
 
 const SITE_ROOT = path.resolve(__dirname, "..");
 const CONTENT_DIR = path.join(SITE_ROOT, "content");
+const TAXONOMIES_FILE = path.join(SITE_ROOT, "config", "_default", "taxonomies.yaml");
 const TARGET_EXTENSIONS = new Set([".md", ".markdown", ".mdx", ".yaml", ".yml"]);
 const MARKDOWN_EXTENSIONS = new Set([".md", ".markdown", ".mdx"]);
 const INTERNAL_LINK_REGEX = /\/[^\s"'`<>\\\[\]{}|]+/g;
 const VALID_PREFIX_REGEX = /[\s"'`([<{=:]/;
 const PATH_KEY_REGEX = /^\s*(?:"path"|'path'|path)\s*:/i;
+const FRONTMATTER_PATTERN = /^---\r?\n([\s\S]+?)\r?\n---\r?\n?/;
 
 function toPosix(value) {
   return value.split(path.sep).join("/");
@@ -58,6 +61,199 @@ function collectContentEntries(rootDir) {
   return { files, directories };
 }
 
+function collectTaxonomyKeywordPaths(files) {
+  const mapping = loadTaxonomyMapping(TAXONOMIES_FILE);
+  if (!mapping) {
+    return new Set();
+  }
+
+  const keywordPaths = new Set();
+
+  for (const filePath of files) {
+    if (!isMarkdownFile(filePath)) {
+      continue;
+    }
+
+    let raw;
+    try {
+      raw = fs.readFileSync(filePath, "utf8");
+    } catch (error) {
+      console.warn(
+        `Impossible de lire ${relativeToSite(filePath)} pour extraire les taxonomies (${error.message}).`,
+      );
+      continue;
+    }
+
+    const frontmatterMatch = raw.match(FRONTMATTER_PATTERN);
+    if (!frontmatterMatch) {
+      continue;
+    }
+
+    let frontmatter = {};
+    try {
+      frontmatter = yaml.load(frontmatterMatch[1]) || {};
+    } catch (error) {
+      console.warn(`Frontmatter invalide dans ${relativeToSite(filePath)} (${error.message}).`);
+      continue;
+    }
+
+    const keywords = extractTaxonomyKeywords(
+      frontmatter,
+      frontmatterMatch[1],
+      mapping.fieldToCanonical,
+    );
+    for (const keyword of keywords) {
+      const normalized = normalizeInternalLink(keyword.url);
+      if (normalized) {
+        keywordPaths.add(normalized);
+      }
+    }
+  }
+
+  return keywordPaths;
+}
+
+function loadTaxonomyMapping(configPath) {
+  let raw;
+  try {
+    raw = fs.readFileSync(configPath, "utf8");
+  } catch (error) {
+    console.warn(`Impossible de lire ${relativeToSite(configPath)} (${error.message}).`);
+    return null;
+  }
+
+  let data;
+  try {
+    data = yaml.load(raw) || {};
+  } catch (error) {
+    console.warn(`YAML invalide dans ${relativeToSite(configPath)} (${error.message}).`);
+    return null;
+  }
+
+  if (typeof data !== "object" || data === null) {
+    console.warn(`Format inattendu dans ${relativeToSite(configPath)}.`);
+    return null;
+  }
+
+  const fieldToCanonical = new Map();
+  for (const [singular, plural] of Object.entries(data)) {
+    const canonical =
+      typeof plural === "string" && plural.trim().length > 0 ? plural.trim() : singular.trim();
+    if (!canonical) continue;
+    const candidates = new Set([singular, canonical].filter(Boolean));
+    for (const candidate of candidates) {
+      fieldToCanonical.set(candidate, canonical);
+    }
+  }
+
+  if (fieldToCanonical.size === 0) {
+    console.warn("Aucune taxonomie valide n'a été trouvée.");
+    return null;
+  }
+
+  return { fieldToCanonical };
+}
+
+function extractTaxonomyKeywords(frontmatter, frontmatterRaw, fieldToCanonical) {
+  const keywords = [];
+  const seen = new Set();
+
+  function addKeyword(taxonomy, term) {
+    if (!taxonomy || typeof term !== "string") return;
+    const normalized = term.trim();
+    if (!normalized) return;
+    const slug = slugify(normalized);
+    if (!slug) return;
+    const key = `${taxonomy}::${normalized.toLowerCase()}`;
+    if (seen.has(key)) return;
+    seen.add(key);
+    keywords.push({
+      taxonomy,
+      term: normalized,
+      url: `/${taxonomy}/${slug}/`,
+    });
+  }
+
+  if (typeof frontmatter === "object" && frontmatter !== null) {
+    for (const [field, value] of Object.entries(frontmatter)) {
+      const canonical = fieldToCanonical.get(field);
+      if (!canonical) continue;
+      const terms = normalizeTerms(value);
+      for (const term of terms) {
+        addKeyword(canonical, term);
+      }
+    }
+  }
+
+  for (const entry of extractCommentedTerms(frontmatterRaw, fieldToCanonical)) {
+    addKeyword(entry.taxonomy, entry.term);
+  }
+
+  return keywords;
+}
+
+function normalizeTerms(value) {
+  if (Array.isArray(value)) {
+    return value.map((item) => normalizeTerm(item)).filter(Boolean);
+  }
+  const single = normalizeTerm(value);
+  return single ? [single] : [];
+}
+
+function normalizeTerm(value) {
+  if (typeof value !== "string") return null;
+  const trimmed = value.trim();
+  return trimmed.length > 0 ? trimmed : null;
+}
+
+function extractCommentedTerms(frontmatterRaw, fieldToCanonical) {
+  if (typeof frontmatterRaw !== "string" || frontmatterRaw.length === 0) {
+    return [];
+  }
+
+  const results = [];
+  const lines = frontmatterRaw.split(/\r?\n/);
+  let currentCanonical = null;
+  let currentIndent = 0;
+
+  for (const line of lines) {
+    const indent = getIndentation(line);
+    const fieldMatch = line.match(/^\s*([A-Za-z0-9_]+):\s*(?:#.*)?$/);
+    if (fieldMatch) {
+      const fieldName = fieldMatch[1];
+      currentCanonical = fieldToCanonical.get(fieldName) || null;
+      currentIndent = indent;
+      continue;
+    }
+
+    if (!currentCanonical) continue;
+    const commentMatch = line.match(/^\s*#\s*-\s+(.*)$/);
+    if (!commentMatch) continue;
+    if (indent < currentIndent) continue; // NOTE(review): was "<=", which dropped column-0 "# - term" under a column-0 field — confirm intent
+    const term = commentMatch[1].trim();
+    if (!term) continue;
+    results.push({ taxonomy: currentCanonical, term });
+  }
+
+  return results;
+}
+
+function getIndentation(line) {
+  if (typeof line !== "string" || line.length === 0) return 0;
+  const match = line.match(/^\s*/);
+  return match ? match[0].length : 0;
+}
+
+function slugify(value) {
+  return value
+    .normalize("NFD")
+    .replace(/\p{Diacritic}/gu, "")
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, "-")
+    .replace(/^-+|-+$/g, "")
+    .replace(/-{2,}/g, "-");
+}
+
 function sanitizeInternalLink(raw) {
   const candidate = sanitizeUrlCandidate(raw);
   if (!candidate) return null;
@@ -276,6 +472,10 @@ function main() {
   }
 
   const { files, directories } = collectContentEntries(CONTENT_DIR);
+  const taxonomyPaths = collectTaxonomyKeywordPaths(files);
+  for (const keywordPath of taxonomyPaths) {
+    directories.add(keywordPath);
+  }
   const missingLinks = new Map();
 
   for (const filePath of files) {