1
Files
2025/tools/check_internal_links.js

525 lines
14 KiB
JavaScript

#!/usr/bin/env node
const fs = require("fs");
const path = require("path");
const yaml = require("js-yaml");
const { sanitizeUrlCandidate } = require("./lib/markdown_links");
// Site root: the parent of the directory that contains this script.
const SITE_ROOT = path.resolve(__dirname, "..");
// Directory that is scanned for files and against which internal links are resolved.
const CONTENT_DIR = path.join(SITE_ROOT, "content");
// YAML file read by loadTaxonomyMapping to build the taxonomy field mapping.
const TAXONOMIES_FILE = path.join(SITE_ROOT, "config", "_default", "taxonomies.yaml");
// Lower-cased extensions of the files collected for link checking.
const TARGET_EXTENSIONS = new Set([".md", ".markdown", ".mdx", ".yaml", ".yml"]);
// Subset of the target extensions that is processed with the Markdown-specific rules.
const MARKDOWN_EXTENSIONS = new Set([".md", ".markdown", ".mdx"]);
// Candidate link in non-Markdown files: "/" followed by a run of characters that are not
// whitespace, quotes, backticks, angle brackets, backslashes, square brackets, braces or pipes.
const INTERNAL_LINK_REGEX = /\/[^\s"'`<>\\\[\]{}|]+/g;
// Characters accepted immediately before a candidate link; other predecessors reject the candidate.
const VALID_PREFIX_REGEX = /[\s"'`([<{=:]/;
// Matches a line whose key is `path` (bare or quoted, case-insensitive).
const PATH_KEY_REGEX = /^\s*(?:"path"|'path'|path)\s*:/i;
// Matches a leading `---` ... `---` block and captures its body in group 1.
const FRONTMATTER_PATTERN = /^---\r?\n([\s\S]+?)\r?\n---\r?\n?/;
/**
 * Rewrites a platform-specific path so that it uses "/" as separator.
 *
 * @param {string} value - Path written with the current platform separator.
 * @returns {string} The path with every `path.sep` replaced by "/".
 */
function toPosix(value) {
  const segments = value.split(path.sep);
  return segments.join("/");
}
/**
 * Expresses a file path relative to the site root, using "/" separators.
 *
 * @param {string} filePath - Path to convert.
 * @returns {string} `filePath` relative to SITE_ROOT, in POSIX form.
 */
function relativeToSite(filePath) {
  const relativePath = path.relative(SITE_ROOT, filePath);
  return toPosix(relativePath);
}
/**
 * Tells whether a file has one of the extensions that are scanned for links.
 *
 * @param {string} filePath - Path whose extension is inspected (case-insensitively).
 * @returns {boolean} True when the extension belongs to TARGET_EXTENSIONS.
 */
function isTargetFile(filePath) {
  return TARGET_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
/**
 * Tells whether a file has a Markdown extension.
 *
 * @param {string} filePath - Path whose extension is inspected (case-insensitively).
 * @returns {boolean} True when the extension belongs to MARKDOWN_EXTENSIONS.
 */
function isMarkdownFile(filePath) {
  return MARKDOWN_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
/**
 * Tells whether a file has a YAML extension (".yaml" or ".yml", any case).
 *
 * @param {string} filePath - Path whose extension is inspected.
 * @returns {boolean} True for YAML extensions, false otherwise.
 */
function isYamlFile(filePath) {
  const extension = path.extname(filePath).toLowerCase();
  return [".yaml", ".yml"].includes(extension);
}
/**
 * Recursively walks a directory tree and gathers the files to check together
 * with the set of directories that exist under the root.
 *
 * Directories are recorded as "/"-prefixed POSIX paths relative to `rootDir`;
 * the root itself is always present as "/".
 *
 * @param {string} rootDir - Directory to walk.
 * @returns {{ files: string[], directories: Set<string> }} Paths of the files
 *   accepted by isTargetFile (built by joining onto `rootDir`) and the
 *   relative paths of every directory encountered.
 */
function collectContentEntries(rootDir) {
  const files = [];
  const directories = new Set(["/"]);

  const visit = (dir) => {
    for (const dirent of fs.readdirSync(dir, { withFileTypes: true })) {
      const absolute = path.join(dir, dirent.name);

      if (dirent.isDirectory()) {
        const rel = path.relative(rootDir, absolute);
        directories.add(rel ? `/${toPosix(rel)}` : "/");
        // Descend immediately so that files keep a depth-first order.
        visit(absolute);
        continue;
      }

      if (dirent.isFile() && isTargetFile(absolute)) {
        files.push(absolute);
      }
    }
  };

  visit(rootDir);
  return { files, directories };
}
/**
 * Builds the set of taxonomy term paths (for example "/tags/foo") declared in
 * the front matter of the given Markdown files.
 *
 * Files that cannot be read, have no front matter, or whose front matter is
 * not valid YAML are skipped (a warning is printed for read and parse errors).
 *
 * @param {Iterable<string>} files - Candidate file paths; non-Markdown files are ignored.
 * @returns {Set<string>} Normalized internal paths of all taxonomy terms found;
 *   empty when the taxonomy mapping cannot be loaded.
 */
function collectTaxonomyKeywordPaths(files) {
  const keywordPaths = new Set();

  const mapping = loadTaxonomyMapping(TAXONOMIES_FILE);
  if (!mapping) {
    return keywordPaths;
  }

  for (const filePath of files) {
    if (!isMarkdownFile(filePath)) {
      continue;
    }

    let source;
    try {
      source = fs.readFileSync(filePath, "utf8");
    } catch (error) {
      console.warn(
        `Impossible de lire ${relativeToSite(filePath)} pour extraire les taxonomies (${error.message}).`,
      );
      continue;
    }

    const match = FRONTMATTER_PATTERN.exec(source);
    if (match === null) {
      continue;
    }
    const frontmatterRaw = match[1];

    let frontmatter;
    try {
      frontmatter = yaml.load(frontmatterRaw) || {};
    } catch (error) {
      console.warn(`Frontmatter invalide dans ${relativeToSite(filePath)} (${error.message}).`);
      continue;
    }

    const keywords = extractTaxonomyKeywords(frontmatter, frontmatterRaw, mapping.fieldToCanonical);
    for (const { url } of keywords) {
      const termPath = normalizeInternalLink(url);
      if (termPath) {
        keywordPaths.add(termPath);
      }
    }
  }

  return keywordPaths;
}
/**
 * Reads the taxonomy configuration and builds a lookup from front matter field
 * names to the canonical taxonomy name.
 *
 * Each top-level entry maps a key to a value; the trimmed value is the
 * canonical name when it is a non-empty string, otherwise the trimmed key is
 * used. Both the key and the canonical name resolve to the canonical name.
 *
 * @param {string} configPath - Path of the YAML configuration file.
 * @returns {{ fieldToCanonical: Map<string, string> } | null} The mapping, or
 *   null (after printing a warning) when the file is unreadable, invalid, not
 *   an object, or yields no usable entry.
 */
function loadTaxonomyMapping(configPath) {
  let raw;
  try {
    raw = fs.readFileSync(configPath, "utf8");
  } catch (error) {
    console.warn(`Impossible de lire ${relativeToSite(configPath)} (${error.message}).`);
    return null;
  }

  let data;
  try {
    data = yaml.load(raw) || {};
  } catch (error) {
    console.warn(`YAML invalide dans ${relativeToSite(configPath)} (${error.message}).`);
    return null;
  }

  if (data === null || typeof data !== "object") {
    console.warn(`Format inattendu dans ${relativeToSite(configPath)}.`);
    return null;
  }

  const fieldToCanonical = new Map();
  for (const [singular, plural] of Object.entries(data)) {
    const pluralName = typeof plural === "string" ? plural.trim() : "";
    const canonical = pluralName.length > 0 ? pluralName : singular.trim();
    if (!canonical) {
      continue;
    }
    // Register the key first (when non-empty), then the canonical name itself.
    if (singular) {
      fieldToCanonical.set(singular, canonical);
    }
    fieldToCanonical.set(canonical, canonical);
  }

  if (fieldToCanonical.size === 0) {
    console.warn("Aucune taxonomie valide n'a été trouvée.");
    return null;
  }

  return { fieldToCanonical };
}
/**
 * Lists the taxonomy terms declared in a front matter block, both as regular
 * values and as commented-out list items.
 *
 * Terms are de-duplicated per taxonomy, case-insensitively, and terms whose
 * slug is empty are dropped.
 *
 * @param {object} frontmatter - Parsed front matter.
 * @param {string} frontmatterRaw - Raw front matter text, scanned for commented terms.
 * @param {Map<string, string>} fieldToCanonical - Field name to canonical taxonomy name.
 * @returns {Array<{ taxonomy: string, term: string, url: string }>} Terms in
 *   discovery order, with `url` of the form `/<taxonomy>/<slug>/`.
 */
function extractTaxonomyKeywords(frontmatter, frontmatterRaw, fieldToCanonical) {
  const collected = [];
  const seenKeys = new Set();

  const register = (taxonomy, term) => {
    if (!taxonomy || typeof term !== "string") {
      return;
    }
    const cleaned = term.trim();
    if (!cleaned) {
      return;
    }
    const slug = slugify(cleaned);
    if (!slug) {
      return;
    }
    const dedupeKey = `${taxonomy}::${cleaned.toLowerCase()}`;
    if (seenKeys.has(dedupeKey)) {
      return;
    }
    seenKeys.add(dedupeKey);
    collected.push({ taxonomy, term: cleaned, url: `/${taxonomy}/${slug}/` });
  };

  // Terms declared as actual front matter values.
  if (frontmatter !== null && typeof frontmatter === "object") {
    for (const [field, value] of Object.entries(frontmatter)) {
      const canonical = fieldToCanonical.get(field);
      if (!canonical) {
        continue;
      }
      for (const term of normalizeTerms(value)) {
        register(canonical, term);
      }
    }
  }

  // Terms that only appear as commented-out list items in the raw text.
  for (const { taxonomy, term } of extractCommentedTerms(frontmatterRaw, fieldToCanonical)) {
    register(taxonomy, term);
  }

  return collected;
}
/**
 * Turns a front matter value (a single value or an array of values) into a
 * list of usable terms.
 *
 * @param {*} value - Raw front matter value.
 * @returns {string[]} Trimmed, non-empty string terms; anything else is dropped.
 */
function normalizeTerms(value) {
  const candidates = Array.isArray(value) ? value : [value];
  return candidates.map((item) => normalizeTerm(item)).filter(Boolean);
}
/**
 * Trims a single term.
 *
 * @param {*} value - Candidate term.
 * @returns {string | null} The trimmed string, or null when `value` is not a
 *   string or is blank.
 */
function normalizeTerm(value) {
  const text = typeof value === "string" ? value.trim() : "";
  return text === "" ? null : text;
}
/**
 * Finds list items that are commented out (`# - term`) underneath a taxonomy
 * field in raw front matter text.
 *
 * A line consisting only of `name:` (optionally followed by a comment) selects
 * the current field; commented items are attributed to it when it maps to a
 * taxonomy and the item is indented deeper than the field line.
 *
 * @param {string} frontmatterRaw - Raw front matter text.
 * @param {Map<string, string>} fieldToCanonical - Field name to canonical taxonomy name.
 * @returns {Array<{ taxonomy: string, term: string }>} Commented terms in file
 *   order; empty when `frontmatterRaw` is not a non-empty string.
 */
function extractCommentedTerms(frontmatterRaw, fieldToCanonical) {
  if (typeof frontmatterRaw !== "string" || frontmatterRaw.length === 0) {
    return [];
  }

  const fieldLine = /^\s*([A-Za-z0-9_]+):\s*(?:#.*)?$/;
  const commentedItem = /^\s*#\s*-\s+(.*)$/;

  const found = [];
  let activeTaxonomy = null;
  let fieldIndent = 0;

  for (const line of frontmatterRaw.split(/\r?\n/)) {
    const indent = getIndentation(line);

    const field = fieldLine.exec(line);
    if (field) {
      // A new value-less field starts: switch (or reset) the active taxonomy.
      activeTaxonomy = fieldToCanonical.get(field[1]) || null;
      fieldIndent = indent;
      continue;
    }

    if (!activeTaxonomy) {
      continue;
    }

    const commented = commentedItem.exec(line);
    if (!commented || indent <= fieldIndent) {
      continue;
    }

    const term = commented[1].trim();
    if (term) {
      found.push({ taxonomy: activeTaxonomy, term });
    }
  }

  return found;
}
/**
 * Counts the leading whitespace characters of a line.
 *
 * @param {*} line - Line to measure.
 * @returns {number} Number of leading whitespace characters; 0 when `line` is
 *   not a string or is empty.
 */
function getIndentation(line) {
  if (typeof line !== "string") {
    return 0;
  }
  // Index of the first non-whitespace character, or -1 for an all-whitespace line.
  const firstVisible = line.search(/\S/);
  return firstVisible === -1 ? line.length : firstVisible;
}
/**
 * Converts a term into a lowercase ASCII slug.
 *
 * Diacritics are removed, every run of characters outside `a-z0-9` becomes a
 * single hyphen, and leading/trailing hyphens are stripped.
 *
 * @param {string} value - Text to slugify.
 * @returns {string} The slug, possibly empty.
 */
function slugify(value) {
  const withoutAccents = value.normalize("NFD").replace(/\p{Diacritic}/gu, "");
  const hyphenated = withoutAccents.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const trimmed = hyphenated.replace(/^-+|-+$/g, "");
  return trimmed.replace(/-{2,}/g, "-");
}
/**
 * Cleans a raw link candidate and keeps it only when it is a root-relative
 * path.
 *
 * @param {string} raw - Raw text, passed through sanitizeUrlCandidate first.
 * @returns {string | null} The sanitized candidate, or null when it is empty,
 *   does not start with a single "/", or contains "://".
 */
function sanitizeInternalLink(raw) {
  const candidate = sanitizeUrlCandidate(raw);
  if (!candidate) {
    return null;
  }

  const isRootRelative = candidate.startsWith("/") && !candidate.startsWith("//");
  if (!isRootRelative || candidate.includes("://")) {
    return null;
  }

  return candidate;
}
/**
 * Reduces an internal link to a canonical path: the query string and fragment
 * are removed, repeated slashes are collapsed, and trailing slashes are
 * dropped (the root stays "/").
 *
 * @param {*} link - Link to normalize.
 * @returns {string | null} The canonical path, or null when `link` is not a
 *   string starting with "/".
 */
function normalizeInternalLink(link) {
  if (typeof link !== "string" || !link.startsWith("/")) {
    return null;
  }

  const [withoutQuery] = link.split("?");
  const [pathOnly] = withoutQuery.split("#");
  const collapsed = pathOnly.replace(/\/+/g, "/").replace(/\/+$/, "");

  return collapsed || "/";
}
/**
 * Maps a normalized internal link to the directory expected under the content
 * folder.
 *
 * @param {string} link - Normalized link starting with "/".
 * @returns {string} CONTENT_DIR for "/", otherwise CONTENT_DIR joined with the
 *   non-empty segments of the link.
 */
function expectedDirForLink(link) {
  if (link === "/") {
    return CONTENT_DIR;
  }

  const segments = link
    .slice(1)
    .split("/")
    .filter((segment) => segment.length > 0);
  return path.join(CONTENT_DIR, ...segments);
}
/**
 * Measures the run of a given character that starts at a given position.
 *
 * @param {string} text - Text to inspect.
 * @param {number} startIndex - Position where the run is expected to start.
 * @param {string} char - Character to count.
 * @returns {number} Number of consecutive occurrences of `char` from
 *   `startIndex` (0 when the character at `startIndex` differs).
 */
function countRepeatedChar(text, startIndex, char) {
  let end = startIndex;
  while (text[end] === char) {
    end += 1;
  }
  return end - startIndex;
}
function findMatchingPair(text, startIndex, openChar, closeChar) {
let depth = 0;
for (let i = startIndex; i < text.length; i++) {
const ch = text[i];
if (ch === "\\") {
i++;
continue;
}
if (ch === openChar) {
depth++;
} else if (ch === closeChar) {
depth--;
if (depth === 0) {
return i;
}
}
}
return -1;
}
/**
 * Extracts the raw destinations of inline Markdown links (`[text](destination)`)
 * found on a single line.
 *
 * Text inside inline code spans (delimited by backtick runs of equal length)
 * is ignored. Whitespace between the closing "]" and the opening "(" is
 * tolerated. An unbalanced "[" or "(" is treated as literal text: scanning
 * continues after it, so links that appear later on the line are still found.
 *
 * @param {string} line - One line of Markdown.
 * @returns {Array<{ destination: string }>} One entry per link, where
 *   `destination` is the untouched text between the parentheses.
 */
function extractMarkdownLinksFromLine(line) {
  const results = [];
  // Length of the backtick run that opened the current inline code span, or null.
  let inlineFence = null;

  for (let i = 0; i < line.length; i++) {
    const ch = line[i];

    if (ch === "`") {
      const runLength = countRepeatedChar(line, i, "`");
      if (!inlineFence) {
        inlineFence = runLength;
      } else if (inlineFence === runLength) {
        inlineFence = null;
      }
      // Jump to the last backtick of the run; the loop increment moves past it.
      i += runLength - 1;
      continue;
    }

    if (inlineFence || ch !== "[") {
      continue;
    }

    const closeBracket = findMatchingPair(line, i, "[", "]");
    if (closeBracket === -1) {
      // Unbalanced "[": it is plain text, so move on to the next character
      // instead of abandoning the rest of the line.
      continue;
    }

    let pointer = closeBracket + 1;
    while (pointer < line.length && /\s/.test(line[pointer])) {
      pointer++;
    }

    if (pointer >= line.length || line[pointer] !== "(") {
      // Bracketed text without a destination: not an inline link.
      i = closeBracket;
      continue;
    }

    const closeParen = findMatchingPair(line, pointer, "(", ")");
    if (closeParen === -1) {
      // Unbalanced "(": no destination here, but later links may still be valid.
      i = closeBracket;
      continue;
    }

    results.push({ destination: line.slice(pointer + 1, closeParen) });
    i = closeParen;
  }

  return results;
}
/**
 * Reads a file and lists the internal links it contains.
 *
 * Markdown files: a front matter block opened by `---` on the first line is
 * skipped, as are fenced code blocks; destinations of inline links are taken
 * from the remaining lines. Other files: every line is scanned with
 * INTERNAL_LINK_REGEX, except lines matching PATH_KEY_REGEX in YAML files.
 *
 * Errors raised by fs.readFileSync are not caught here.
 *
 * @param {string} filePath - File to scan.
 * @returns {Array<{ link: string, line: number }>} Normalized links with their
 *   1-based line numbers, in file order.
 */
function extractInternalLinks(filePath) {
const content = fs.readFileSync(filePath, "utf8");
const lines = content.split(/\r?\n/);
const entries = [];
const skipPathKey = isYamlFile(filePath);
const treatAsMarkdown = isMarkdownFile(filePath);
// First character of the fence that opened the current code block, or null.
let fenceDelimiter = null;
let inFrontMatter = false;
for (let index = 0; index < lines.length; index++) {
const line = lines[index];
const trimmed = line.trim();
if (treatAsMarkdown) {
// Front matter only starts when the very first line is `---`.
if (index === 0 && trimmed === "---") {
inFrontMatter = true;
continue;
}
if (inFrontMatter) {
// The closing `---` line ends the front matter and is itself skipped.
if (trimmed === "---") {
inFrontMatter = false;
}
continue;
}
const fenceMatch = trimmed.match(/^(```+|~~~+)/);
if (fenceMatch) {
const delimiterChar = fenceMatch[1][0];
// A fence closes on a later fence line using the same character; fence
// lengths are not compared. Fence lines themselves are never scanned.
if (!fenceDelimiter) {
fenceDelimiter = delimiterChar;
} else if (delimiterChar === fenceDelimiter) {
fenceDelimiter = null;
}
continue;
}
if (fenceDelimiter) {
continue;
}
const markdownLinks = extractMarkdownLinksFromLine(line);
for (const { destination } of markdownLinks) {
const sanitized = sanitizeInternalLink(destination);
if (!sanitized) continue;
const normalized = normalizeInternalLink(sanitized);
if (!normalized || normalized === "//") continue;
entries.push({ link: normalized, line: index + 1 });
}
// Markdown lines never go through the regex-based scan below.
continue;
}
if (skipPathKey && PATH_KEY_REGEX.test(line)) {
continue;
}
for (const match of line.matchAll(INTERNAL_LINK_REGEX)) {
const raw = match[0];
const startIndex = match.index ?? line.indexOf(raw);
// A candidate that does not start the line must follow one of the
// characters accepted by VALID_PREFIX_REGEX.
if (startIndex > 0) {
const prevChar = line[startIndex - 1];
if (!VALID_PREFIX_REGEX.test(prevChar)) {
continue;
}
}
const sanitized = sanitizeInternalLink(raw);
if (!sanitized) continue;
const normalized = normalizeInternalLink(sanitized);
if (!normalized || normalized === "//") continue;
entries.push({ link: normalized, line: index + 1 });
}
}
return entries;
}
/**
 * Records one reference to a broken link, creating the entry for the link on
 * first use and ignoring repeated references from the same file and line.
 *
 * @param {Map<string, object>} missingMap - Broken links keyed by link; mutated in place.
 * @param {string} link - Normalized broken link.
 * @param {string} filePath - File that contains the reference.
 * @param {number} line - Line number of the reference.
 * @returns {void}
 */
function addMissingLink(missingMap, link, filePath, line) {
  let record = missingMap.get(link);
  if (!record) {
    record = {
      expectedPath: expectedDirForLink(link),
      references: [],
      referenceKeys: new Set(),
    };
    missingMap.set(link, record);
  }

  const referenceKey = `${filePath}:${line}`;
  if (!record.referenceKeys.has(referenceKey)) {
    record.referenceKeys.add(referenceKey);
    record.references.push({ file: relativeToSite(filePath), line });
  }
}
/**
 * Checks every internal link found under the content directory and reports
 * those that match neither an existing directory nor a taxonomy term path.
 *
 * Exits with status 1 when the content directory is missing; sets
 * `process.exitCode` to 1 when broken links are reported.
 *
 * @returns {void}
 */
function main() {
  if (!fs.existsSync(CONTENT_DIR)) {
    console.error(`Le dossier content est introuvable (${CONTENT_DIR}).`);
    process.exit(1);
  }

  const { files, directories } = collectContentEntries(CONTENT_DIR);

  // Taxonomy term pages have no directory of their own but are valid targets.
  collectTaxonomyKeywordPaths(files).forEach((keywordPath) => {
    directories.add(keywordPath);
  });

  const missingLinks = new Map();
  for (const filePath of files) {
    let entries;
    try {
      entries = extractInternalLinks(filePath);
    } catch (error) {
      console.warn(`Impossible de lire ${relativeToSite(filePath)} (${error.message}).`);
      continue;
    }

    for (const { link, line } of entries) {
      if (!directories.has(link)) {
        addMissingLink(missingLinks, link, filePath, line);
      }
    }
  }

  if (missingLinks.size === 0) {
    console.log("Tous les liens internes pointent vers un dossier existant.");
    return;
  }

  console.error(`Liens internes cassés détectés: ${missingLinks.size}`);

  const sorted = [...missingLinks].sort(([linkA], [linkB]) => linkA.localeCompare(linkB, "fr"));
  for (const [link, { expectedPath, references }] of sorted) {
    console.error(`- ${link} (attendu: ${relativeToSite(expectedPath)})`);
    for (const { file, line } of references) {
      console.error(`${file}:${line}`);
    }
  }

  process.exitCode = 1;
}
// Run the check only when this file is the entry point, not when it is loaded
// through require().
if (require.main === module) {
try {
main();
} catch (error) {
// Any error escaping main() is reported and ends the process with status 1.
console.error(`Erreur lors de la vérification des liens internes: ${error.message}`);
process.exit(1);
}
}