1
Files
2025/tools/check_external_links.js

816 lines
22 KiB
JavaScript

#!/usr/bin/env node
const fs = require("fs");
const path = require("path");
const util = require("util");
const yaml = require("js-yaml");
const UserAgent = require("user-agents");
const { execFile } = require("child_process");
const {
collectMarkdownLinksFromFile,
extractLinksFromText,
} = require("./lib/markdown_links");
// Promise-returning wrapper around child_process.execFile (used below to run curl).
const execFileAsync = util.promisify(execFile);
// Site root: the parent of the directory that contains this script.
const SITE_ROOT = path.resolve(__dirname, "..");
// Directory that is scanned for Markdown and YAML files.
const CONTENT_DIR = path.join(SITE_ROOT, "content");
// Optional JSON configuration file, expected next to this script.
const CONFIG_PATH = path.join(__dirname, "config.json");
// Milliseconds in one day; used to convert cache TTLs expressed in days.
const DAY_MS = 24 * 60 * 60 * 1000;
// Built-in defaults; any key can be overridden by the `externalLinks` section
// of the JSON configuration file.
const DEFAULT_CONFIG = {
// Directory holding the report/cache file (and, by default, the cookie jar).
cacheDir: path.join(__dirname, "cache"),
// Name of the YAML report/cache file; joined to the cache directory when relative.
cacheFile: "external_links.yaml",
// Minimum delay between two requests sent to the same host.
hostDelayMs: 2000,
// Pause inserted before retrying a URL with GET.
retryDelayMs: 5000,
// Passed to curl as --max-time.
requestTimeoutSeconds: 5,
// How long a cached result stays valid, depending on the outcome of the last check.
cacheTtlSuccessDays: 30,
cacheTtlClientErrorDays: 7,
cacheTtlServerErrorDays: 1,
cacheTtlTimeoutDays: 7,
// Upper bound on the number of hosts checked in parallel.
maxConcurrentHosts: 4,
// null (or any blank/non-string value) means: generate one with the user-agents package.
userAgent: null,
// When not explicitly false, curl is given a cookie jar to read from and write to.
enableCookies: true,
cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
};
/**
 * Reads the optional JSON configuration file located at CONFIG_PATH.
 *
 * A missing file is not an error. A file that cannot be parsed, or whose
 * top-level JSON value is not an object (for example `null` or a number),
 * is reported with a warning and treated as an empty configuration so that
 * callers can always read properties from the result.
 *
 * @returns {object} The parsed configuration, or `{}` when unavailable or invalid.
 */
function loadConfig() {
  if (!fs.existsSync(CONFIG_PATH)) {
    return {};
  }
  try {
    const parsed = JSON.parse(fs.readFileSync(CONFIG_PATH, "utf8"));
    // JSON.parse accepts `null` and primitives; reading a property of `null`
    // would crash the caller, so only real objects are handed back.
    if (parsed === null || typeof parsed !== "object") {
      console.warn(
        `Configuration ignorée: ${path.relative(SITE_ROOT, CONFIG_PATH)} ne contient pas un objet JSON.`
      );
      return {};
    }
    return parsed;
  } catch (error) {
    console.warn(
      `Impossible de parser ${path.relative(SITE_ROOT, CONFIG_PATH)} (${error.message}).`
    );
    return {};
  }
}
// Effective settings: defaults overridden by the `externalLinks` section of the config file.
const rawConfig = loadConfig();
const settings = {
...DEFAULT_CONFIG,
...(rawConfig.externalLinks || {}),
};
// A relative cacheDir is resolved against the site root.
const CACHE_DIR = path.isAbsolute(settings.cacheDir)
? settings.cacheDir
: path.resolve(SITE_ROOT, settings.cacheDir);
// A relative cacheFile is resolved against the cache directory.
const REPORT_PATH = path.isAbsolute(settings.cacheFile)
? settings.cacheFile
: path.join(CACHE_DIR, settings.cacheFile);
// A relative cookieJar is resolved against the site root; a falsy value
// falls back to a file inside the cache directory.
const COOKIE_JAR = settings.cookieJar
? path.isAbsolute(settings.cookieJar)
? settings.cookieJar
: path.resolve(SITE_ROOT, settings.cookieJar)
: path.join(CACHE_DIR, "curl_cookies.txt");
// Delays are clamped to >= 0; a value that is not a usable number becomes 0.
const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
// At least 1 second; a value of 0 or a non-number falls back to 5.
const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
// At least 1; a non-finite value falls back to the default.
const MAX_CONCURRENT_HOSTS = Math.max(
1,
Number.isFinite(Number(settings.maxConcurrentHosts))
? Number(settings.maxConcurrentHosts)
: DEFAULT_CONFIG.maxConcurrentHosts
);
// A non-blank configured string wins; otherwise one user agent is generated
// once and reused for the whole run.
const DEFAULT_USER_AGENT =
typeof settings.userAgent === "string" && settings.userAgent.trim()
? settings.userAgent.trim()
: new UserAgent().toString();
// Cookies stay enabled unless the setting is exactly `false`.
const ENABLE_COOKIES = settings.enableCookies !== false;
// Cache lifetimes in milliseconds. pickNumber and daysToMs are function
// declarations defined further down; hoisting makes them callable here.
const CACHE_TTL_SUCCESS_MS = daysToMs(
pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
);
const CACHE_TTL_CLIENT_ERROR_MS = daysToMs(
pickNumber(settings.cacheTtlClientErrorDays, DEFAULT_CONFIG.cacheTtlClientErrorDays)
);
const CACHE_TTL_SERVER_ERROR_MS = daysToMs(
pickNumber(settings.cacheTtlServerErrorDays, DEFAULT_CONFIG.cacheTtlServerErrorDays)
);
const CACHE_TTL_TIMEOUT_MS = daysToMs(
pickNumber(settings.cacheTtlTimeoutDays, DEFAULT_CONFIG.cacheTtlTimeoutDays)
);
// Make sure the cache directory exists before anything is read or written.
fs.mkdirSync(CACHE_DIR, { recursive: true });
if (ENABLE_COOKIES) {
// The cookie jar is handed to curl both as --cookie and --cookie-jar, so an
// empty file is created up front when it does not exist yet.
fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
if (!fs.existsSync(COOKIE_JAR)) {
// Opening in append mode creates the file without truncating existing content.
fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
}
}
/**
 * Returns `value` as a finite number, or `fallback` when it is not one.
 *
 * Only numbers and non-blank strings are considered. `Number()` turns `null`,
 * `""`, `false` and `[]` into 0 (and `true` into 1), which would silently
 * replace a missing setting by 0 instead of the default.
 *
 * @param {*} value - Raw setting, typically read from the JSON configuration.
 * @param {number} fallback - Value returned when `value` is not numeric.
 * @returns {number} The parsed finite number, or `fallback`.
 */
function pickNumber(value, fallback) {
  const isNumericInput =
    typeof value === "number" || (typeof value === "string" && value.trim() !== "");
  if (!isNumericInput) {
    return fallback;
  }
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : fallback;
}
/**
 * Converts a duration in days to milliseconds.
 *
 * @param {number} days - Duration in days.
 * @returns {number} Milliseconds, or 0 when `days` is not a finite number greater than zero.
 */
function daysToMs(days) {
  const isUsable = Number.isFinite(days) && days > 0;
  return isUsable ? days * DAY_MS : 0;
}
/**
 * Creates the parent directory of `targetFile` (and any missing ancestors).
 * The file itself is not created.
 *
 * @param {string} targetFile - Path of the file whose directory must exist.
 */
function ensureDirectoryExists(targetFile) {
  const parentDirectory = path.dirname(targetFile);
  fs.mkdirSync(parentDirectory, { recursive: true });
}
/**
 * Replaces the platform path separator with "/" in a string path.
 *
 * @param {*} relativePath - Path to convert.
 * @returns {*} The converted string; non-string input is returned unchanged.
 */
function toPosix(relativePath) {
  if (typeof relativePath !== "string") {
    return relativePath;
  }
  const segments = relativePath.split(path.sep);
  return segments.join("/");
}
/**
 * Expresses `filePath` relative to the site root, using "/" separators.
 *
 * @param {string} filePath - Path of a file.
 * @returns {string} Site-relative path with forward slashes.
 */
function relativeToSite(filePath) {
  const fromSiteRoot = path.relative(SITE_ROOT, filePath);
  return toPosix(fromSiteRoot);
}
/**
 * Converts a content-relative file path into a site page path.
 *
 * Examples: "blog/post/index.md" -> "/blog/post", "blog/_index.md" -> "/blog",
 * "_index.md" -> "/", "notes/page.markdown" -> "/notes/page".
 *
 * @param {string} relativeContentPath - Path relative to the content directory
 *   (a leading "content/" prefix is tolerated and removed).
 * @returns {string|null} Page path starting with "/", or null for empty input.
 */
function toPagePath(relativeContentPath) {
  if (!relativeContentPath) return null;
  let normalized = toPosix(relativeContentPath);
  if (!normalized) return null;
  normalized = normalized.replace(/^content\//, "");
  if (!normalized) {
    return "/";
  }
  normalized = normalized
    // Drop a trailing index/_index file. `(^|\/)` also covers an index file that
    // sits directly at the content root, which has no leading slash to match.
    .replace(/(^|\/)_?index\.(?:md|markdown)$/i, "")
    // Both extensions are stripped because the script scans ".md" and ".markdown".
    .replace(/\.(?:md|markdown)$/i, "")
    .replace(/\/+/g, "/")
    .replace(/\/+$/, "")
    .replace(/^\/+/, "");
  return normalized ? `/${normalized}` : "/";
}
/**
 * Resolves the page path of the bundle that owns a file stored under ".../data/...".
 *
 * Everything from the first "/data/" segment onward is dropped to get the bundle
 * root. If that directory contains index.md or _index.md (checked in this order)
 * the page path is derived from that file, otherwise from the bundle root itself.
 *
 * @param {string} contentRelative - Path relative to the content directory.
 * @returns {string|null} Page path, or null for empty input.
 */
function deriveBundlePagePath(contentRelative) {
  if (!contentRelative) return null;
  const bundleRoot = contentRelative.replace(/\/data\/.*$/, "");
  const indexFile = ["index.md", "_index.md"]
    .map((fileName) => `${bundleRoot}/${fileName}`)
    .find((candidate) => fs.existsSync(path.join(CONTENT_DIR, candidate)));
  return toPagePath(indexFile ?? bundleRoot);
}
/**
 * Maps a site-relative file path to the page path it belongs to.
 *
 * Only files below "content/" are mapped. Files located under a "/data/"
 * directory are attributed to their owning bundle.
 *
 * @param {*} relativeFile - Site-relative path (any separator).
 * @returns {string|null} Page path, or null when the input is not a string
 *   or does not live under "content/".
 */
function derivePagePath(relativeFile) {
  if (typeof relativeFile !== "string") return null;
  const CONTENT_PREFIX = "content/";
  const posixPath = toPosix(relativeFile);
  const isContentFile = Boolean(posixPath) && posixPath.startsWith(CONTENT_PREFIX);
  if (!isContentFile) return null;
  const insideContent = posixPath.slice(CONTENT_PREFIX.length);
  return insideContent.includes("/data/")
    ? deriveBundlePagePath(insideContent)
    : toPagePath(insideContent);
}
/**
 * Loads the previous report/cache from REPORT_PATH.
 *
 * Two layouts are accepted: one with an `entries` map at the top level, and one
 * where the top-level mapping itself is handed to normalizeEntries. A missing
 * or unreadable file yields an empty state (with a warning in the latter case).
 *
 * @returns {{generatedAt: (string|null), links: Array, entries: object}}
 */
function loadState() {
  const emptyState = () => ({ generatedAt: null, links: [], entries: {} });
  if (!fs.existsSync(REPORT_PATH)) {
    return emptyState();
  }
  try {
    const payload = yaml.load(fs.readFileSync(REPORT_PATH, "utf8")) || {};
    const hasEntriesMap = payload.entries && typeof payload.entries === "object";
    return {
      generatedAt: payload.generatedAt || null,
      links: Array.isArray(payload.links) ? payload.links : [],
      entries: normalizeEntries(hasEntriesMap ? payload.entries : payload),
    };
  } catch (error) {
    const shownPath = path.relative(SITE_ROOT, REPORT_PATH);
    console.warn(`Impossible de lire ${shownPath} (${error.message}).`);
    return emptyState();
  }
}
/**
 * Builds a clean URL -> entry map from raw cached data.
 *
 * Keys that do not contain "://" are ignored; every remaining value is passed
 * through normalizeEntryShape.
 *
 * @param {*} rawEntries - Raw mapping loaded from the cache file.
 * @returns {object} Normalized entries keyed by URL (empty for non-object input).
 */
function normalizeEntries(rawEntries) {
  if (!rawEntries || typeof rawEntries !== "object") {
    return {};
  }
  return Object.entries(rawEntries).reduce((accumulator, [url, data]) => {
    if (url.includes("://")) {
      accumulator[url] = normalizeEntryShape(url, data);
    }
    return accumulator;
  }, {});
}
/**
 * Coerces one cached record into the canonical entry shape.
 *
 * `checked` is accepted as an alternative to `checkedAt`, and `files` is
 * forwarded to normalizeLocations as a fallback for `locations`.
 *
 * @param {string} url - URL the record belongs to.
 * @param {*} raw - Raw cached record (may be null/undefined).
 * @returns {{url: string, status: (number|null), errorType: (string|null),
 *   method: (string|null), checkedAt: (string|null), locations: Array}}
 */
function normalizeEntryShape(url, raw) {
  const { status, errorType, method, checkedAt, checked, locations, files } = raw ?? {};
  const lastChecked = checkedAt || checked || null;
  const normalizedLocations = normalizeLocations(locations, files);
  return {
    url,
    status: typeof status === "number" ? status : null,
    errorType: errorType || null,
    method: method || null,
    checkedAt: lastChecked,
    locations: normalizedLocations,
  };
}
/**
 * Normalizes the places where a URL was found.
 *
 * `locations` may mix "file:line" strings and `{file, line, page}` objects.
 * When it yields nothing, `fallbackFiles` (a list of file paths) is used
 * instead, with no line information. The result is deduplicated and sorted.
 *
 * @param {*} locations - Raw location list.
 * @param {*} fallbackFiles - File list used only when `locations` produced no item.
 * @returns {Array<{file: string, line: (number|null), page: (string|null)}>}
 */
function normalizeLocations(locations, fallbackFiles) {
  // "path/to/file.md:12" -> { file, line, page }
  const fromText = (text) => {
    const [filePart, linePart] = text.split(":");
    const file = toPosix(filePart.trim());
    return {
      file,
      line: linePart ? Number.parseInt(linePart, 10) || null : null,
      page: derivePagePath(file),
    };
  };
  // { file, line?, page? } -> normalized item, or null when `file` is unusable.
  const fromRecord = (record) => {
    if (!sizeof(record.file)) return null;
    const file = toPosix(record.file);
    const hasExplicitPage = typeof record.page === "string" && record.page.trim();
    return {
      file,
      line: typeof record.line === "number" ? record.line : null,
      page: hasExplicitPage ? toPosix(record.page.trim()) : derivePagePath(file),
    };
  };

  const items = [];
  if (Array.isArray(locations)) {
    for (const entry of locations) {
      if (!entry) continue;
      let item = null;
      if (typeof entry === "string") {
        item = fromText(entry);
      } else if (typeof entry === "object") {
        item = fromRecord(entry);
      }
      if (item) items.push(item);
    }
  }
  if (items.length === 0 && Array.isArray(fallbackFiles)) {
    for (const file of fallbackFiles) {
      if (!file) continue;
      const normalizedFile = toPosix(file);
      items.push({ file: normalizedFile, line: null, page: derivePagePath(normalizedFile) });
    }
  }
  return dedupeAndSortLocations(items);
}
/**
 * Tells whether `value` is a string containing at least one non-whitespace character.
 *
 * @param {*} value - Value to test.
 * @returns {boolean}
 */
function sizeof(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
/**
 * Removes duplicate locations (same file and line) and sorts the remainder
 * by file name, then by line number with line-less locations last.
 *
 * Items without a `file` are dropped. The first occurrence of a duplicate wins.
 *
 * @param {*} list - Locations to clean up.
 * @returns {Array<{file: string, line: (number|null), page: (string|null)}>}
 */
function dedupeAndSortLocations(list) {
  if (!Array.isArray(list) || list.length === 0) {
    return [];
  }
  const unique = new Map();
  for (const item of list) {
    if (!item?.file) continue;
    const key = `${item.file}::${item.line ?? ""}`;
    if (unique.has(key)) continue;
    const file = toPosix(item.file);
    const hasExplicitPage = typeof item.page === "string" && item.page.trim();
    unique.set(key, {
      file,
      line: typeof item.line === "number" ? item.line : null,
      page: hasExplicitPage ? toPosix(item.page.trim()) : derivePagePath(file),
    });
  }
  const lineRank = (location) => location.line ?? Number.POSITIVE_INFINITY;
  const byFileThenLine = (left, right) =>
    left.file.localeCompare(right.file) || lineRank(left) - lineRank(right);
  return [...unique.values()].sort(byFileThenLine);
}
/**
 * Serializes `state` as YAML and writes it to REPORT_PATH, creating the
 * parent directory first.
 *
 * @param {object} state - Report/cache content to persist.
 */
function saveState(state) {
  ensureDirectoryExists(REPORT_PATH);
  const serialized = yaml.dump(state);
  fs.writeFileSync(REPORT_PATH, serialized, "utf8");
}
/**
 * Creates an entry for `url`, optionally seeded from previously known data.
 *
 * @param {string} url - URL of the entry.
 * @param {object} [existing={}] - Previously known fields to carry over.
 * @returns {{url: string, status: (number|null), errorType: (string|null),
 *   method: (string|null), checkedAt: (string|null), locations: Array}}
 */
function createEntry(url, existing = {}) {
  const { status, errorType, method, checkedAt, locations } = existing;
  const knownLocations = Array.isArray(locations) ? dedupeAndSortLocations(locations) : [];
  return {
    url,
    status: typeof status === "number" ? status : null,
    errorType: errorType || null,
    method: method || null,
    checkedAt: checkedAt || null,
    locations: knownLocations,
  };
}
/**
 * Combines cached entries with the occurrences found in the current scan.
 *
 * Only URLs present in `occurrences` are kept. Cached fields are preserved
 * for them, while their locations are replaced by the freshly collected ones.
 *
 * @param {object} entries - Cached entries keyed by URL.
 * @param {Map<string, Array>} occurrences - URL -> locations found in this scan.
 * @returns {object} Merged entries keyed by URL.
 */
function mergeOccurrences(entries, occurrences) {
  const merged = {};
  occurrences.forEach((urlOccurrences, url) => {
    const previous = entries[url] || createEntry(url);
    const locations = dedupeAndSortLocations(urlOccurrences);
    merged[url] = { ...previous, url, locations };
  });
  return merged;
}
/**
 * Registers that `url` appears in `filePath` at `line`.
 *
 * The file is stored site-relative; a line that is not a finite number is
 * stored as null. An occurrence already known for the same file and line is
 * not added twice.
 *
 * @param {Map<string, Array>} map - URL -> list of occurrences, updated in place.
 * @param {string} filePath - Path of the file containing the link.
 * @param {*} line - Line number, if known.
 * @param {string} url - The link that was found.
 */
function recordOccurrence(map, filePath, line, url) {
  const keyOf = (file, lineNumber) => `${file}:${lineNumber ?? ""}`;
  if (!map.has(url)) {
    map.set(url, []);
  }
  const file = relativeToSite(filePath);
  const lineNumber = typeof line === "number" && Number.isFinite(line) ? line : null;
  const page = derivePagePath(file);
  const known = map.get(url);
  const wanted = keyOf(file, lineNumber);
  const alreadyKnown = known.some((item) => keyOf(item.file, item.line) === wanted);
  if (!alreadyKnown) {
    known.push({ file, line: lineNumber, page });
  }
}
/**
 * Removes a trailing YAML comment from a single line.
 *
 * A "#" starts a comment only when it is outside quotes AND is either the
 * first character of the line or preceded by whitespace, as YAML requires.
 * This keeps URL fragments such as "https://example.com/page#section" intact.
 *
 * Quote handling: inside single quotes, '' is an escaped quote; inside double
 * quotes, a backslash escapes the following character (so `\"` does not close
 * the scalar, while `\\"` does).
 *
 * @param {string} line - One line of YAML source.
 * @returns {string} The line up to (not including) the comment marker, or the
 *   unchanged line when it carries no comment.
 */
function stripYamlInlineComment(line) {
  let inSingle = false;
  let inDouble = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (inDouble) {
      if (ch === "\\") {
        // Skip the escaped character so it can neither close the scalar
        // nor be mistaken for another escape.
        i++;
      } else if (ch === '"') {
        inDouble = false;
      }
      continue;
    }
    if (inSingle) {
      if (ch === "'") {
        if (line[i + 1] === "'") {
          i++; // '' is a literal quote inside a single-quoted scalar
        } else {
          inSingle = false;
        }
      }
      continue;
    }
    if (ch === "'") {
      inSingle = true;
    } else if (ch === '"') {
      inDouble = true;
    } else if (ch === "#" && (i === 0 || /\s/.test(line[i - 1]))) {
      return line.slice(0, i);
    }
  }
  return line;
}
/**
 * Tells whether a line consists only of a YAML comment, i.e. its first
 * non-whitespace character is "#".
 *
 * @param {string} line - One line of YAML source.
 * @returns {boolean}
 */
function isYamlCommentLine(line) {
  const content = line.trim();
  return content.startsWith("#");
}
/**
 * Tells whether a line ends with a block scalar header after a colon,
 * e.g. "key: |", "key: >-" or "key: |2" (a trailing comment is ignored).
 *
 * @param {string} line - One line of YAML source.
 * @returns {boolean}
 */
function isBlockScalarIndicator(line) {
  const BLOCK_SCALAR_HEADER = /:\s*[>|][0-9+-]*\s*$/;
  const content = stripYamlInlineComment(line).trim();
  return BLOCK_SCALAR_HEADER.test(content);
}
/**
 * Walks a parsed YAML value depth-first and collects every link found in its
 * string values.
 *
 * @param {*} obj - Parsed YAML value (string, array, object or anything else).
 * @param {Set<string>} [links] - Accumulator; a new Set is created when omitted.
 * @returns {Set<string>} The accumulator, for convenience.
 */
function processYamlRecursively(obj, links = new Set()) {
  if (typeof obj === "string") {
    for (const link of extractLinksFromText(obj)) {
      links.add(link);
    }
    return links;
  }
  if (obj && typeof obj === "object") {
    // Arrays are visited element by element, plain objects value by value.
    const children = Array.isArray(obj) ? obj : Object.values(obj);
    for (const child of children) {
      processYamlRecursively(child, links);
    }
  }
  return links;
}
/**
 * Records every link that collectMarkdownLinksFromFile reports for a Markdown file.
 *
 * @param {string} filePath - Path of the Markdown file.
 * @param {Map<string, Array>} occurrences - URL -> occurrences, updated in place.
 * @returns {Promise<void>}
 */
async function collectMarkdownLinks(filePath, occurrences) {
  const foundLinks = await collectMarkdownLinksFromFile(filePath);
  for (const link of foundLinks) {
    recordOccurrence(occurrences, filePath, link.line, link.url);
  }
}
/**
 * Records the links contained in a YAML file, with line numbers when they can
 * be located.
 *
 * Works in two passes: the file is first parsed with js-yaml to collect the
 * links present in its string values, then the raw text is scanned line by
 * line to find where each of those links appears. Links found by the parser
 * but not located on any line are recorded with a null line.
 *
 * @param {string} filePath - Path of the YAML file.
 * @param {Map<string, Array>} occurrences - URL -> occurrences, updated in place.
 * @returns {Promise<void>}
 */
async function collectYamlLinks(filePath, occurrences) {
let linkSet = new Set();
try {
const doc = yaml.load(fs.readFileSync(filePath, "utf8"));
if (doc) {
linkSet = processYamlRecursively(doc);
}
} catch (error) {
// An unparsable file is reported and skipped entirely.
console.warn(`Impossible de parser ${relativeToSite(filePath)} (${error.message}).`);
return;
}
if (linkSet.size === 0) {
return;
}
// url -> Set of line numbers already recorded for this file.
const recorded = new Map();
const lines = fs.readFileSync(filePath, "utf8").split(/\r?\n/);
// True while scanning the body of a block scalar ("key: |" / "key: >").
let inBlockScalar = false;
// Minimum indentation a line needs to be treated as part of the current block scalar.
let blockIndent = 0;
// Records (url, lineNumber) once per file.
const mark = (url, lineNumber) => {
if (!recorded.has(url)) {
recorded.set(url, new Set());
}
const set = recorded.get(url);
if (!set.has(lineNumber)) {
set.add(lineNumber);
recordOccurrence(occurrences, filePath, lineNumber, url);
}
};
for (let index = 0; index < lines.length; index++) {
const lineNumber = index + 1;
const line = lines[index];
const indent = line.match(/^\s*/)?.[0].length ?? 0;
const trimmed = line.trim();
if (inBlockScalar) {
// NOTE(review): a blank line indented less than the block (including a
// completely empty line) ends block-scalar mode here, although YAML allows
// blank lines inside a block scalar. Later lines of the same scalar are then
// scanned as regular lines (with comment stripping) — confirm this is intended.
if (trimmed === "" && indent < blockIndent) {
inBlockScalar = false;
continue;
}
if (trimmed === "" || indent >= blockIndent) {
// Block scalar content is scanned without comment stripping; only lines
// whose first non-blank character is "#" are ignored.
if (!isYamlCommentLine(line)) {
for (const link of extractLinksFromText(line)) {
if (linkSet.has(link)) {
mark(link, lineNumber);
}
}
}
continue;
}
// A non-blank line indented less than the block ends the scalar; it is
// then handled as a regular line below.
inBlockScalar = false;
}
const withoutComment = stripYamlInlineComment(line);
const trimmedWithoutComment = withoutComment.trim();
if (isBlockScalarIndicator(line)) {
// The lines that follow belong to the scalar as long as they are indented
// deeper than this header line.
inBlockScalar = true;
blockIndent = indent + 1;
}
if (isYamlCommentLine(line) || !trimmedWithoutComment) {
continue;
}
// Only links that the YAML parser also found in values are recorded.
for (const link of extractLinksFromText(withoutComment)) {
if (linkSet.has(link)) {
mark(link, lineNumber);
}
}
}
// Links seen by the parser but not located on any line (for example because
// the text scan reconstructs them differently) are recorded without a line.
for (const link of linkSet) {
if (!recorded.has(link) || recorded.get(link).size === 0) {
recordOccurrence(occurrences, filePath, null, link);
}
}
}
/**
 * Recursively lists the files below `dir` whose extension is one of `exts`.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} exts - Accepted extensions, including the dot (e.g. ".md").
 * @returns {string[]} Matching file paths, in directory-listing order.
 */
function walk(dir, exts) {
  const found = [];
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      for (const nested of walk(fullPath, exts)) {
        found.push(nested);
      }
    } else if (exts.includes(path.extname(entry.name))) {
      found.push(fullPath);
    }
  }
  return found;
}
/**
 * Returns a promise that resolves after `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// host -> timestamp (ms since epoch) of the most recent request sent to it.
const lastHostChecks = new Map();

/**
 * Waits, if needed, so that at least HOST_DELAY_MS elapse between two
 * requests to the same host. Does nothing for an unknown host, a falsy host,
 * or when the delay is disabled.
 *
 * @param {string|null} host - Host about to be contacted.
 * @returns {Promise<void>}
 */
async function applyHostDelay(host) {
  if (!host || HOST_DELAY_MS <= 0) {
    return;
  }
  const previousCheck = lastHostChecks.get(host);
  if (!previousCheck) {
    return;
  }
  const remaining = HOST_DELAY_MS - (Date.now() - previousCheck);
  if (remaining > 0) {
    await delay(remaining);
  }
}
/**
 * Remembers the current time as the moment `host` was last contacted.
 * A falsy host is ignored.
 *
 * @param {string|null} host - Host that was just contacted.
 */
function recordHostCheck(host) {
  if (!host) {
    return;
  }
  lastHostChecks.set(host, Date.now());
}
/**
 * Extracts the host name (without port) from a URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string|null} The host name, or null when `url` cannot be parsed.
 */
function extractHost(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  return parsed.hostname;
}
/**
 * Checks a URL by running curl and reports the resulting HTTP status.
 *
 * curl is run with --location (follow redirects), --fail (HTTP errors become
 * a non-zero exit code), --max-time, and --write-out "%{http_code}" so the
 * final status code is the only thing printed on stdout.
 *
 * HEAD requests use curl's --head option. Using "--request HEAD" would only
 * rename the method: curl would still wait for a response body that a HEAD
 * response never carries, and the request would typically end in a timeout.
 *
 * @param {string} url - URL to check.
 * @param {string} method - HTTP method ("HEAD", "GET", ...).
 * @param {string|null} hostHeader - When set, sent as an explicit Host header.
 * @returns {Promise<{status: (number|null), errorType: (string|null), method: string}>}
 *   `status` is the code printed by curl (0 when no response was received),
 *   `errorType` is "timeout" when curl exited with code 28, null otherwise.
 */
async function curlRequest(url, method, hostHeader) {
  const verb = method.toUpperCase();
  const args = [
    "--silent",
    "--location",
    "--fail",
    "--max-time",
    `${REQUEST_TIMEOUT_SECONDS}`,
    "--output",
    "/dev/null",
    "--write-out",
    "%{http_code}",
    "--user-agent",
    DEFAULT_USER_AGENT,
  ];
  if (verb === "HEAD") {
    args.push("--head");
  } else {
    args.push("--request", method);
  }
  if (ENABLE_COOKIES) {
    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
  }
  if (hostHeader) {
    args.push("-H", `Host: ${hostHeader}`);
  }
  args.push(url);
  try {
    const { stdout } = await execFileAsync("curl", args);
    const status = Number.parseInt(stdout.trim(), 10);
    return {
      status: Number.isNaN(status) ? null : status,
      errorType: null,
      method: verb,
    };
  } catch (error) {
    // With --fail, HTTP errors and transport failures reject; the status code
    // written by --write-out is still available on the error's stdout.
    const rawStatus = error?.stdout?.toString().trim();
    const status = rawStatus ? Number.parseInt(rawStatus, 10) : null;
    // curl exit code 28 means the operation timed out.
    const timedOut = Number(error?.code) === 28;
    return {
      status: Number.isNaN(status) ? null : status,
      errorType: timedOut ? "timeout" : null,
      method: verb,
    };
  }
}
/**
 * Decides whether a URL must be checked again with GET.
 *
 * A retry is needed when there is no result, when an error type was recorded,
 * when no numeric status is available, or when the status is 400 or above.
 *
 * @param {*} result - Outcome of the previous request.
 * @returns {boolean}
 */
function shouldRetryWithGet(result) {
  if (!result || result.errorType) {
    return true;
  }
  const { status } = result;
  return typeof status !== "number" || status >= 400;
}
/**
 * Returns how long (in ms) the cached result of an entry remains valid.
 *
 * Timeouts, missing or zero statuses and anything outside 200-599 use the
 * timeout TTL; 2xx/3xx use the success TTL; 4xx the client-error TTL; 5xx and
 * above the server-error TTL.
 *
 * @param {*} entry - Cached entry.
 * @returns {number} TTL in milliseconds (0 when there is no entry).
 */
function getTtlMs(entry) {
  if (!entry) return 0;
  const { errorType, status: rawStatus } = entry;
  const gotNoResponse = errorType === "timeout" || rawStatus === 0 || rawStatus === null;
  // NaN fails every comparison below and therefore lands on the timeout TTL.
  const code = gotNoResponse ? Number.NaN : Number(rawStatus);
  if (code >= 500) return CACHE_TTL_SERVER_ERROR_MS;
  if (code >= 400) return CACHE_TTL_CLIENT_ERROR_MS;
  if (code >= 200) return CACHE_TTL_SUCCESS_MS;
  return CACHE_TTL_TIMEOUT_MS;
}
/**
 * Tells whether an entry has to be (re)checked: it was never checked, its
 * check date is unreadable, its TTL is zero or negative, or the TTL has elapsed.
 *
 * @param {*} entry - Cached entry.
 * @returns {boolean}
 */
function needsCheck(entry) {
  const checkedAtMs = entry?.checkedAt ? Date.parse(entry.checkedAt) : Number.NaN;
  if (Number.isNaN(checkedAtMs)) {
    return true;
  }
  const ttl = getTtlMs(entry);
  return ttl <= 0 || Date.now() - checkedAtMs >= ttl;
}
/**
 * Groups entries by the host of their URL, preserving first-seen order.
 *
 * Entries whose URL has no parsable host each get a group of their own
 * (with `host` set to null).
 *
 * @param {Iterable<{url: string}>} entries - Entries to group.
 * @returns {Array<{host: (string|null), entries: Array}>}
 */
function groupEntriesByHost(entries) {
  const byHost = new Map();
  for (const entry of entries) {
    const host = extractHost(entry.url);
    const bucketKey = host || `__invalid__:${entry.url}`;
    let bucket = byHost.get(bucketKey);
    if (!bucket) {
      bucket = { host, entries: [] };
      byHost.set(bucketKey, bucket);
    }
    bucket.entries.push(entry);
  }
  return [...byHost.values()];
}
/**
 * Runs `worker` on every item, keeping at most `concurrency` calls in flight.
 *
 * While the pool is full, the function waits for any running task to settle
 * before starting the next one; a rejection observed during that wait is
 * propagated to the caller. Once every item has been started, it waits for
 * all remaining tasks to settle.
 *
 * @param {Iterable} items - Items to process.
 * @param {function(*): *} worker - Called once per item; may return a promise.
 * @param {number} concurrency - Maximum number of simultaneous workers.
 * @returns {Promise<void>}
 */
async function runWithConcurrency(items, worker, concurrency) {
  const inFlight = new Set();
  const launch = (item) => {
    const job = Promise.resolve().then(() => worker(item));
    inFlight.add(job);
    const release = () => inFlight.delete(job);
    // Release the slot whether the job succeeds or fails.
    job.then(release).catch(release);
  };
  for (const item of items) {
    launch(item);
    if (inFlight.size >= concurrency) {
      await Promise.race(inFlight);
    }
  }
  await Promise.allSettled(inFlight);
}
/**
 * Stores the outcome of a check on an entry (in place) and stamps it with the
 * current time in ISO 8601 format.
 *
 * @param {object} entry - Entry to update.
 * @param {{status: *, errorType: *, method: string}} result - Outcome of the request.
 */
function updateEntryWithResult(entry, result) {
  const checkedAt = new Date().toISOString();
  const { status, errorType, method } = result;
  Object.assign(entry, {
    status: typeof status === "number" ? status : null,
    errorType: errorType || null,
    method,
    checkedAt,
  });
}
/**
 * Produces the status shown in the report for an entry: "timeout", the
 * numeric HTTP status, or "error" when neither is available.
 *
 * @param {*} entry - Checked entry.
 * @returns {number|string}
 */
function formatStatusForReport(entry) {
  if (entry?.errorType === "timeout") {
    return "timeout";
  }
  return typeof entry?.status === "number" ? entry.status : "error";
}
/**
 * Tells whether an entry must be reported as a dead link.
 *
 * A link is alive only when its last check produced a 2xx or 3xx status
 * without timing out. In particular status 0 — what curl's "%{http_code}"
 * reports when no HTTP response was received (DNS failure, refused
 * connection, ...) — counts as dead, in line with getTtlMs which treats
 * such entries like timeouts.
 *
 * @param {*} entry - Checked entry.
 * @returns {boolean} false for a missing entry or a successful check, true otherwise.
 */
function isDead(entry) {
  if (!entry) return false;
  if (entry.errorType === "timeout") return true;
  if (typeof entry.status !== "number") return true;
  const isSuccess = entry.status >= 200 && entry.status < 400;
  return !isSuccess;
}
/**
 * Returns the sort rank of a report status: finite numbers rank as themselves,
 * "timeout" (any letter case) ranks 10000 and everything else 10001.
 *
 * @param {*} value - Status as produced by formatStatusForReport.
 * @returns {number}
 */
function getStatusOrder(value) {
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  const isTimeout = typeof value === "string" && value.toLowerCase() === "timeout";
  return isTimeout ? 10000 : 10001;
}
/**
 * Builds the list of dead links for the report.
 *
 * Links are ordered by status rank (numeric statuses ascending, then
 * "timeout", then "error"), and by URL within the same status so the report
 * is stable from one run to the next.
 *
 * @param {object} entries - All entries keyed by URL.
 * @returns {Array<{url: string, status: (number|string), locations: Array}>}
 */
function buildDeadLinks(entries) {
  const deadLinks = Object.values(entries)
    .filter((entry) => isDead(entry))
    .map((entry) => ({
      url: entry.url,
      status: formatStatusForReport(entry),
      locations: entry.locations || [],
    }));
  return deadLinks.sort((a, b) => {
    const orderDiff = getStatusOrder(a.status) - getStatusOrder(b.status);
    if (orderDiff !== 0) return orderDiff;
    // Equal rank: fall through to the label and then the URL. Returning the
    // numeric difference here (always 0 at this point) would skip the URL
    // tie-breaker for every pair of links sharing the same HTTP status.
    const labelDiff = String(a.status).localeCompare(String(b.status));
    if (labelDiff !== 0) return labelDiff;
    return a.url.localeCompare(b.url);
  });
}
/**
 * Rewrites the current terminal line with the number of URLs checked so far.
 * The leading carriage return moves the cursor back to the start of the line.
 *
 * @param {number} processed - URLs checked so far.
 * @param {number} total - URLs to check in total.
 */
function logProgress(processed, total) {
  const message = `\rURLs vérifiées ${processed}/${total}`;
  process.stdout.write(message);
}
/**
 * Scans the content directory and gathers every external link occurrence.
 *
 * Markdown files (.md, .markdown) are processed first, then YAML files
 * (.yaml, .yml); files are handled one after another.
 *
 * @returns {Promise<Map<string, Array>>} URL -> occurrences.
 */
async function collectOccurrences() {
  const occurrences = new Map();
  const passes = [
    { extensions: [".md", ".markdown"], collect: collectMarkdownLinks },
    { extensions: [".yaml", ".yml"], collect: collectYamlLinks },
  ];
  for (const { extensions, collect } of passes) {
    const files = walk(CONTENT_DIR, extensions);
    for (const file of files) {
      await collect(file, occurrences);
    }
  }
  return occurrences;
}
/**
 * Saves the current entries together with the report metadata carried over
 * from the previous run, so progress survives an interrupted check.
 *
 * @param {object} entries - All entries keyed by URL.
 * @param {{generatedAt: *, links: *}} [snapshotMeta] - Metadata of the previous report.
 */
function persistEntriesSnapshot(entries, snapshotMeta) {
  const generatedAt = snapshotMeta?.generatedAt || null;
  const links = Array.isArray(snapshotMeta?.links) ? snapshotMeta.links : [];
  saveState({ generatedAt, links, entries });
}
/**
 * Checks the given entries over HTTP and stores each outcome in `entries`.
 *
 * Entries are grouped by host; groups run in parallel (bounded by
 * MAX_CONCURRENT_HOSTS) while the URLs of one host are checked one after
 * another. Each URL is tried with HEAD first and retried with GET when
 * shouldRetryWithGet says so. The whole state is written to disk after every
 * URL.
 *
 * @param {Array<{url: string}>} entriesToCheck - Entries that need a (re)check.
 * @param {object} entries - All entries keyed by URL; updated in place.
 * @param {{generatedAt: *, links: *}} snapshotMeta - Metadata written with each snapshot.
 * @returns {Promise<void>}
 */
async function checkEntries(entriesToCheck, entries, snapshotMeta) {
if (entriesToCheck.length === 0) {
return;
}
const hostGroups = groupEntriesByHost(entriesToCheck);
// Never start more workers than there are host groups.
const concurrency = Math.max(1, Math.min(MAX_CONCURRENT_HOSTS, hostGroups.length));
let processed = 0;
process.stdout.write(`Vérification de ${entriesToCheck.length} URL...\n`);
await runWithConcurrency(
hostGroups,
async ({ host, entries: groupEntries }) => {
for (const entry of groupEntries) {
if (host) {
// Honour the minimum delay between two requests to the same host.
await applyHostDelay(host);
}
const hostHeader = host || extractHost(entry.url);
let result = await curlRequest(entry.url, "HEAD", hostHeader);
recordHostCheck(host);
if (shouldRetryWithGet(result)) {
// Retry with GET after an optional pause, again honouring the per-host delay.
if (RETRY_DELAY_MS > 0) {
await delay(RETRY_DELAY_MS);
}
if (host) {
await applyHostDelay(host);
}
result = await curlRequest(entry.url, "GET", hostHeader);
recordHostCheck(host);
}
// The outcome is stored on the entry held in `entries`, looked up by URL.
updateEntryWithResult(entries[entry.url], result);
// Persist after every URL so an interrupted run keeps what was already checked.
persistEntriesSnapshot(entries, snapshotMeta);
processed += 1;
logProgress(processed, entriesToCheck.length);
}
},
concurrency
);
// Terminate the progress line, which logProgress rewrites in place.
process.stdout.write("\n");
}
/**
 * Entry point: collects the external links of the site, re-checks those whose
 * cached result is missing or expired, then writes the report (dead links plus
 * the full cache) and prints a summary.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const occurrences = await collectOccurrences();
  if (occurrences.size === 0) {
    // Nothing to check: still write an empty report.
    saveState({ generatedAt: new Date().toISOString(), links: [], entries: {} });
    console.log("Aucun lien externe détecté.");
    return;
  }

  const previous = loadState();
  const mergedEntries = mergeOccurrences(previous.entries, occurrences);
  const entriesArray = Object.values(mergedEntries);
  const pending = entriesArray.filter((entry) => needsCheck(entry));

  // Intermediate snapshots keep the metadata of the previous report.
  const snapshotMeta = {
    generatedAt: previous.generatedAt || null,
    links: Array.isArray(previous.links) ? previous.links : [],
  };
  await checkEntries(pending, mergedEntries, snapshotMeta);

  const deadLinks = buildDeadLinks(mergedEntries);
  saveState({
    generatedAt: new Date().toISOString(),
    links: deadLinks,
    entries: mergedEntries,
  });

  const reportLocation = path.relative(SITE_ROOT, REPORT_PATH);
  console.log(
    `Liens externes analysés: ${entriesArray.length} URL (${deadLinks.length} mort(s)). Données écrites dans ${reportLocation}`
  );
}
// Run the script. An unexpected failure is logged and reported through the
// exit code; process.exitCode is set instead of calling process.exit().
main().catch((error) => {
console.error("Erreur lors de la vérification des liens:", error);
process.exitCode = 1;
});