1

Amélioration de la détection de liens externes morts

This commit is contained in:
2025-10-31 12:41:34 +01:00
parent 7442622c74
commit f8b824c540
7 changed files with 885 additions and 3474 deletions

View File

@@ -1,73 +1,337 @@
const fs = require("fs");
const path = require("path");
const yaml = require("js-yaml");
const { scrapePage } = require("./lib/puppeteer");
const readline = require("readline");
const util = require("util");
const { execFile } = require("child_process");
const UserAgent = require("user-agents");
const {
collectMarkdownLinksFromFile,
extractLinksFromText,
} = require("./lib/markdown_links");
// ---------------------------------------------------------------------------
// Legacy path constants.
// NOTE(review): CONTENT_DIR and CACHE_PATH are declared again further down
// with different values (a duplicate `const` declaration), DATA_DIR is only
// referenced by the old scanner, and CACHE_TTL_DAYS is superseded by the
// success/client-error TTL pair — this looks like merge residue to clean up.
// ---------------------------------------------------------------------------
const CONTENT_DIR = path.join(__dirname, "..", "content");
const DATA_DIR = path.join(__dirname, "..", "data");
const SITE_ROOT = path.resolve(__dirname, "..");
const CACHE_PATH = path.join(DATA_DIR, "external_links.yaml");
const CACHE_TTL_DAYS = 7;
// Optional per-site configuration; a missing or malformed config file is
// non-fatal and falls back to the built-in defaults below.
const CONFIG_PATH = path.join(__dirname, "config.json");
let config = {};
if (fs.existsSync(CONFIG_PATH)) {
  try {
    config = JSON.parse(fs.readFileSync(CONFIG_PATH, "utf8"));
  } catch (error) {
    // Warn and continue with defaults rather than aborting the whole run.
    console.warn(
      `Failed to parse ${path.relative(SITE_ROOT, CONFIG_PATH)}. Using defaults. (${error.message})`
    );
  }
}
// Defaults for the external-link checker; any key can be overridden through
// the `externalLinks` section of config.json (spread last, so it wins).
const externalConfig = {
  cacheDir: path.join(__dirname, "cache"),
  cacheFile: "external_links.yaml",
  hostDelayMs: 2000, // minimum gap between two requests to the same host
  retryDelayMs: 5000, // pause before retrying a failed HEAD as GET
  requestTimeoutSeconds: 5,
  cacheTtlSuccessDays: 7, // how long a successful probe stays cached
  cacheTtlClientErrorDays: 0, // 0 = re-check 4xx results on every run
  outputFormat: "markdown",
  outputFile: path.join(__dirname, "cache", "external_links_report.md"),
  userAgent: null, // null = generate a random realistic UA below
  enableCookies: true,
  cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
  ...(config.externalLinks || {}),
};
// Derived paths and tuning knobs. Relative configured paths are resolved
// against the site root; numeric settings are coerced with safe fallbacks.
// NOTE(review): CONTENT_DIR and CACHE_PATH redeclare the constants near the
// top of the file (merge residue) — confirm and drop the stale pair.
const CONTENT_DIR = path.join(SITE_ROOT, "content");
const CACHE_DIR = path.isAbsolute(externalConfig.cacheDir)
  ? externalConfig.cacheDir
  : path.resolve(SITE_ROOT, externalConfig.cacheDir);
const CACHE_PATH = path.isAbsolute(externalConfig.cacheFile)
  ? externalConfig.cacheFile
  : path.join(CACHE_DIR, externalConfig.cacheFile);
const OUTPUT_FILE = path.isAbsolute(externalConfig.outputFile)
  ? externalConfig.outputFile
  : path.resolve(SITE_ROOT, externalConfig.outputFile);
const COOKIE_JAR = externalConfig.cookieJar
  ? path.isAbsolute(externalConfig.cookieJar)
    ? externalConfig.cookieJar
    : path.resolve(SITE_ROOT, externalConfig.cookieJar)
  : path.join(CACHE_DIR, "curl_cookies.txt");
const CACHE_TTL_SUCCESS_DAYS = Number(externalConfig.cacheTtlSuccessDays) || 0;
const CACHE_TTL_CLIENT_ERROR_DAYS = Number(externalConfig.cacheTtlClientErrorDays) || 0;
const HOST_DELAY_MS = Number(externalConfig.hostDelayMs) || 0;
const RETRY_DELAY_MS = Number(externalConfig.retryDelayMs) || 0;
const REQUEST_TIMEOUT_SECONDS = Number(externalConfig.requestTimeoutSeconds) || 0;
const maxConcurrentConfig = Number(externalConfig.maxConcurrentHosts);
const MAX_CONCURRENT_HOSTS =
  Number.isFinite(maxConcurrentConfig) && maxConcurrentConfig > 0
    ? maxConcurrentConfig
    : 4;
// Use the configured UA verbatim when present, otherwise a random one.
const DEFAULT_USER_AGENT =
  typeof externalConfig.userAgent === "string" && externalConfig.userAgent.trim()
    ? externalConfig.userAgent.trim()
    : new UserAgent().toString();
const ENABLE_COOKIES = externalConfig.enableCookies !== false;
const PROGRESS_FILE = path.join(__dirname, "cache", "external_links_progress.csv");
const execFileAsync = util.promisify(execFile);
// Module-load side effects: make sure the cache dir and cookie jar exist and
// start each run from a clean progress file.
fs.mkdirSync(CACHE_DIR, { recursive: true });
if (ENABLE_COOKIES) {
  fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
  if (!fs.existsSync(COOKIE_JAR)) {
    // "touch" the jar so curl can read it on the first request
    fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
  }
}
try {
  if (fs.existsSync(PROGRESS_FILE)) {
    fs.unlinkSync(PROGRESS_FILE);
  }
} catch (error) {
  console.warn(`Unable to remove existing progress file: ${error.message}`);
}
// Shared mutable state for one run.
let cache = {};
if (fs.existsSync(CACHE_PATH)) {
  cache = yaml.load(fs.readFileSync(CACHE_PATH, "utf8")) || {};
}
let cacheDirty = false; // true when `cache` has unsaved changes
const now = new Date(); // reference time for TTL checks
const BAD_LINKS = []; // accumulated broken-link report rows
const lastHostChecks = new Map(); // host -> timestamp of last request
const runResults = new Map(); // url -> result memo for this run
// True for absolute URLs with an explicit scheme ("://").
// NOTE(review): merge residue — the closing brace of this function was lost
// when updateProgress() got spliced into its body; both functions are
// restored here as separate, complete definitions.
function isExternalLink(link) {
  return typeof link === "string" && link.includes("://");
}
// Render a single-line "URL n/total" progress indicator on stdout.
function updateProgress(processed, total) {
  process.stdout.write(`\rURL ${processed}/${total}`);
}
/**
 * Decide whether a cached probe result is still fresh. The TTL depends on
 * the recorded status: successes (<400) and client errors (4xx) each use
 * their configured TTL; server errors and non-numeric statuses never cache.
 * NOTE(review): removed a stray `return … < CACHE_TTL_DAYS;` line — residue
 * of the previous implementation that made the TTL logic below unreachable.
 */
function isCacheValid(entry) {
  if (!entry?.checked) return false;
  const date = new Date(entry.checked);
  const ttlDays = (() => {
    const status = entry.status;
    if (typeof status === "number") {
      if (status < 400) return CACHE_TTL_SUCCESS_DAYS;
      if (status < 500) return CACHE_TTL_CLIENT_ERROR_DAYS;
      return 0;
    }
    return 0;
  })();
  if (ttlDays <= 0) return false;
  return (now - date) / (1000 * 60 * 60 * 24) < ttlDays;
}
// Pull every http(s) URL out of a line of text; a URL ends at whitespace,
// a quote, ')' or '>'. Returns an empty array when nothing matches.
function extractLinksFromText(text) {
  const matches = text.match(/\bhttps?:\/\/[^\s)"'>]+/g);
  return matches ?? [];
}
/**
 * Collect all external links from one Markdown file and record each
 * (file, line) occurrence into `occurrencesMap`.
 * NOTE(review): the previous-generation checkLink(file, line, url) that was
 * interleaved here (scrapePage-based, with a lost closing brace) was merge
 * residue and has been removed — probing now happens in checkLink(url) below.
 */
async function collectMarkdownLinks(filePath, occurrencesMap) {
  const entries = await collectMarkdownLinksFromFile(filePath);
  for (const { url, line } of entries) {
    recordOccurrence(occurrencesMap, filePath, line, url);
  }
}
/**
 * Register a (file, line) occurrence for a URL in `occurrencesMap`,
 * de-duplicating exact repeats of the same file/line pair.
 * NOTE(review): the truncated streaming processMarkdown() that was spliced
 * in front of this function was residue of the old scanner and has been
 * removed; Markdown collection now goes through collectMarkdownLinks().
 */
function recordOccurrence(occurrencesMap, filePath, lineNumber, url) {
  if (!occurrencesMap.has(url)) {
    occurrencesMap.set(url, { url, occurrences: [] });
  }
  const entry = occurrencesMap.get(url);
  const isDuplicate = entry.occurrences.some(
    (item) => item.file === filePath && item.line === lineNumber
  );
  if (!isDuplicate) {
    entry.occurrences.push({ file: filePath, line: lineNumber });
  }
}
// Promise-based sleep: resolves after roughly `ms` milliseconds.
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Throttle per-host traffic: if the last request to `host` happened less than
// HOST_DELAY_MS ago, sleep for the remaining time. No-op for unknown hosts.
async function applyHostDelay(host) {
  if (!host) return;
  const lastCheck = lastHostChecks.get(host);
  if (!lastCheck) return;
  const remaining = HOST_DELAY_MS - (Date.now() - lastCheck);
  if (remaining > 0) {
    await delay(remaining);
  }
}
// Record the time of the most recent request to `host`; consumed by
// applyHostDelay() to space out per-host traffic. Falsy hosts are ignored.
// NOTE(review): a truncated fragment of the old processYamlRecursively()
// (re-implemented later in this file) sat in front of this function as merge
// residue and has been removed.
function recordHostCheck(host) {
  if (host) {
    lastHostChecks.set(host, Date.now());
  }
}
// Return the hostname of `url`, or null when it is not a parseable URL.
function extractHost(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch (_) {
    return null;
  }
  return parsed.hostname;
}
// Flush the in-memory cache to disk, but only when it has unsaved changes;
// clears the dirty flag afterwards.
function persistCache() {
  if (cacheDirty) {
    ensureDirectoryExists(CACHE_PATH);
    fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
    cacheDirty = false;
  }
}
// Render occurrences as "relative/path:line" pairs joined by "; ".
function formatLocations(occurrences) {
  const parts = [];
  for (const { file, line } of occurrences) {
    parts.push(`${path.relative(SITE_ROOT, file)}:${line}`);
  }
  return parts.join("; ");
}
// Quote a CSV field when it contains a comma, a quote, or a newline;
// embedded quotes are doubled per RFC 4180 conventions.
function escapeCsvField(value) {
  const text = String(value);
  const needsQuoting = /[",\n]/.test(text);
  return needsQuoting ? `"${text.replace(/"/g, '""')}"` : text;
}
/**
 * Append one CSV row (url, locations, status) to the live progress file.
 * Successful numeric statuses (<400) get an empty status cell; failures keep
 * their status code or error label.
 */
function appendProgress(url, occurrences, status) {
  const locationText = formatLocations(occurrences);
  // The old `status !== null` clause was redundant: null can never pass the
  // `typeof status === "number"` check.
  const isSuccess = typeof status === "number" && status < 400;
  const statusText = isSuccess ? "" : status ?? "";
  const line = [
    escapeCsvField(url),
    escapeCsvField(locationText),
    escapeCsvField(statusText),
  ].join(",");
  fs.appendFileSync(PROGRESS_FILE, `${line}\n`);
}
// Bucket link entries by hostname so each host can be probed sequentially;
// entries whose URL cannot be parsed each get their own bucket.
function groupEntriesByHost(entries) {
  const groups = new Map();
  for (const entry of entries) {
    const host = extractHost(entry.url);
    const key = host || `__invalid__:${entry.url}`;
    let group = groups.get(key);
    if (!group) {
      group = { host, entries: [] };
      groups.set(key, group);
    }
    group.entries.push(entry);
  }
  return [...groups.values()];
}
/**
 * Run `worker` over `items`, keeping at most `concurrency` tasks in flight.
 * Results come back in input order; a worker rejection propagates to the
 * caller via the final Promise.all.
 */
async function runWithConcurrency(items, worker, concurrency) {
  const inFlight = new Set();
  const results = [];
  for (const item of items) {
    const task = Promise.resolve().then(() => worker(item));
    results.push(task);
    inFlight.add(task);
    // `.finally` replaces the old `.then(clean).catch(clean)` pair; the
    // trailing catch keeps this bookkeeping chain from surfacing as an
    // unhandled rejection (the error still reaches Promise.all below).
    task.finally(() => inFlight.delete(task)).catch(() => {});
    if (inFlight.size >= concurrency) {
      await Promise.race(inFlight);
    }
  }
  return Promise.all(results);
}
// Issue a single curl request (HEAD or GET) and translate the outcome into
// { status, errorType, method }. `--fail` makes curl exit non-zero on HTTP
// errors, but `--write-out %{http_code}` may still leave the status on
// stdout; curl exit code 28 is reported as a "timeout".
async function curlRequest(url, method) {
  const upperMethod = method.toUpperCase();
  const args = [
    "--silent",
    "--location",
    "--fail",
    "--max-time",
    `${REQUEST_TIMEOUT_SECONDS}`,
    "--output",
    "/dev/null",
    "--write-out",
    "%{http_code}",
    "--user-agent",
    DEFAULT_USER_AGENT,
    "--request",
    method,
    url,
  ];
  if (ENABLE_COOKIES) {
    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
  }
  // Parse curl's "%{http_code}" output; anything non-numeric becomes null.
  const toStatus = (raw) => {
    const trimmed = String(raw ?? "").trim();
    if (!trimmed) return null;
    const parsed = parseInt(trimmed, 10);
    return Number.isNaN(parsed) ? null : parsed;
  };
  try {
    const { stdout } = await execFileAsync("curl", args);
    return { status: toStatus(stdout), errorType: null, method: upperMethod };
  } catch (error) {
    const status = toStatus(error?.stdout?.toString());
    const errorType = Number(error?.code) === 28 ? "timeout" : null;
    return { status, errorType, method: upperMethod };
  }
}
// A HEAD probe is considered inconclusive — and worth retrying as GET — when
// it errored, produced no status, or returned an HTTP error (>= 400).
function shouldRetryWithGet(result) {
  if (result.errorType || result.status === null) return true;
  return result.status >= 400;
}
/**
 * Resolve the status of `url`, using (in order) the per-run memo, the
 * on-disk cache when still valid, and finally a live curl probe — HEAD
 * first, then GET when the HEAD result is inconclusive. Probes are throttled
 * per host. Returns { status, errorType, method, checked }.
 */
async function checkLink(url) {
  let info = runResults.get(url);
  if (info) return info;
  const cachedInfo = cache[url];
  if (isCacheValid(cachedInfo)) {
    // isCacheValid() is false for missing entries, so the old trailing
    // `else { status: null, errorType: "unknown" }` fallback was unreachable
    // and has been removed.
    info = cachedInfo;
  } else {
    const host = extractHost(url);
    if (host) {
      await applyHostDelay(host);
    }
    let result = await curlRequest(url, "HEAD");
    recordHostCheck(host);
    if (shouldRetryWithGet(result)) {
      await delay(RETRY_DELAY_MS);
      if (host) {
        await applyHostDelay(host);
      }
      result = await curlRequest(url, "GET");
      recordHostCheck(host);
    }
    info = {
      status: result.status ?? null,
      errorType: result.errorType || null,
      method: result.method,
      checked: new Date().toISOString(),
    };
    cache[url] = info;
    cacheDirty = true;
    // Persist after every fresh probe so an interrupted run keeps progress.
    persistCache();
  }
  runResults.set(url, info);
  return info;
}
function processYamlRecursively(obj, links = new Set()) {
if (typeof obj === "string") {
for (const link of extractLinksFromText(obj)) {
links.add(link);
}
} else if (Array.isArray(obj)) {
for (const item of obj) processYamlRecursively(item, links);
} else if (typeof obj === "object" && obj !== null) {
@@ -76,15 +340,117 @@ function processYamlRecursively(obj, links = []) {
return links;
}
async function processYaml(filePath) {
/**
 * Strip a trailing `# comment` from a YAML line while respecting quoting:
 * '' escapes a quote inside single-quoted spans, and \ escapes the next
 * character inside double-quoted spans. Returns the line unchanged when no
 * unquoted '#' is found.
 */
function stripYamlInlineComment(line) {
  let quote = null; // currently open quote character, or null
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (quote === "'") {
      if (ch === "'") {
        if (line[i + 1] === "'") {
          i++; // '' escape: stay inside the single-quoted span
        } else {
          quote = null;
        }
      }
    } else if (quote === '"') {
      if (ch === "\\") {
        i++; // skip the escaped character
      } else if (ch === '"') {
        // Fixed: the old `line[i - 1] !== "\\"` guard misread an escaped
        // backslash (`\\"`) as an escaped quote and left the span open.
        quote = null;
      }
    } else if (ch === "'") {
      quote = "'";
    } else if (ch === '"') {
      quote = '"';
    } else if (ch === "#") {
      return line.slice(0, i);
    }
  }
  return line;
}
// True when the line is nothing but a YAML comment (ignoring surrounding
// whitespace).
function isYamlCommentLine(line) {
  const stripped = line.trim();
  return stripped.startsWith("#");
}
// Detect a "key: |" / "key: >" block-scalar opener (with optional chomping
// or indentation indicators) after inline comments are removed.
function isBlockScalarIndicator(line) {
  const cleaned = stripYamlInlineComment(line).trim();
  const blockScalarPattern = /:\s*[>|][0-9+-]*\s*$/;
  return blockScalarPattern.test(cleaned);
}
/**
 * Collect external links from one YAML file and map each URL back to the
 * line number(s) where it appears. Links whose line cannot be located (e.g.
 * folded scalars) are recorded with "?" as the line number.
 * NOTE(review): removed merge residue inside the try block — an old loop
 * that probed each link through the previous checkLink(file, line, url)
 * signature; probing is now driven by the main runner.
 */
async function collectYamlLinks(filePath, occurrencesMap) {
  let linkSet = new Set();
  try {
    const doc = yaml.load(fs.readFileSync(filePath, "utf8"));
    linkSet = processYamlRecursively(doc);
  } catch (e) {
    console.error(`Failed to parse YAML file: ${filePath}`);
    return;
  }
  if (linkSet.size === 0) return;
  const recorded = new Map();
  const rawLines = fs.readFileSync(filePath, "utf8").split(/\r?\n/);
  let inBlockScalar = false;
  let blockIndent = 0;
  // Record a (url, line) pair at most once per file.
  const markRecorded = (url, lineNumber) => {
    if (!recorded.has(url)) {
      recorded.set(url, new Set());
    }
    const lines = recorded.get(url);
    if (lines.has(lineNumber)) return;
    lines.add(lineNumber);
    recordOccurrence(occurrencesMap, filePath, lineNumber, url);
  };
  for (let index = 0; index < rawLines.length; index++) {
    const lineNumber = index + 1;
    const line = rawLines[index];
    const indent = line.match(/^\s*/)?.[0].length ?? 0;
    const trimmed = line.trim();
    if (inBlockScalar) {
      // Inside a block scalar "#" is literal text, so links are extracted
      // from the raw line; a dedent ends the scalar.
      if (trimmed === "" && indent < blockIndent) {
        inBlockScalar = false;
        continue;
      }
      if (trimmed === "" || indent >= blockIndent) {
        if (isYamlCommentLine(line)) {
          continue;
        }
        for (const link of extractLinksFromText(line)) {
          if (linkSet.has(link)) {
            markRecorded(link, lineNumber);
          }
        }
        continue;
      }
      inBlockScalar = false;
    }
    const withoutComment = stripYamlInlineComment(line);
    const trimmedWithoutComment = withoutComment.trim();
    if (isBlockScalarIndicator(line)) {
      inBlockScalar = true;
      blockIndent = indent + 1;
    }
    if (isYamlCommentLine(line)) continue;
    if (!trimmedWithoutComment) continue;
    for (const link of extractLinksFromText(withoutComment)) {
      if (linkSet.has(link)) {
        markRecorded(link, lineNumber);
      }
    }
  }
  // Fallback for links the line scan could not pin to a specific line.
  for (const link of linkSet) {
    if (!recorded.has(link) || recorded.get(link).size === 0) {
      recordOccurrence(occurrencesMap, filePath, "?", link);
    }
  }
}
@@ -103,24 +469,155 @@ function walk(dir, exts) {
return results;
}
(async () => {
const mdFiles = walk(CONTENT_DIR, [".md"]);
const yamlFiles = walk(DATA_DIR, [".yaml", ".yml"]);
console.log(`Scanning ${mdFiles.length} Markdown and ${yamlFiles.length} YAML files...`);
// Create the parent directory of `targetFile` (and any missing ancestors).
function ensureDirectoryExists(targetFile) {
  const parentDir = path.dirname(targetFile);
  fs.mkdirSync(parentDir, { recursive: true });
}
// Make a value safe inside a Markdown table cell: escape pipes and flatten
// newlines into spaces.
function escapeMarkdownCell(value) {
  const text = String(value);
  return text.replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}
// Build the Markdown report: a timestamped header followed by either a
// "no broken links" note or a url/location/status table.
function generateMarkdownReport(entries) {
  const lines = [
    "# Broken External Links",
    "",
    `Generated: ${new Date().toISOString()}`,
    "",
  ];
  if (entries.length === 0) {
    lines.push("No broken external links found.");
    return lines.join("\n");
  }
  lines.push("| URL | Location | Status |", "| --- | --- | --- |");
  for (const entry of entries) {
    const url = escapeMarkdownCell(entry.url);
    const location = escapeMarkdownCell(entry.location);
    const status = escapeMarkdownCell(entry.status);
    lines.push(`| ${url} | ${location} | ${status} |`);
  }
  return lines.join("\n");
}
// Build the CSV report with a fixed header row; every field is quoted and
// embedded quotes are doubled.
function generateCsvReport(entries) {
  const quoteField = (field) => `"${String(field).replace(/"/g, '""')}"`;
  const rows = entries.map((entry) =>
    [entry.url, entry.location, entry.status].map(quoteField).join(",")
  );
  return [`"url","location","status"`, ...rows].join("\n");
}
// Serialize entries in the configured format ("csv" or Markdown, the
// default) and write the report to OUTPUT_FILE, creating parent directories
// as needed.
function writeReport(entries) {
  const format = String(externalConfig.outputFormat || "markdown").toLowerCase();
  const isCsv = format === "csv";
  const content = isCsv ? generateCsvReport(entries) : generateMarkdownReport(entries);
  ensureDirectoryExists(OUTPUT_FILE);
  fs.writeFileSync(OUTPUT_FILE, content, "utf8");
}
// Entry point: collect every external link from Markdown and YAML content,
// probe each one (grouped by host, throttled, capped concurrency), then
// write the broken-link report.
(async () => {
  const occurrencesByUrl = new Map();
  const mdFiles = walk(CONTENT_DIR, [".md", ".markdown"]);
  const yamlFiles = walk(CONTENT_DIR, [".yaml", ".yml"]);
  for (const file of mdFiles) {
    await collectMarkdownLinks(file, occurrencesByUrl);
  }
  for (const file of yamlFiles) {
    await collectYamlLinks(file, occurrencesByUrl);
  }
  const uniqueEntries = Array.from(occurrencesByUrl.values());
  // Drop cache entries for URLs that no longer appear in the content.
  const activeUrls = new Set(uniqueEntries.map((entry) => entry.url));
  let cachePruned = false;
  for (const url of Object.keys(cache)) {
    if (!activeUrls.has(url)) {
      delete cache[url];
      cachePruned = true;
    }
  }
  if (cachePruned) {
    cacheDirty = true;
  }
  ensureDirectoryExists(PROGRESS_FILE);
  fs.writeFileSync(PROGRESS_FILE, `"url","locations","status"\n`, "utf8");
  const total = uniqueEntries.length;
  if (total === 0) {
    process.stdout.write("No external links found.\n");
    ensureDirectoryExists(CACHE_PATH);
    fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
    writeReport([]);
    return;
  }
  const hostGroups = groupEntriesByHost(uniqueEntries);
  const concurrency = Math.max(1, Math.min(MAX_CONCURRENT_HOSTS, hostGroups.length || 1));
  let processed = 0;
  // Hosts are probed in parallel; links within one host run sequentially so
  // the per-host delay is honored.
  await runWithConcurrency(
    hostGroups,
    async ({ entries }) => {
      for (const entry of entries) {
        const info = await checkLink(entry.url);
        const status = typeof info?.status === "number" ? info.status : null;
        const errorType = info?.errorType || null;
        const hasHttpError = status !== null && status >= 400;
        const isTimeout = errorType === "timeout";
        const statusLabel = isTimeout ? "timeout" : status ?? "error";
        const isBroken = status === null || hasHttpError || isTimeout;
        if (isBroken) {
          BAD_LINKS.push({
            location: formatLocations(entry.occurrences),
            url: entry.url,
            status: statusLabel,
          });
        }
        appendProgress(entry.url, entry.occurrences, isBroken ? statusLabel : status);
        processed += 1;
        updateProgress(processed, total);
      }
    },
    concurrency
  );
  process.stdout.write("\n");
  ensureDirectoryExists(CACHE_PATH);
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
  // NOTE(review): merge residue from the old console.table reporting was
  // interleaved here (it routed `writeReport([])` and the success message
  // into the non-empty branch, leaving the sorted report unreachable); the
  // two paths below restore the intended flow.
  if (BAD_LINKS.length === 0) {
    writeReport([]);
    console.log(
      `No broken external links detected. Report saved to ${path.relative(
        SITE_ROOT,
        OUTPUT_FILE
      )}.`
    );
    return;
  }
  // Order the report: 404s first, then other failures, then timeouts;
  // ties break by numeric status, then URL.
  const sorted = BAD_LINKS.sort((a, b) => {
    const rank = (entry) => {
      if (entry.status === "timeout") return 2;
      if (typeof entry.status === "number") {
        return entry.status === 404 ? 0 : 1;
      }
      return 1;
    };
    const diff = rank(a) - rank(b);
    if (diff !== 0) return diff;
    if (typeof a.status === "number" && typeof b.status === "number") {
      return a.status - b.status;
    }
    return a.url.localeCompare(b.url);
  });
  writeReport(sorted);
  console.log(
    `Found ${sorted.length} broken external link(s). Report saved to ${path.relative(
      SITE_ROOT,
      OUTPUT_FILE
    )}.`
  );
})();