Improved detection of dead external links
File diff suppressed because it is too large
@@ -1,73 +1,337 @@
const fs = require("fs");
const path = require("path");
const yaml = require("js-yaml");
const { scrapePage } = require("./lib/puppeteer");
const readline = require("readline");
const util = require("util");
const { execFile } = require("child_process");
const UserAgent = require("user-agents");
const {
  collectMarkdownLinksFromFile,
  extractLinksFromText,
} = require("./lib/markdown_links");

const CONTENT_DIR = path.join(__dirname, "..", "content");
const DATA_DIR = path.join(__dirname, "..", "data");
const SITE_ROOT = path.resolve(__dirname, "..");
const CACHE_PATH = path.join(DATA_DIR, "external_links.yaml");
const CACHE_TTL_DAYS = 7;
const CONFIG_PATH = path.join(__dirname, "config.json");

let config = {};
if (fs.existsSync(CONFIG_PATH)) {
  try {
    config = JSON.parse(fs.readFileSync(CONFIG_PATH, "utf8"));
  } catch (error) {
    console.warn(
      `Failed to parse ${path.relative(SITE_ROOT, CONFIG_PATH)}. Using defaults. (${error.message})`
    );
  }
}

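// Built-in defaults for the external link checker; any values set under the
// "externalLinks" key of config.json override them via the spread below.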
const externalConfig = {
  cacheDir: path.join(__dirname, "cache"),
  cacheFile: "external_links.yaml",
  hostDelayMs: 2000,
  retryDelayMs: 5000,
  requestTimeoutSeconds: 5,
  cacheTtlSuccessDays: 7,
  cacheTtlClientErrorDays: 0,
  outputFormat: "markdown",
  outputFile: path.join(__dirname, "cache", "external_links_report.md"),
  userAgent: null,
  enableCookies: true,
  cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
  ...(config.externalLinks || {}),
};

const CONTENT_DIR = path.join(SITE_ROOT, "content");
const CACHE_DIR = path.isAbsolute(externalConfig.cacheDir)
  ? externalConfig.cacheDir
  : path.resolve(SITE_ROOT, externalConfig.cacheDir);
const CACHE_PATH = path.isAbsolute(externalConfig.cacheFile)
  ? externalConfig.cacheFile
  : path.join(CACHE_DIR, externalConfig.cacheFile);
const OUTPUT_FILE = path.isAbsolute(externalConfig.outputFile)
  ? externalConfig.outputFile
  : path.resolve(SITE_ROOT, externalConfig.outputFile);
const COOKIE_JAR = externalConfig.cookieJar
  ? path.isAbsolute(externalConfig.cookieJar)
    ? externalConfig.cookieJar
    : path.resolve(SITE_ROOT, externalConfig.cookieJar)
  : path.join(CACHE_DIR, "curl_cookies.txt");

const CACHE_TTL_SUCCESS_DAYS = Number(externalConfig.cacheTtlSuccessDays) || 0;
const CACHE_TTL_CLIENT_ERROR_DAYS = Number(externalConfig.cacheTtlClientErrorDays) || 0;
const HOST_DELAY_MS = Number(externalConfig.hostDelayMs) || 0;
const RETRY_DELAY_MS = Number(externalConfig.retryDelayMs) || 0;
const REQUEST_TIMEOUT_SECONDS = Number(externalConfig.requestTimeoutSeconds) || 0;
const maxConcurrentConfig = Number(externalConfig.maxConcurrentHosts);
const MAX_CONCURRENT_HOSTS =
  Number.isFinite(maxConcurrentConfig) && maxConcurrentConfig > 0
    ? maxConcurrentConfig
    : 4;
const DEFAULT_USER_AGENT =
  typeof externalConfig.userAgent === "string" && externalConfig.userAgent.trim()
    ? externalConfig.userAgent.trim()
    : new UserAgent().toString();
const ENABLE_COOKIES = externalConfig.enableCookies !== false;
const PROGRESS_FILE = path.join(__dirname, "cache", "external_links_progress.csv");
const execFileAsync = util.promisify(execFile);

fs.mkdirSync(CACHE_DIR, { recursive: true });
if (ENABLE_COOKIES) {
  fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
  if (!fs.existsSync(COOKIE_JAR)) {
    fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
  }
}

try {
  if (fs.existsSync(PROGRESS_FILE)) {
    fs.unlinkSync(PROGRESS_FILE);
  }
} catch (error) {
  console.warn(`Unable to remove existing progress file: ${error.message}`);
}

let cache = {};
if (fs.existsSync(CACHE_PATH)) {
  cache = yaml.load(fs.readFileSync(CACHE_PATH, "utf8")) || {};
}
let cacheDirty = false;

const now = new Date();
const BAD_LINKS = [];
const lastHostChecks = new Map();
const runResults = new Map();

function isExternalLink(link) {
  return typeof link === "string" && link.includes("://");
function updateProgress(processed, total) {
  process.stdout.write(`\rURL ${processed}/${total}`);
}

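// Cache validity now depends on the recorded HTTP status: successes (< 400) and
// client errors (4xx) each have their own TTL, while anything else (5xx,
// missing or non-numeric status) is always re-checked.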
function isCacheValid(entry) {
  if (!entry?.checked) return false;
  const date = new Date(entry.checked);
  return (now - date) / (1000 * 60 * 60 * 24) < CACHE_TTL_DAYS;
  const ttlDays = (() => {
    const status = entry.status;
    if (typeof status === "number") {
      if (status < 400) return CACHE_TTL_SUCCESS_DAYS;
      if (status < 500) return CACHE_TTL_CLIENT_ERROR_DAYS;
      return 0;
    }
    return 0;
  })();
  if (ttlDays <= 0) return false;
  return (now - date) / (1000 * 60 * 60 * 24) < ttlDays;
}

function extractLinksFromText(text) {
  const regex = /\bhttps?:\/\/[^\s)"'>]+/g;
  return text.match(regex) || [];
}

async function checkLink(file, line, url) {
  if (isCacheValid(cache[url])) return;

  const meta = await scrapePage(url, null, { screenshot: false });
  cache[url] = {
    status: meta.httpStatus || null,
    checked: new Date().toISOString(),
  };

  const bundle = path.relative(SITE_ROOT, file);

  if (!meta.httpStatus || meta.httpStatus >= 400) {
    BAD_LINKS.push({ bundle, url, line, status: meta.httpStatus });
    process.stdout.write("❌");
  } else {
    process.stdout.write("✔");
async function collectMarkdownLinks(filePath, occurrencesMap) {
  const entries = await collectMarkdownLinksFromFile(filePath);
  for (const { url, line } of entries) {
    recordOccurrence(occurrencesMap, filePath, line, url);
  }
}

async function processMarkdown(filePath) {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber++;
    const links = extractLinksFromText(line);
    for (const link of links) {
      await checkLink(filePath, lineNumber, link);
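
// Registers one file:line occurrence of a URL, skipping exact duplicates.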
function recordOccurrence(occurrencesMap, filePath, lineNumber, url) {
  if (!occurrencesMap.has(url)) {
    occurrencesMap.set(url, { url, occurrences: [] });
  }
  const entry = occurrencesMap.get(url);
  const alreadyRecorded = entry.occurrences.some(
    (item) => item.file === filePath && item.line === lineNumber
  );
  if (!alreadyRecorded) {
    entry.occurrences.push({ file: filePath, line: lineNumber });
  }
}

function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

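// Per-host politeness delay: waits until HOST_DELAY_MS has elapsed since the
// previous request to the same hostname.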
async function applyHostDelay(host) {
  if (!host) return;
  const last = lastHostChecks.get(host);
  if (last) {
    const elapsed = Date.now() - last;
    const waitTime = HOST_DELAY_MS - elapsed;
    if (waitTime > 0) {
      await delay(waitTime);
    }
  }
}

function processYamlRecursively(obj, links = []) {
  if (typeof obj === "string" && isExternalLink(obj)) {
    links.push(obj);
function recordHostCheck(host) {
  if (host) {
    lastHostChecks.set(host, Date.now());
  }
}

function extractHost(url) {
  try {
    return new URL(url).hostname;
  } catch (_) {
    return null;
  }
}

function persistCache() {
  if (!cacheDirty) return;
  ensureDirectoryExists(CACHE_PATH);
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
  cacheDirty = false;
}

function formatLocations(occurrences) {
  return occurrences
    .map(({ file, line }) => `${path.relative(SITE_ROOT, file)}:${line}`)
    .join("; ");
}

function escapeCsvField(value) {
  const stringValue = String(value);
  if (/[",\n]/.test(stringValue)) {
    return `"${stringValue.replace(/"/g, '""')}"`;
  }
  return stringValue;
}

function appendProgress(url, occurrences, status) {
  const locationText = formatLocations(occurrences);
  const statusText =
    typeof status === "number" && status < 400 && status !== null ? "" : status ?? "";
  const line = [
    escapeCsvField(url),
    escapeCsvField(locationText),
    escapeCsvField(statusText),
  ].join(",");
  fs.appendFileSync(PROGRESS_FILE, `${line}\n`);
}

function groupEntriesByHost(entries) {
  const result = new Map();
  for (const entry of entries) {
    const host = extractHost(entry.url);
    const key = host || `__invalid__:${entry.url}`;
    if (!result.has(key)) {
      result.set(key, { host, entries: [] });
    }
    result.get(key).entries.push(entry);
  }
  return Array.from(result.values());
}

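// Small concurrency limiter: schedules one promise per item but never keeps
// more than `concurrency` of them in flight at once.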
async function runWithConcurrency(items, worker, concurrency) {
  const executing = new Set();
  const promises = [];
  for (const item of items) {
    const promise = Promise.resolve().then(() => worker(item));
    promises.push(promise);
    executing.add(promise);
    const clean = () => executing.delete(promise);
    promise.then(clean).catch(clean);
    if (executing.size >= concurrency) {
      await Promise.race(executing);
    }
  }
  return Promise.all(promises);
}

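// Issues a single curl request (HEAD or GET). Thanks to --write-out, the HTTP
// code is available on stdout even when --fail makes curl exit non-zero; curl
// exit code 28 is mapped to a "timeout" error type.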
async function curlRequest(url, method) {
  const args = [
    "--silent",
    "--location",
    "--fail",
    "--max-time",
    `${REQUEST_TIMEOUT_SECONDS}`,
    "--output",
    "/dev/null",
    "--write-out",
    "%{http_code}",
    "--user-agent",
    DEFAULT_USER_AGENT,
    "--request",
    method,
    url,
  ];

  if (ENABLE_COOKIES) {
    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
  }

  try {
    const { stdout } = await execFileAsync("curl", args);
    const status = parseInt(stdout.trim(), 10);
    return {
      status: Number.isNaN(status) ? null : status,
      errorType: null,
      method: method.toUpperCase(),
    };
  } catch (error) {
    const rawStatus = error?.stdout?.toString().trim();
    const status = rawStatus ? parseInt(rawStatus, 10) : null;
    const errorCode = Number(error?.code);
    const errorType = errorCode === 28 ? "timeout" : null;
    return {
      status: Number.isNaN(status) ? null : status,
      errorType,
      method: method.toUpperCase(),
    };
  }
}

function shouldRetryWithGet(result) {
  if (result.errorType) return true;
  if (result.status === null) return true;
  return result.status >= 400;
}

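// Resolves a URL's status: first from this run's memoized results, then from
// the YAML cache, and finally via a live HEAD request with a GET fallback when
// the HEAD attempt fails or returns an error status.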
async function checkLink(url) {
  let info = runResults.get(url);
  if (!info) {
    const cachedInfo = cache[url];
    if (!isCacheValid(cachedInfo)) {
      const host = extractHost(url);
      if (host) {
        await applyHostDelay(host);
      }

      let result = await curlRequest(url, "HEAD");
      recordHostCheck(host);

      if (shouldRetryWithGet(result)) {
        await delay(RETRY_DELAY_MS);
        if (host) {
          await applyHostDelay(host);
        }
        result = await curlRequest(url, "GET");
        recordHostCheck(host);
      }

      info = {
        status: result.status ?? null,
        errorType: result.errorType || null,
        method: result.method,
        checked: new Date().toISOString(),
      };
      cache[url] = info;
      cacheDirty = true;
      persistCache();
    } else if (cachedInfo) {
      info = cachedInfo;
    } else {
      info = {
        status: null,
        errorType: "unknown",
        method: "HEAD",
        checked: new Date().toISOString(),
      };
    }
    runResults.set(url, info);
  }
  return info;
}

function processYamlRecursively(obj, links = new Set()) {
  if (typeof obj === "string") {
    for (const link of extractLinksFromText(obj)) {
      links.add(link);
    }
  } else if (Array.isArray(obj)) {
    for (const item of obj) processYamlRecursively(item, links);
  } else if (typeof obj === "object" && obj !== null) {
@@ -76,15 +340,117 @@ function processYamlRecursively(obj, links = []) {
  return links;
}

async function processYaml(filePath) {
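
// Removes a trailing "# comment" from a YAML line while honouring single- and
// double-quote rules, so "#" characters inside quoted strings are preserved.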
function stripYamlInlineComment(line) {
  let inSingle = false;
  let inDouble = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (ch === "'" && !inDouble) {
      const next = line[i + 1];
      if (inSingle && next === "'") {
        i++;
        continue;
      }
      inSingle = !inSingle;
    } else if (ch === '"' && !inSingle) {
      if (!inDouble) {
        inDouble = true;
      } else if (line[i - 1] !== "\\") {
        inDouble = false;
      }
    } else if (ch === "#" && !inSingle && !inDouble) {
      return line.slice(0, i);
    } else if (ch === "\\" && inDouble) {
      i++;
    }
  }
  return line;
}

function isYamlCommentLine(line) {
  return line.trim().startsWith("#");
}

function isBlockScalarIndicator(line) {
  const cleaned = stripYamlInlineComment(line).trim();
  return /:\s*[>|][0-9+-]*\s*$/.test(cleaned);
}

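// Collects external links from a parsed YAML document, then re-scans the raw
// lines (skipping comments and tracking block scalars) to attribute each link
// to a line number; links that cannot be located fall back to line "?".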
async function collectYamlLinks(filePath, occurrencesMap) {
  let linkSet = new Set();
  try {
    const doc = yaml.load(fs.readFileSync(filePath, "utf8"));
    const links = processYamlRecursively(doc);
    for (const link of links) {
      await checkLink(filePath, "?", link);
    }
    linkSet = processYamlRecursively(doc);
  } catch (e) {
    console.error(`Failed to parse YAML file: ${filePath}`);
    return;
  }

  if (linkSet.size === 0) return;

  const recorded = new Map();
  const rawLines = fs.readFileSync(filePath, "utf8").split(/\r?\n/);
  let inBlockScalar = false;
  let blockIndent = 0;

  const markRecorded = (url, lineNumber) => {
    if (!recorded.has(url)) {
      recorded.set(url, new Set());
    }
    const lines = recorded.get(url);
    if (lines.has(lineNumber)) return;
    lines.add(lineNumber);
    recordOccurrence(occurrencesMap, filePath, lineNumber, url);
  };

  for (let index = 0; index < rawLines.length; index++) {
    const lineNumber = index + 1;
    const line = rawLines[index];
    const indent = line.match(/^\s*/)?.[0].length ?? 0;
    const trimmed = line.trim();

    if (inBlockScalar) {
      if (trimmed === "" && indent < blockIndent) {
        inBlockScalar = false;
        continue;
      }
      if (trimmed === "" || indent >= blockIndent) {
        if (isYamlCommentLine(line)) {
          continue;
        }
        for (const link of extractLinksFromText(line)) {
          if (linkSet.has(link)) {
            markRecorded(link, lineNumber);
          }
        }
        continue;
      }
      inBlockScalar = false;
    }

    const withoutComment = stripYamlInlineComment(line);
    const trimmedWithoutComment = withoutComment.trim();

    if (isBlockScalarIndicator(line)) {
      inBlockScalar = true;
      blockIndent = indent + 1;
    }

    if (isYamlCommentLine(line)) continue;

    if (!trimmedWithoutComment) continue;

    for (const link of extractLinksFromText(withoutComment)) {
      if (linkSet.has(link)) {
        markRecorded(link, lineNumber);
      }
    }
  }

  for (const link of linkSet) {
    if (!recorded.has(link) || recorded.get(link).size === 0) {
      recordOccurrence(occurrencesMap, filePath, "?", link);
    }
  }
}

@@ -103,24 +469,155 @@ function walk(dir, exts) {
  return results;
}

(async () => {
  const mdFiles = walk(CONTENT_DIR, [".md"]);
  const yamlFiles = walk(DATA_DIR, [".yaml", ".yml"]);
  console.log(`Scanning ${mdFiles.length} Markdown and ${yamlFiles.length} YAML files...`);
function ensureDirectoryExists(targetFile) {
  fs.mkdirSync(path.dirname(targetFile), { recursive: true });
}

function escapeMarkdownCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

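// Renders the broken-link entries as a Markdown table (URL / Location / Status).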
function generateMarkdownReport(entries) {
  const header = [
    "# Broken External Links",
    "",
    `Generated: ${new Date().toISOString()}`,
    "",
  ];
  if (entries.length === 0) {
    return header.concat(["No broken external links found."]).join("\n");
  }
  const rows = entries.map((entry) => {
    const url = escapeMarkdownCell(entry.url);
    const location = escapeMarkdownCell(entry.location);
    const status = escapeMarkdownCell(entry.status);
    return `| ${url} | ${location} | ${status} |`;
  });
  return header
    .concat(["| URL | Location | Status |", "| --- | --- | --- |", ...rows])
    .join("\n");
}

function generateCsvReport(entries) {
  const lines = [`"url","location","status"`];
  for (const entry of entries) {
    const line = [entry.url, entry.location, entry.status]
      .map((field) => `"${String(field).replace(/"/g, '""')}"`)
      .join(",");
    lines.push(line);
  }
  return lines.join("\n");
}

function writeReport(entries) {
  const format = String(externalConfig.outputFormat || "markdown").toLowerCase();
  const content =
    format === "csv" ? generateCsvReport(entries) : generateMarkdownReport(entries);
  ensureDirectoryExists(OUTPUT_FILE);
  fs.writeFileSync(OUTPUT_FILE, content, "utf8");
}

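// Entry point: collect link occurrences from Markdown and YAML content, prune
// cache entries for URLs no longer referenced, check each unique URL grouped by
// host under a concurrency cap, then write the progress CSV and final report.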
(async () => {
  const occurrencesByUrl = new Map();
  const mdFiles = walk(CONTENT_DIR, [".md", ".markdown"]);
  const yamlFiles = walk(CONTENT_DIR, [".yaml", ".yml"]);
  for (const file of mdFiles) {
    await processMarkdown(file);
    await collectMarkdownLinks(file, occurrencesByUrl);
  }
  for (const file of yamlFiles) {
    await processYaml(file);
    await collectYamlLinks(file, occurrencesByUrl);
  }

  const uniqueEntries = Array.from(occurrencesByUrl.values());
  const activeUrls = new Set(uniqueEntries.map((entry) => entry.url));
  let cachePruned = false;
  for (const url of Object.keys(cache)) {
    if (!activeUrls.has(url)) {
      delete cache[url];
      cachePruned = true;
    }
  }
  if (cachePruned) {
    cacheDirty = true;
  }
  ensureDirectoryExists(PROGRESS_FILE);
  fs.writeFileSync(PROGRESS_FILE, `"url","locations","status"\n`, "utf8");

  const total = uniqueEntries.length;
  if (total === 0) {
    process.stdout.write("No external links found.\n");
    ensureDirectoryExists(CACHE_PATH);
    fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
    writeReport([]);
    return;
  }

  const hostGroups = groupEntriesByHost(uniqueEntries);
  const concurrency = Math.max(1, Math.min(MAX_CONCURRENT_HOSTS, hostGroups.length || 1));
  let processed = 0;
  await runWithConcurrency(
    hostGroups,
    async ({ entries }) => {
      for (const entry of entries) {
        const info = await checkLink(entry.url);
        const status = typeof info?.status === "number" ? info.status : null;
        const errorType = info?.errorType || null;
        const hasHttpError = status !== null && status >= 400;
        const isTimeout = errorType === "timeout";
        const statusLabel = isTimeout ? "timeout" : status ?? "error";

        if (status === null || hasHttpError || isTimeout) {
          BAD_LINKS.push({
            location: formatLocations(entry.occurrences),
            url: entry.url,
            status: statusLabel,
          });
        }

        appendProgress(entry.url, entry.occurrences, hasHttpError || isTimeout || status === null ? statusLabel : status);
        processed += 1;
        updateProgress(processed, total);
      }
    },
    concurrency
  );
  process.stdout.write("\n");

  ensureDirectoryExists(CACHE_PATH);
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));

  console.log("\n\n=== Broken External Links Report ===");
  if (BAD_LINKS.length === 0) {
    console.log("✅ No broken external links found.");
  } else {
    console.table(BAD_LINKS);
    writeReport([]);
    console.log(
      `No broken external links detected. Report saved to ${path.relative(
        SITE_ROOT,
        OUTPUT_FILE
      )}.`
    );
    return;
  }

  const sorted = BAD_LINKS.sort((a, b) => {
    const rank = (entry) => {
      if (entry.status === "timeout") return 2;
      if (typeof entry.status === "number") {
        return entry.status === 404 ? 0 : 1;
      }
      return 1;
    };
    const diff = rank(a) - rank(b);
    if (diff !== 0) return diff;
    if (typeof a.status === "number" && typeof b.status === "number") {
      return a.status - b.status;
    }
    return a.url.localeCompare(b.url);
  });

  writeReport(sorted);
  console.log(
    `Found ${sorted.length} broken external link(s). Report saved to ${path.relative(
      SITE_ROOT,
      OUTPUT_FILE
    )}.`
  );
})();

@@ -1,5 +1,19 @@
{
  "rebrickable": {
    "apiKey": ""
  },
  "externalLinks": {
    "cacheDir": "tools/cache",
    "cacheFile": "external_links.yaml",
    "hostDelayMs": 2000,
    "retryDelayMs": 5000,
    "requestTimeoutSeconds": 5,
    "cacheTtlSuccessDays": 7,
    "cacheTtlClientErrorDays": 0,
    "outputFormat": "markdown",
    "outputFile": "tools/cache/external_links_report.md",
    "userAgent": null,
    "enableCookies": true,
    "cookieJar": "tools/cache/curl_cookies.txt"
  }
}
}

246
tools/lib/markdown_links.js
Normal file
@@ -0,0 +1,246 @@
const fs = require("fs");
const readline = require("readline");

function trimUnbalancedTrailing(value, openChar, closeChar) {
  let result = value;
  while (result.endsWith(closeChar)) {
    const openCount = (result.match(new RegExp(`\\${openChar}`, "g")) || []).length;
    const closeCount = (result.match(new RegExp(`\\${closeChar}`, "g")) || []).length;
    if (closeCount > openCount) {
      result = result.slice(0, -1);
    } else {
      break;
    }
  }
  return result;
}

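// Normalizes a raw URL candidate: strips surrounding angle brackets, trailing
// punctuation, unbalanced closing brackets and Markdown footnote markers, and
// rejects candidates whose brackets remain unbalanced after cleanup.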
function sanitizeUrlCandidate(raw, options = {}) {
  if (typeof raw !== "string") return null;
  let candidate = raw.trim();
  if (!candidate) return null;

  if (candidate.startsWith("<") && candidate.endsWith(">")) {
    candidate = candidate.slice(1, -1).trim();
  }

  while (/[.,;:!?'"\u2018\u2019\u201C\u201D]+$/.test(candidate)) {
    candidate = candidate.slice(0, -1);
  }

  if (!options.keepTrailingParens) {
    candidate = trimUnbalancedTrailing(candidate, "(", ")");
  } else if (candidate.endsWith(")")) {
    const openCount = (candidate.match(/\(/g) || []).length;
    const closeCount = (candidate.match(/\)/g) || []).length;
    if (closeCount > openCount) {
      candidate = trimUnbalancedTrailing(candidate, "(", ")");
    }
  }
  candidate = trimUnbalancedTrailing(candidate, "[", "]");
  candidate = trimUnbalancedTrailing(candidate, "{", "}");

  candidate = candidate.replace(/[*_]+$/, "");
  candidate = candidate.replace(/\[\^[^\]]*\]$/, "");
  if (!options.keepTrailingParens) {
    candidate = trimUnbalancedTrailing(candidate, "(", ")");
  }

  if ((candidate.match(/\(/g) || []).length > (candidate.match(/\)/g) || []).length) {
    return null;
  }
  if ((candidate.match(/\[/g) || []).length > (candidate.match(/]/g) || []).length) {
    return null;
  }
  if ((candidate.match(/{/g) || []).length > (candidate.match(/}/g) || []).length) {
    return null;
  }

  return candidate || null;
}

function findMatchingPair(text, startIndex, openChar, closeChar) {
  let depth = 0;
  for (let i = startIndex; i < text.length; i++) {
    const ch = text[i];
    if (ch === "\\") {
      i++;
      continue;
    }
    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}

function parseLinkDestination(raw) {
  if (typeof raw !== "string") return null;
  let candidate = raw.trim();
  if (!candidate) return null;

  if (candidate.startsWith("<")) {
    const closeIndex = candidate.indexOf(">");
    if (closeIndex > 0) {
      return candidate.slice(1, closeIndex).trim();
    }
  }

  let result = "";
  let escaping = false;
  let parenDepth = 0;
  for (let i = 0; i < candidate.length; i++) {
    const ch = candidate[i];
    if (escaping) {
      result += ch;
      escaping = false;
      continue;
    }
    if (ch === "\\") {
      escaping = true;
      continue;
    }
    if (ch === "(") {
      parenDepth++;
    } else if (ch === ")" && parenDepth > 0) {
      parenDepth--;
    } else if (/\s/.test(ch) && parenDepth === 0) {
      break;
    }
    result += ch;
  }
  return result;
}

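// Scans a line for [text](destination) and ![alt](destination) constructs,
// using bracket matching so nested parentheses inside destinations survive.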
function extractMarkdownDestinations(text) {
  const urls = [];
  for (let i = 0; i < text.length; i++) {
    if (text[i] === "!") {
      if (text[i + 1] !== "[") continue;
      i += 1;
    }
    if (text[i] !== "[") continue;

    const closeBracket = findMatchingPair(text, i, "[", "]");
    if (closeBracket === -1) continue;

    let pointer = closeBracket + 1;
    while (pointer < text.length && /\s/.test(text[pointer])) pointer++;
    if (pointer >= text.length || text[pointer] !== "(") {
      i = closeBracket;
      continue;
    }

    const openParen = pointer;
    const closeParen = findMatchingPair(text, openParen, "(", ")");
    if (closeParen === -1) {
      break;
    }

    const rawDestination = text.slice(openParen + 1, closeParen);
    const candidate = parseLinkDestination(rawDestination);
    if (candidate) {
      urls.push(candidate);
    }
    i = closeParen;
  }
  return urls;
}

function isExternalLink(link) {
  return typeof link === "string" && link.includes("://");
}

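// Extracts external URLs from a line of text in three passes (Markdown
// destinations, <autolinks>, then bare http(s) URLs), sanitizing each
// candidate and deduplicating in order of first appearance.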
function extractLinksFromText(text) {
  if (typeof text !== "string" || !text.includes("http")) {
    return [];
  }

  const results = [];
  const seen = new Set();

  function addCandidate(candidate, options = {}) {
    const sanitized = sanitizeUrlCandidate(candidate, options);
    if (!sanitized) return;
    if (!isExternalLink(sanitized)) return;
    if (seen.has(sanitized)) return;
    seen.add(sanitized);
    results.push(sanitized);
  }

  for (const url of extractMarkdownDestinations(text)) {
    addCandidate(url, { keepTrailingParens: true });
  }

  const angleRegex = /<\s*(https?:\/\/[^>\s]+)\s*>/gi;
  let match;
  while ((match = angleRegex.exec(text)) !== null) {
    addCandidate(match[1]);
  }

  const autoRegex = /https?:\/\/[^\s<>"`]+/gi;
  while ((match = autoRegex.exec(text)) !== null) {
    addCandidate(match[0]);
  }

  return results;
}

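// Reads a stream line by line, skipping commented-out lines inside YAML front
// matter, and returns { url, line } records for every external link found.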
async function collectMarkdownLinksFromStream(stream) {
  const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
  const results = [];
  let lineNumber = 0;
  let inFrontMatter = false;
  try {
    for await (const line of rl) {
      lineNumber++;
      const trimmed = line.trim();

      if (lineNumber === 1 && trimmed === "---") {
        inFrontMatter = true;
        continue;
      }
      if (inFrontMatter) {
        if (trimmed === "---") {
          inFrontMatter = false;
          continue;
        }
        if (trimmed.startsWith("#")) {
          continue;
        }
      }

      for (const url of extractLinksFromText(line)) {
        results.push({ url, line: lineNumber });
      }
    }
  } finally {
    rl.close();
    if (typeof stream.close === "function") {
      stream.close();
    }
  }
  return results;
}

async function collectMarkdownLinksFromFile(filePath) {
  const stream = fs.createReadStream(filePath, { encoding: "utf8" });
  try {
    return await collectMarkdownLinksFromStream(stream);
  } catch (error) {
    stream.destroy();
    throw error;
  }
}

module.exports = {
  collectMarkdownLinksFromFile,
  collectMarkdownLinksFromStream,
  extractLinksFromText,
  sanitizeUrlCandidate,
};
@@ -1,4 +1,4 @@
const { getArchiveUrl, saveToArchive } = require("./lib/archive");
const { getArchiveUrl, saveToArchive } = require("../lib/archive");

(async () => {
  const testUrl = "https://richard-dern.fr";
68
tools/tests/markdown_links.test.js
Normal file
@@ -0,0 +1,68 @@
const test = require("node:test");
const assert = require("node:assert/strict");
const { Readable } = require("node:stream");
const {
  collectMarkdownLinksFromStream,
  extractLinksFromText,
  sanitizeUrlCandidate,
} = require("../lib/markdown_links");

test("extractLinksFromText returns sanitized external URLs only once", () => {
  const input =
    "See [example](https://example.com) and <https://foo.com>. " +
    "Autolink https://bar.com/path).\nDuplicate https://example.com!";
  const urls = extractLinksFromText(input);
  assert.deepStrictEqual(urls, ["https://example.com", "https://foo.com", "https://bar.com/path"]);
});

test("collectMarkdownLinksFromStream preserves line numbers", async () => {
  const content = [
    "Intro line with no link",
    "Markdown [link](https://docs.example.org/page).",
    "Plain link https://news.example.net/article.",
    "Trailing <https://portal.example.com/path> punctuation.",
    "Markdown [link](https://docs.example.org/page(with more valid content)).",
    "Le **[baume du Canada](https://fr.wikipedia.org/wiki/Baume_du_Canada)**",
    "(_Theropoda [incertae sedis](https://fr.wikipedia.org/wiki/Incertae_sedis)_)",
    "[CDN](https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu)[^2]."
  ].join("\n");
  const stream = Readable.from([content]);
  const links = await collectMarkdownLinksFromStream(stream);
  assert.deepStrictEqual(links, [
    { url: "https://docs.example.org/page", line: 2 },
    { url: "https://news.example.net/article", line: 3 },
    { url: "https://portal.example.com/path", line: 4 },
    { url: "https://docs.example.org/page(with more valid content)", line: 5 },
    { url: "https://fr.wikipedia.org/wiki/Baume_du_Canada", line: 6 },
    { url: "https://fr.wikipedia.org/wiki/Incertae_sedis", line: 7 },
    { url: "https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu", line: 8 },
  ]);
});

test("collectMarkdownLinksFromStream ignores URLs in front matter comments", async () => {
  const content = [
    "---",
    "links:",
    " # url: https://ignored.example.com",
    " - url: https://included.example.com",
    "---",
    "Body with https://body.example.com link.",
  ].join("\n");
  const stream = Readable.from([content]);
  const links = await collectMarkdownLinksFromStream(stream);
  assert.deepStrictEqual(links, [
    { url: "https://included.example.com", line: 4 },
    { url: "https://body.example.com", line: 6 },
  ]);
});

test("sanitizeUrlCandidate removes spurious trailing punctuation", () => {
  const cases = [
    ["https://example.com).", "https://example.com"],
    ["https://example.com!\"", "https://example.com"],
    ["<https://example.com>", "https://example.com"],
  ];
  for (const [input, expected] of cases) {
    assert.equal(sanitizeUrlCandidate(input), expected);
  }
});
@@ -1,4 +1,4 @@
const { scrapePage } = require("./lib/puppeteer");
const { scrapePage } = require("../lib/puppeteer");
const path = require("path");

(async () => {