1

Uniformisation de la vérification des liens

This commit is contained in:
2025-11-28 23:43:07 +01:00
parent 5e846aa4b4
commit 0260c1ab4e
7 changed files with 2145 additions and 1157 deletions

View File

@@ -2,17 +2,13 @@
const fs = require("fs");
const path = require("path");
const util = require("util");
const yaml = require("js-yaml");
const UserAgent = require("user-agents");
const { execFile } = require("child_process");
const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http");
const {
collectMarkdownLinksFromFile,
extractLinksFromText,
} = require("./lib/markdown_links");
const execFileAsync = util.promisify(execFile);
// Site root: the parent of the directory that contains this script.
const SITE_ROOT = path.resolve(__dirname, "..");
// The "content" directory directly under the site root.
const CONTENT_DIR = path.join(SITE_ROOT, "content");
// JSON configuration file, stored in the "config" directory next to this script.
const CONFIG_PATH = path.join(__dirname, "config", "config.json");
@@ -29,9 +25,8 @@ const DEFAULT_CONFIG = {
cacheTtlServerErrorDays: 1,
cacheTtlTimeoutDays: 7,
maxConcurrentHosts: 4,
maxRedirects: 5,
userAgent: null,
enableCookies: true,
cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
};
function loadConfig() {
@@ -60,26 +55,24 @@ const CACHE_DIR = path.isAbsolute(settings.cacheDir)
// Report/cache file location: an absolute `cacheFile` setting is used as-is,
// a relative one is resolved inside CACHE_DIR.
const REPORT_PATH = path.isAbsolute(settings.cacheFile)
? settings.cacheFile
: path.join(CACHE_DIR, settings.cacheFile);
// Cookie jar location: an absolute `cookieJar` setting is used as-is, a relative
// one is resolved against SITE_ROOT, and a falsy setting falls back to
// "curl_cookies.txt" inside CACHE_DIR.
const COOKIE_JAR = settings.cookieJar
? path.isAbsolute(settings.cookieJar)
? settings.cookieJar
: path.resolve(SITE_ROOT, settings.cookieJar)
: path.join(CACHE_DIR, "curl_cookies.txt");
// Delays in milliseconds; non-numeric or falsy settings become 0 and negative
// values are clamped to 0.
const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
// Per-request timeout: defaults to 5 when the setting is missing, non-numeric
// or 0, and is never allowed below 1 second.
const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
// Same timeout expressed in milliseconds.
const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000;
// Concurrency limit: at least 1; a setting that does not convert to a finite
// number falls back to DEFAULT_CONFIG.maxConcurrentHosts.
const MAX_CONCURRENT_HOSTS = Math.max(
1,
Number.isFinite(Number(settings.maxConcurrentHosts))
? Number(settings.maxConcurrentHosts)
: DEFAULT_CONFIG.maxConcurrentHosts
);
// User agent: the trimmed `userAgent` setting when it is a non-blank string,
// otherwise a string generated by the `user-agents` package.
// NOTE(review): DEFAULT_USER_AGENT is declared a second time further down; this
// looks like the pre- and post-change lines of a diff shown together — confirm
// which declaration is live before editing either.
const DEFAULT_USER_AGENT =
typeof settings.userAgent === "string" && settings.userAgent.trim()
? settings.userAgent.trim()
: new UserAgent().toString();
// Cookies stay enabled unless the setting is exactly `false`.
const ENABLE_COOKIES = settings.enableCookies !== false;
// Redirect limit: at least 0; a setting that does not convert to a finite
// number falls back to DEFAULT_CONFIG.maxRedirects.
const MAX_REDIRECTS = Math.max(
0,
Number.isFinite(Number(settings.maxRedirects))
? Number(settings.maxRedirects)
: DEFAULT_CONFIG.maxRedirects
);
// User agent obtained from the shared helper in ./lib/http.
// NOTE(review): buildUserAgent's fallback behaviour is not visible here — confirm.
const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);
const CACHE_TTL_SUCCESS_MS = daysToMs(
pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
@@ -95,12 +88,12 @@ const CACHE_TTL_TIMEOUT_MS = daysToMs(
);
// Make sure the cache directory exists (parents are created as needed).
fs.mkdirSync(CACHE_DIR, { recursive: true });
// When cookies are enabled, make sure the cookie jar's directory and the jar
// file itself exist.
if (ENABLE_COOKIES) {
fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
if (!fs.existsSync(COOKIE_JAR)) {
// Opening in append mode creates the file if missing without truncating it;
// the descriptor is closed immediately.
fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
}
}
// Options common to every HTTP probe: user agent, timeout in milliseconds and
// redirect limit. The probeUrl call sites in this file spread this object and
// add a per-call `method`.
const BASE_HTTP_OPTIONS = {
userAgent: DEFAULT_USER_AGENT,
timeoutMs: REQUEST_TIMEOUT_MS,
maxRedirects: MAX_REDIRECTS,
};
function pickNumber(value, fallback) {
const parsed = Number(value);
@@ -536,59 +529,6 @@ function extractHost(url) {
}
}
/**
 * Probe a URL by spawning `curl` and report the HTTP status code it printed.
 *
 * curl is run silently, follows redirects, discards the response body and
 * prints only `%{http_code}` on stdout. `--fail` makes curl exit non-zero on
 * HTTP errors; the status is then recovered from the error's captured stdout.
 *
 * @param {string} url - URL to probe.
 * @param {string} method - HTTP method, e.g. "HEAD" or "GET".
 * @param {string} [hostHeader] - Optional value sent as an explicit `Host:` header.
 * @returns {Promise<{status: (number|null), errorType: (string|null), method: string}>}
 *   `status` is the parsed HTTP code or `null` when none could be read;
 *   `errorType` is "timeout" when curl exited with code 28, otherwise `null`;
 *   `method` is the upper-cased request method. curl failures are reported
 *   through this object rather than thrown.
 */
async function curlRequest(url, method, hostHeader) {
  const verb = method.toUpperCase();

  // Parse curl's `%{http_code}` output; anything empty or non-numeric becomes null.
  const parseStatus = (raw) => {
    const text = raw == null ? "" : raw.toString().trim();
    if (!text) return null;
    const code = Number.parseInt(text, 10);
    return Number.isNaN(code) ? null : code;
  };

  const args = [
    "--silent",
    "--location",
    "--fail",
    "--max-time",
    `${REQUEST_TIMEOUT_SECONDS}`,
    "--output",
    "/dev/null",
    "--write-out",
    "%{http_code}",
    "--user-agent",
    DEFAULT_USER_AGENT,
  ];

  if (verb === "HEAD") {
    // `--request HEAD` only changes the request word: curl still waits for a
    // response body that never arrives and can stall until --max-time.
    // `--head` performs a real HEAD request.
    args.push("--head");
  } else {
    args.push("--request", method);
  }

  if (ENABLE_COOKIES) {
    // Read from and write back to the same cookie jar file.
    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
  }
  if (hostHeader) {
    args.push("-H", `Host: ${hostHeader}`);
  }
  args.push(url);

  try {
    const { stdout } = await execFileAsync("curl", args);
    return {
      status: parseStatus(stdout),
      errorType: null,
      method: verb,
    };
  } catch (error) {
    // A non-zero curl exit still carries whatever --write-out printed.
    // Exit code 28 is curl's "operation timed out".
    const timedOut = Number(error?.code) === 28;
    return {
      status: parseStatus(error?.stdout),
      errorType: timedOut ? "timeout" : null,
      method: verb,
    };
  }
}
/**
 * Decide whether a probe result calls for a second attempt with GET.
 *
 * A retry is requested when there is no result, when the result carries an
 * error type, when its status is not a number, or when the status is 400 or
 * above.
 *
 * @param {{status?: *, errorType?: *}|null|undefined} result - Outcome of the first probe.
 * @returns {boolean} `true` when the request should be retried.
 */
function shouldRetryWithGet(result) {
  const isInconclusive =
    !result || Boolean(result.errorType) || typeof result.status !== "number";
  return isInconclusive || result.status >= 400;
}
function getTtlMs(entry) {
if (!entry) return 0;
if (entry.errorType === "timeout" || entry.status === 0 || entry.status === null) {
@@ -748,17 +688,16 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
if (host) {
await applyHostDelay(host);
}
const hostHeader = host || extractHost(entry.url);
let result = await curlRequest(entry.url, "HEAD", hostHeader);
let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
recordHostCheck(host);
if (shouldRetryWithGet(result)) {
if (shouldRetry(result)) {
if (RETRY_DELAY_MS > 0) {
await delay(RETRY_DELAY_MS);
}
if (host) {
await applyHostDelay(host);
}
result = await curlRequest(entry.url, "GET", hostHeader);
result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
recordHostCheck(host);
}
updateEntryWithResult(entries[entry.url], result);