1

Amélioration de la détection de liens morts

This commit is contained in:
2025-12-08 00:23:22 +01:00
parent 9f1a81f8f5
commit cb7cd917d7
3 changed files with 209 additions and 30 deletions

View File

@@ -3,7 +3,7 @@
const fs = require("fs"); const fs = require("fs");
const path = require("path"); const path = require("path");
const yaml = require("js-yaml"); const yaml = require("js-yaml");
const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http"); const { buildUserAgent, checkUrl } = require("./lib/http");
const { const {
collectMarkdownLinksFromFile, collectMarkdownLinksFromFile,
extractLinksFromText, extractLinksFromText,
@@ -18,7 +18,6 @@ const DEFAULT_CONFIG = {
cacheDir: path.join(__dirname, "cache"), cacheDir: path.join(__dirname, "cache"),
cacheFile: "external_links.yaml", cacheFile: "external_links.yaml",
hostDelayMs: 2000, hostDelayMs: 2000,
retryDelayMs: 5000,
requestTimeoutSeconds: 5, requestTimeoutSeconds: 5,
cacheTtlSuccessDays: 30, cacheTtlSuccessDays: 30,
cacheTtlClientErrorDays: 7, cacheTtlClientErrorDays: 7,
@@ -27,6 +26,7 @@ const DEFAULT_CONFIG = {
maxConcurrentHosts: 4, maxConcurrentHosts: 4,
maxRedirects: 5, maxRedirects: 5,
userAgent: null, userAgent: null,
ignoreHosts: [],
}; };
function loadConfig() { function loadConfig() {
@@ -57,7 +57,6 @@ const REPORT_PATH = path.isAbsolute(settings.cacheFile)
: path.join(CACHE_DIR, settings.cacheFile); : path.join(CACHE_DIR, settings.cacheFile);
const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0); const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5); const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000; const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000;
const MAX_CONCURRENT_HOSTS = Math.max( const MAX_CONCURRENT_HOSTS = Math.max(
@@ -73,6 +72,7 @@ const MAX_REDIRECTS = Math.max(
: DEFAULT_CONFIG.maxRedirects : DEFAULT_CONFIG.maxRedirects
); );
const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent); const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);
const IGNORE_HOSTS = parseIgnoreHosts(settings.ignoreHosts);
const CACHE_TTL_SUCCESS_MS = daysToMs( const CACHE_TTL_SUCCESS_MS = daysToMs(
pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays) pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
@@ -103,6 +103,68 @@ function pickNumber(value, fallback) {
return fallback; return fallback;
} }
// Normalizes a host for comparison: surrounding whitespace is removed and the
// text is lower-cased. Returns null for non-string or blank input.
function normalizeHost(value) {
  if (typeof value === "string") {
    const cleaned = value.trim().toLowerCase();
    if (cleaned !== "") {
      return cleaned;
    }
  }
  return null;
}
// Removes an optional port from a host description and lower-cases the result.
// - "[addr]" or "[addr]:port" yields the text between the brackets.
// - A value with exactly one colon yields the part before it (when non-blank).
// - Anything else (no colon, several colons, blank left side) is returned whole.
// Returns null for non-string or blank input.
function stripPort(hostValue) {
  const text = typeof hostValue === "string" ? hostValue.trim() : "";
  if (text === "") {
    return null;
  }

  const bracketed = /^\[([^\]]+)\](?::\d+)?$/.exec(text);
  if (bracketed !== null) {
    return bracketed[1].toLowerCase();
  }

  // Exactly one colon means "name:port"; more than one is left untouched
  // because it cannot be told apart from an unbracketed IPv6 literal.
  const pieces = text.split(":");
  if (pieces.length === 2) {
    const name = pieces[0].trim();
    if (name !== "") {
      return name.toLowerCase();
    }
  }

  return text.toLowerCase();
}
// Cleans one ignore-host configuration entry down to a bare host: the value is
// trimmed and lower-cased, then a leading "scheme://" or "//", any path, and
// any query/fragment are removed before the port is stripped via stripPort().
// Returns null for non-string or blank input.
function normalizeIgnoreHostEntry(value) {
  if (typeof value !== "string") {
    return null;
  }

  const lowered = value.trim().toLowerCase();
  if (lowered === "") {
    return null;
  }

  const hostOnly = lowered
    .replace(/^[a-z][a-z0-9+.-]*:\/\//, "")
    .replace(/^\/\//, "")
    .replace(/\/.*$/, "")
    .replace(/[?#].*$/, "");

  return stripPort(hostOnly);
}
// Builds the set of hosts to ignore from the configuration value. Anything
// that is not an array yields an empty set; entries that do not normalize to
// a host (normalizeIgnoreHostEntry() returns a falsy value) are dropped.
function parseIgnoreHosts(raw) {
  const entries = Array.isArray(raw) ? raw : [];
  const hosts = entries
    .map((entry) => normalizeIgnoreHostEntry(entry))
    .filter(Boolean);
  return new Set(hosts);
}
function daysToMs(days) { function daysToMs(days) {
if (!Number.isFinite(days) || days <= 0) { if (!Number.isFinite(days) || days <= 0) {
return 0; return 0;
@@ -354,6 +416,22 @@ function filterReportLinks(links, activeUrls) {
return filtered; return filtered;
} }
// Removes from the collected occurrences every URL whose host is ignored.
//
// occurrences: Map of url -> occurrences for that url.
// ignoreHosts: Set of normalized host names; when missing or empty the input
//              map is returned as-is (same object, not a copy).
// Returns a new Map holding only the entries whose host is not ignored. URLs
// whose host cannot be extracted are kept.
function filterIgnoredHosts(occurrences, ignoreHosts) {
  if (!ignoreHosts || ignoreHosts.size === 0) {
    return occurrences;
  }
  const filtered = new Map();
  for (const [url, urlOccurrences] of occurrences.entries()) {
    const host = extractHost(url);
    if (host) {
      // URL#hostname keeps the square brackets around IPv6 literals
      // ("[::1]"), whereas ignore entries are stored without them ("::1").
      // Compare both spellings so IPv6 hosts can actually be ignored.
      const bareHost = host.replace(/^\[(.*)\]$/, "$1");
      if (ignoreHosts.has(host) || ignoreHosts.has(bareHost)) {
        continue;
      }
    }
    filtered.set(url, urlOccurrences);
  }
  return filtered;
}
function recordOccurrence(map, filePath, line, url) { function recordOccurrence(map, filePath, line, url) {
if (!map.has(url)) { if (!map.has(url)) {
map.set(url, []); map.set(url, []);
@@ -551,7 +629,8 @@ function recordHostCheck(host) {
function extractHost(url) { function extractHost(url) {
try { try {
return new URL(url).hostname; const hostname = new URL(url).hostname;
return normalizeHost(hostname);
} catch (_) { } catch (_) {
return null; return null;
} }
@@ -716,18 +795,12 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
if (host) { if (host) {
await applyHostDelay(host); await applyHostDelay(host);
} }
let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" }); const result = await checkUrl(entry.url, {
...BASE_HTTP_OPTIONS,
firstMethod: "GET",
retryWithGet: false,
});
recordHostCheck(host); recordHostCheck(host);
if (shouldRetry(result)) {
if (RETRY_DELAY_MS > 0) {
await delay(RETRY_DELAY_MS);
}
if (host) {
await applyHostDelay(host);
}
result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
recordHostCheck(host);
}
updateEntryWithResult(entries[entry.url], result); updateEntryWithResult(entries[entry.url], result);
persistEntriesSnapshot(entries, snapshotMeta); persistEntriesSnapshot(entries, snapshotMeta);
processed += 1; processed += 1;
@@ -740,7 +813,8 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
} }
async function main() { async function main() {
const occurrences = await collectOccurrences(); let occurrences = await collectOccurrences();
occurrences = filterIgnoredHosts(occurrences, IGNORE_HOSTS);
if (occurrences.size === 0) { if (occurrences.size === 0) {
const emptyState = { generatedAt: new Date().toISOString(), links: [], entries: {} }; const emptyState = { generatedAt: new Date().toISOString(), links: [], entries: {} };
saveState(emptyState); saveState(emptyState);

View File

@@ -14,7 +14,16 @@
"outputFile": "tools/cache/external_links_report.md", "outputFile": "tools/cache/external_links_report.md",
"userAgent": null, "userAgent": null,
"enableCookies": true, "enableCookies": true,
"cookieJar": "tools/cache/curl_cookies.txt" "cookieJar": "tools/cache/curl_cookies.txt",
"ignoreHosts": [
"10.0.2.1",
"web.archive.org",
"localhost",
"nas",
"selenium",
"ci.athaliasoft.com",
"rebrickable.com"
]
}, },
"weather": { "weather": {
"timezone": "Europe/Paris", "timezone": "Europe/Paris",

View File

@@ -1,18 +1,115 @@
const { fetch } = require("undici"); const { fetch } = require("undici");
const UserAgent = require("user-agents");
const DEFAULT_ACCEPT = const DEFAULT_ACCEPT =
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"; "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
const DEFAULT_ACCEPT_LANGUAGE = "fr-FR,fr;q=0.9,en;q=0.7"; const DEFAULT_ACCEPT_LANGUAGE = "fr-FR,fr;q=0.9,en;q=0.7";
const DEFAULT_ACCEPT_ENCODING = "gzip, deflate, br";
const DEFAULT_CACHE_CONTROL = "no-cache";
const DEFAULT_PRAGMA = "no-cache";
const DEFAULT_TIMEOUT_MS = 5000; const DEFAULT_TIMEOUT_MS = 5000;
const DEFAULT_MAX_REDIRECTS = 5; const DEFAULT_MAX_REDIRECTS = 5;
const DEFAULT_USER_AGENTS = [
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
];
function buildUserAgent(preferred) { function buildUserAgent(preferred) {
if (typeof preferred === "string" && preferred.trim()) { if (typeof preferred === "string" && preferred.trim()) {
return preferred.trim(); return preferred.trim();
} }
const ua = new UserAgent(); const index = Math.floor(Math.random() * DEFAULT_USER_AGENTS.length);
return ua.toString(); return DEFAULT_USER_AGENTS[index];
}
// Returns the digits that follow "Chrome/" in a user-agent string (matched
// case-insensitively), or null when the input is not a string or has no such
// token.
function extractChromeVersion(userAgent) {
  if (typeof userAgent !== "string") {
    return null;
  }
  const found = /Chrome\/(\d+)/i.exec(userAgent);
  return found ? found[1] : null;
}
// Tells whether a user-agent string contains a "Chrome/<digits>" token
// (case-insensitive). Non-string input is never Chrome-like.
function isChromeLike(userAgent) {
  return typeof userAgent === "string" && userAgent.search(/Chrome\/\d+/i) !== -1;
}
// Derives a platform label from a user-agent string.
//
// Returns one of "Windows", "iOS", "macOS", "Android", "Linux", or null when
// the input is not a string or no known marker is present.
//
// Order matters:
// - iOS is tested before macOS because iPhone/iPad/iPod user agents contain
//   "like Mac OS X" and would otherwise be reported as macOS.
// - Android is tested before Linux because Android user agents also contain
//   "Linux".
function derivePlatform(userAgent) {
  if (typeof userAgent !== "string") {
    return null;
  }
  if (/Windows NT/i.test(userAgent)) {
    return "Windows";
  }
  if (/iPhone|iPad|iPod/i.test(userAgent)) {
    return "iOS";
  }
  if (/Mac OS X/i.test(userAgent)) {
    return "macOS";
  }
  if (/Android/i.test(userAgent)) {
    return "Android";
  }
  if (/Linux/i.test(userAgent)) {
    return "Linux";
  }
  return null;
}
// Tells whether a user-agent string carries one of the mobile markers
// (Mobile, Android, iPhone, iPad, iPod; case-insensitive). Non-string input
// is treated as not mobile.
function isMobileUserAgent(userAgent) {
  const mobileMarkers = /Mobile|Android|iPhone|iPad|iPod/i;
  return typeof userAgent === "string" ? mobileMarkers.test(userAgent) : false;
}
// Builds the value of the "sec-ch-ua" header for a Chrome-like user agent.
// Returns null when isChromeLike() rejects the user agent. The Chrome major
// version comes from extractChromeVersion(), falling back to "122".
function buildSecChUa(userAgent) {
  if (isChromeLike(userAgent)) {
    const major = extractChromeVersion(userAgent) || "122";
    const brands = [
      `"Chromium";v="${major}"`,
      `"Not A(Brand";v="24"`,
      `"Google Chrome";v="${major}"`,
    ];
    return brands.join(", ");
  }
  return null;
}
// Builds the request headers of a browser-style top-level navigation.
//
// url:          accepted for interface compatibility; not read by this function.
// userAgent:    value sent as "user-agent"; also drives the client-hint headers.
// extraHeaders: caller-supplied headers, spread over the defaults. The
//               computed sec-ch-ua* values are assigned afterwards and so
//               take precedence over same-named entries.
// Returns a plain object mapping lower-case header names to values.
function buildNavigationHeaders(url, userAgent, extraHeaders = {}) {
  const headers = {
    "user-agent": userAgent,
    accept: DEFAULT_ACCEPT,
    "accept-language": DEFAULT_ACCEPT_LANGUAGE,
    "accept-encoding": DEFAULT_ACCEPT_ENCODING,
    "cache-control": DEFAULT_CACHE_CONTROL,
    pragma: DEFAULT_PRAGMA,
    dnt: "1",
    connection: "keep-alive",
    "upgrade-insecure-requests": "1",
    "sec-fetch-site": "none",
    "sec-fetch-mode": "navigate",
    "sec-fetch-user": "?1",
    "sec-fetch-dest": "document",
    ...extraHeaders,
  };

  // User-Agent Client Hints are only sent by Chromium-based browsers. Emit
  // the sec-ch-ua* headers together, and only when the user agent is
  // Chrome-like, so they never contradict a non-Chromium "user-agent".
  const secChUa = buildSecChUa(userAgent);
  if (secChUa) {
    headers["sec-ch-ua"] = secChUa;
    headers["sec-ch-ua-mobile"] = isMobileUserAgent(userAgent) ? "?1" : "?0";
    const platform = derivePlatform(userAgent);
    if (platform) {
      headers["sec-ch-ua-platform"] = `"${platform}"`;
    }
  }
  return headers;
}
async function fetchWithRedirects(targetUrl, options, maxRedirects) { async function fetchWithRedirects(targetUrl, options, maxRedirects) {
@@ -47,18 +144,13 @@ async function fetchWithRedirects(targetUrl, options, maxRedirects) {
} }
async function probeUrl(url, options = {}) { async function probeUrl(url, options = {}) {
const method = typeof options.method === "string" ? options.method.toUpperCase() : "HEAD"; const method = typeof options.method === "string" ? options.method.toUpperCase() : "GET";
const timeoutMs = Number.isFinite(options.timeoutMs) ? options.timeoutMs : DEFAULT_TIMEOUT_MS; const timeoutMs = Number.isFinite(options.timeoutMs) ? options.timeoutMs : DEFAULT_TIMEOUT_MS;
const maxRedirects = Number.isFinite(options.maxRedirects) const maxRedirects = Number.isFinite(options.maxRedirects)
? options.maxRedirects ? options.maxRedirects
: DEFAULT_MAX_REDIRECTS; : DEFAULT_MAX_REDIRECTS;
const userAgent = buildUserAgent(options.userAgent); const userAgent = buildUserAgent(options.userAgent);
const headers = { const headers = buildNavigationHeaders(url, userAgent, options.headers || {});
"user-agent": userAgent,
"accept-language": DEFAULT_ACCEPT_LANGUAGE,
"accept": DEFAULT_ACCEPT,
...(options.headers || {}),
};
const controller = new AbortController(); const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs); const timer = setTimeout(() => controller.abort(), timeoutMs);
@@ -117,9 +209,13 @@ function shouldRetry(result) {
} }
async function checkUrl(url, options = {}) { async function checkUrl(url, options = {}) {
const firstMethod = options.firstMethod || "HEAD"; const firstMethod = options.firstMethod || "GET";
const retryWithGet =
typeof options.retryWithGet === "boolean"
? options.retryWithGet
: firstMethod === "HEAD";
let result = await probeUrl(url, { ...options, method: firstMethod }); let result = await probeUrl(url, { ...options, method: firstMethod });
if (options.retryWithGet !== false && shouldRetry(result)) { if (retryWithGet && shouldRetry(result)) {
result = await probeUrl(url, { ...options, method: "GET" }); result = await probeUrl(url, { ...options, method: "GET" });
} }
return result; return result;