From cb7cd917d7873b7e161f12457fd35a60ae453af4 Mon Sep 17 00:00:00 2001 From: Richard Dern Date: Mon, 8 Dec 2025 00:23:22 +0100 Subject: [PATCH] =?UTF-8?q?Am=C3=A9lioration=20de=20la=20d=C3=A9tection=20?= =?UTF-8?q?de=20liens=20morts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/check_external_links.js | 106 +++++++++++++++++++++++++----- tools/config/config.json | 13 +++- tools/lib/http.js | 120 ++++++++++++++++++++++++++++++---- 3 files changed, 209 insertions(+), 30 deletions(-) diff --git a/tools/check_external_links.js b/tools/check_external_links.js index 73fcdbe7..0ec88bf4 100644 --- a/tools/check_external_links.js +++ b/tools/check_external_links.js @@ -3,7 +3,7 @@ const fs = require("fs"); const path = require("path"); const yaml = require("js-yaml"); -const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http"); +const { buildUserAgent, checkUrl } = require("./lib/http"); const { collectMarkdownLinksFromFile, extractLinksFromText, @@ -18,7 +18,6 @@ const DEFAULT_CONFIG = { cacheDir: path.join(__dirname, "cache"), cacheFile: "external_links.yaml", hostDelayMs: 2000, - retryDelayMs: 5000, requestTimeoutSeconds: 5, cacheTtlSuccessDays: 30, cacheTtlClientErrorDays: 7, @@ -27,6 +26,7 @@ const DEFAULT_CONFIG = { maxConcurrentHosts: 4, maxRedirects: 5, userAgent: null, + ignoreHosts: [], }; function loadConfig() { @@ -57,7 +57,6 @@ const REPORT_PATH = path.isAbsolute(settings.cacheFile) : path.join(CACHE_DIR, settings.cacheFile); const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0); -const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0); const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5); const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000; const MAX_CONCURRENT_HOSTS = Math.max( @@ -73,6 +72,7 @@ const MAX_REDIRECTS = Math.max( : DEFAULT_CONFIG.maxRedirects ); const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent); 
// Normalizes a host name for comparison: trims it, lowercases it and unwraps
// the square brackets of an IPv6 literal. WHATWG `URL.hostname` reports IPv6
// hosts as "[::1]" while ignore-list entries are stored without brackets, so
// both sides must converge on the bare form for lookups to match.
// Returns null for non-string or empty input.
function normalizeHost(value) {
  if (typeof value !== "string") {
    return null;
  }
  const normalized = value.trim().toLowerCase();
  const bracketed = normalized.match(/^\[([^\]]+)\]$/);
  const host = bracketed ? bracketed[1] : normalized;
  return host || null;
}

// Removes an optional port from a host string and lowercases the result.
// - "[v6]" / "[v6]:port" -> the bare IPv6 address;
// - "host:port" (exactly one colon) -> "host";
// - anything else (no colon, or an unbracketed IPv6 literal) is returned as is.
// Returns null for non-string or blank input.
function stripPort(hostValue) {
  if (typeof hostValue !== "string") {
    return null;
  }
  const trimmed = hostValue.trim();
  if (!trimmed) {
    return null;
  }
  const bracketMatch = trimmed.match(/^\[([^\]]+)\](?::\d+)?$/);
  if (bracketMatch) {
    return bracketMatch[1].toLowerCase();
  }
  const colonIndex = trimmed.lastIndexOf(":");
  // Only treat the colon as a port separator when it is the sole colon;
  // several colons mean an unbracketed IPv6 literal.
  if (colonIndex > -1 && trimmed.indexOf(":") === colonIndex) {
    const hostPart = trimmed.slice(0, colonIndex).trim();
    if (hostPart) {
      return hostPart.toLowerCase();
    }
  }
  return trimmed.toLowerCase();
}

// Cleans one "host to ignore" configuration entry down to a bare host name:
// drops the scheme, a protocol-relative "//" prefix, the path, the query or
// fragment, any "user:password@" credentials and finally the port.
// Returns null when nothing usable is left.
function normalizeIgnoreHostEntry(value) {
  if (typeof value !== "string") {
    return null;
  }
  let candidate = value.trim().toLowerCase();
  if (!candidate) {
    return null;
  }
  candidate = candidate.replace(/^[a-z][a-z0-9+.-]*:\/\//, "");
  candidate = candidate.replace(/^\/\//, "");
  candidate = candidate.replace(/\/.*$/, "");
  candidate = candidate.replace(/[?#].*$/, "");
  // Credentials must go before port stripping, otherwise the colon of
  // "user:pw@host" is mistaken for the port separator.
  candidate = candidate.replace(/^[^@]*@/, "");
  return stripPort(candidate);
}
// Builds the set of hosts to ignore from the raw configuration value.
// Anything that is not an array yields an empty set; entries that do not
// normalize to a host are dropped.
function parseIgnoreHosts(raw) {
  const entries = Array.isArray(raw) ? raw : [];
  const hosts = entries
    .map((entry) => normalizeIgnoreHostEntry(entry))
    .filter(Boolean);
  return new Set(hosts);
}

// Returns the collected occurrences without the URLs whose host is ignored.
// When there is nothing to ignore the input map itself is returned.
function filterIgnoredHosts(occurrences, ignoreHosts) {
  const nothingToIgnore = !ignoreHosts || ignoreHosts.size === 0;
  if (nothingToIgnore) {
    return occurrences;
  }
  const kept = [...occurrences.entries()].filter(([url]) => {
    const host = extractHost(url);
    return !(host && ignoreHosts.has(host));
  });
  return new Map(kept);
}

// Extracts the normalized host of a URL, or null when it cannot be parsed.
function extractHost(url) {
  try {
    const { hostname } = new URL(url);
    return normalizeHost(hostname);
  } catch (_) {
    // Unparseable URLs simply have no host.
    return null;
  }
}
// Default request headers and limits used when probing a URL.
const DEFAULT_ACCEPT =
  "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
const DEFAULT_ACCEPT_LANGUAGE = "fr-FR,fr;q=0.9,en;q=0.7";
const DEFAULT_ACCEPT_ENCODING = "gzip, deflate, br";
const DEFAULT_CACHE_CONTROL = "no-cache";
const DEFAULT_PRAGMA = "no-cache";
const DEFAULT_TIMEOUT_MS = 5000;
const DEFAULT_MAX_REDIRECTS = 5;
const DEFAULT_USER_AGENTS = [
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
];

// Returns the trimmed preferred user agent when it is a non-blank string,
// otherwise one of DEFAULT_USER_AGENTS picked at random.
function buildUserAgent(preferred) {
  if (typeof preferred === "string" && preferred.trim()) {
    return preferred.trim();
  }
  const index = Math.floor(Math.random() * DEFAULT_USER_AGENTS.length);
  return DEFAULT_USER_AGENTS[index];
}

// Returns the major Chrome version found in a user agent ("122"), or null.
function extractChromeVersion(userAgent) {
  if (typeof userAgent !== "string") {
    return null;
  }
  const match = userAgent.match(/Chrome\/(\d+)/i);
  return match && match[1] ? match[1] : null;
}

// Tells whether the user agent carries a "Chrome/<version>" token.
function isChromeLike(userAgent) {
  if (typeof userAgent !== "string") {
    return false;
  }
  return /Chrome\/\d+/i.test(userAgent);
}

// Derives the platform name advertised in `sec-ch-ua-platform` from a user
// agent string, or null when no known platform token is present.
function derivePlatform(userAgent) {
  if (typeof userAgent !== "string") {
    return null;
  }
  if (/Windows NT/i.test(userAgent)) {
    return "Windows";
  }
  // iOS must be tested before macOS: iPhone/iPad user agents contain
  // "like Mac OS X" and would otherwise be reported as macOS.
  if (/iPhone|iPad|iPod/i.test(userAgent)) {
    return "iOS";
  }
  if (/Mac OS X/i.test(userAgent)) {
    return "macOS";
  }
  // Android must be tested before Linux: Android user agents contain "Linux".
  if (/Android/i.test(userAgent)) {
    return "Android";
  }
  if (/Linux/i.test(userAgent)) {
    return "Linux";
  }
  return null;
}

// Tells whether the user agent looks like a mobile device.
function isMobileUserAgent(userAgent) {
  if (typeof userAgent !== "string") {
    return false;
  }
  return /Mobile|Android|iPhone|iPad|iPod/i.test(userAgent);
}

// Builds the `sec-ch-ua` brand list for a Chrome-like user agent, or returns
// null for any other user agent. Falls back to version "122" when the Chrome
// version cannot be extracted.
function buildSecChUa(userAgent) {
  if (!isChromeLike(userAgent)) {
    return null;
  }
  const version = extractChromeVersion(userAgent) || "122";
  return `"Chromium";v="${version}", "Not A(Brand";v="24", "Google Chrome";v="${version}"`;
}

// Builds the header set of a top-level browser navigation for `userAgent`.
//
// - `url` is accepted for interface compatibility and is currently unused.
// - Client-hint headers (`sec-ch-ua*`) are only emitted for Chrome-like user
//   agents, so that a non-Chromium user agent is not paired with
//   Chromium-only headers.
// - `extraHeaders` are applied last and therefore override every default,
//   client hints included. Their names are lowercased so that "Accept" and
//   "accept" cannot both end up in the result.
//
// Returns a plain object keyed by lowercase header name.
function buildNavigationHeaders(url, userAgent, extraHeaders = {}) {
  const headers = {
    "user-agent": userAgent,
    accept: DEFAULT_ACCEPT,
    "accept-language": DEFAULT_ACCEPT_LANGUAGE,
    "accept-encoding": DEFAULT_ACCEPT_ENCODING,
    "cache-control": DEFAULT_CACHE_CONTROL,
    pragma: DEFAULT_PRAGMA,
    dnt: "1",
    connection: "keep-alive",
    "upgrade-insecure-requests": "1",
    "sec-fetch-site": "none",
    "sec-fetch-mode": "navigate",
    "sec-fetch-user": "?1",
    "sec-fetch-dest": "document",
  };

  const secChUa = buildSecChUa(userAgent);
  if (secChUa) {
    headers["sec-ch-ua"] = secChUa;
    headers["sec-ch-ua-mobile"] = isMobileUserAgent(userAgent) ? "?1" : "?0";
    const platform = derivePlatform(userAgent);
    if (platform) {
      headers["sec-ch-ua-platform"] = `"${platform}"`;
    }
  }

  for (const [name, value] of Object.entries(extraHeaders || {})) {
    headers[name.toLowerCase()] = value;
  }

  return headers;
}
options.retryWithGet + : firstMethod === "HEAD"; let result = await probeUrl(url, { ...options, method: firstMethod }); - if (options.retryWithGet !== false && shouldRetry(result)) { + if (retryWithGet && shouldRetry(result)) { result = await probeUrl(url, { ...options, method: "GET" }); } return result;