
Standardize link checking

2025-11-28 23:43:07 +01:00
parent 5e846aa4b4
commit 0260c1ab4e
7 changed files with 2145 additions and 1157 deletions


@@ -15,29 +15,14 @@ Je le fais par plaisir du travail bien fait et par respect pour mes visiteurs, m
## Methodology

-I created a script that drives [cURL](https://curl.se/docs/) with the following parameters:
-
-```javascript
-const args = [
-  "--silent",
-  "--location",
-  "--fail",
-  "--max-time",
-  `${REQUEST_TIMEOUT_SECONDS}`,
-  "--output",
-  "/dev/null",
-  "--write-out",
-  "%{http_code}",
-  "--user-agent",
-  DEFAULT_USER_AGENT,
-  "--request",
-  method,
-  url,
-];
-```
-
-`DEFAULT_USER_AGENT` is a valid, regularly updated UA.
-
-I first send a request with the [`HEAD`](https://developer.mozilla.org/fr/docs/Web/HTTP/Reference/Methods/HEAD) method and, if that request fails, another one with the `GET` method after a 5 s delay.
+The script now uses a home-grown HTTP fetch (based on [`undici`](https://github.com/nodejs/undici)) that:
+
+- generates a realistic _user-agent_ on every run (`user-agents` library)
+- first sends a [`HEAD`](https://developer.mozilla.org/fr/docs/Web/HTTP/Reference/Methods/HEAD) request, then, on failure or a status ≥ 400, a `GET` request after a 5 s delay
+- follows redirects manually (up to 5 hops) and gives up beyond that
+- applies a 5 s _timeout_ per request
+- sends the usual browser headers (`Accept`, `Accept-Language`)
+- does not store cookies

Three scenarios can arise at this point.
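To make the new behaviour concrete, here is a minimal, hypothetical usage sketch of the `checkUrl` helper this commit adds in `tools/lib/http.js` (the URL, the `require` path and the option values are placeholders):

```javascript
// Illustration only: HEAD first, then GET when the probe errors or returns >= 400.
const { buildUserAgent, checkUrl } = require("./tools/lib/http");

(async () => {
  const result = await checkUrl("https://example.com/", {
    userAgent: buildUserAgent(), // fresh, realistic user-agent on every run
    timeoutMs: 5000,             // per-request timeout
    maxRedirects: 5,             // redirects are followed manually, then abandoned
  });
  console.log(result.method, result.status, result.errorType);
})();
```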
@@ -91,9 +76,9 @@ Les requêtes ayant abouti à un _timeout_ ne sont pas renouvelées avant 1 sema
### Other cases

-Sometimes cURL returns an HTTP 0 error (which does not actually exist).
+Sometimes the fetch returns a null/0 status (which does not actually exist).

-Reviewing the detailed logs of those requests shows that the problem is usually (though not always) tied to the server's certificates (expired, mismatched domain name, etc.).
+In most cases the problem is tied to the server's certificates (expired, mismatched domain name, etc.) or to a refused connection.

-Requests that end with an HTTP 0 code are not retried for 1 week.
+Requests that end with an HTTP 0 code or a network error are not retried for 1 week.
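As a rough illustration of that caching rule (hypothetical helper name; the real check lives in `getTtlMs()` further down in this commit):

```javascript
// Illustration only: timeouts, network errors and 0/null statuses wait a week.
const ONE_WEEK_MS = 7 * 24 * 60 * 60 * 1000;

function recheckDelayFor(entry) {
  if (entry.errorType === "timeout" || entry.status === 0 || entry.status === null) {
    return ONE_WEEK_MS;
  }
  return 0; // other outcomes follow their own, shorter TTLs
}
```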
## Report

package-lock.json (generated; 2998 changed lines)

File diff suppressed because it is too large.


@@ -11,7 +11,7 @@
"luxon": "^3.7.2", "luxon": "^3.7.2",
"postcss-import": "^16.1.0", "postcss-import": "^16.1.0",
"postcss-nested": "^7.0.2", "postcss-nested": "^7.0.2",
"puppeteer": "^24.4.0", "puppeteer": "^23.11.1",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"sharp": "^0.33.5", "sharp": "^0.33.5",
@@ -19,9 +19,13 @@
"user-agents": "^1.1.480" "user-agents": "^1.1.480"
}, },
"devDependencies": { "devDependencies": {
"@mermaid-js/mermaid-cli": "^10.9.1", "@mermaid-js/mermaid-cli": "^11.12.0",
"autoprefixer": "^10.4.21", "autoprefixer": "^10.4.21",
"postcss": "^8.5.3", "postcss": "^8.5.3",
"postcss-cli": "^11.0.1" "postcss-cli": "^11.0.1"
},
"overrides": {
"js-yaml": "^4.1.1",
"tar-fs": "^3.1.1"
} }
} }


@@ -6,6 +6,7 @@ const crypto = require("crypto");
const path = require("path");
const os = require("os");
const YAML = require("yaml");
+const { buildUserAgent, checkUrl } = require("./lib/http");
const { getArchiveUrl, saveToArchive } = require("./lib/archive");
const { scrapePage } = require("./lib/puppeteer");
@@ -59,6 +60,14 @@ if (duplicateBundlePath) {
// Check URL accessibility and Archive.org availability
(async () => {
+  const userAgent = buildUserAgent();
+  const initialCheck = await checkUrl(url, { userAgent, timeoutMs: 8000 });
+  if (initialCheck.errorType || (typeof initialCheck.status === "number" && initialCheck.status >= 400)) {
+    console.warn(`⚠ Vérification HTTP avant scraping: ${initialCheck.errorType || initialCheck.status || "indéterminé"}`);
+  } else {
+    console.log(`🌐 Vérification HTTP avant scraping: ${initialCheck.status ?? "inconnue"}`);
+  }

  let archiveUrl = await getArchiveUrl(url);

  // If the URL is not archived, attempt to save it
@@ -106,13 +115,16 @@ if (duplicateBundlePath) {
  // Scrape the page and capture a screenshot
  console.log(`🔍 Scraping page and capturing screenshot...`);
-  const metadata = await scrapePage(url, tempScreenshotPath);
+  const metadata = await scrapePage(url, tempScreenshotPath, { userAgent });

  // If Puppeteer failed, do not proceed
  if (!metadata || !fs.existsSync(tempScreenshotPath)) {
    console.error(`❌ Scraping failed. No bundle will be created.`);
    process.exit(1);
  }
+  if (!metadata.httpStatus && typeof initialCheck.status === "number") {
+    metadata.httpStatus = initialCheck.status;
+  }

  // Create Hugo bundle only if scraping succeeded
  console.log(`📦 Creating Hugo bundle for ${url}...`);


@@ -2,17 +2,13 @@
const fs = require("fs");
const path = require("path");
-const util = require("util");
const yaml = require("js-yaml");
-const UserAgent = require("user-agents");
-const { execFile } = require("child_process");
+const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http");
const {
  collectMarkdownLinksFromFile,
  extractLinksFromText,
} = require("./lib/markdown_links");
-const execFileAsync = util.promisify(execFile);

const SITE_ROOT = path.resolve(__dirname, "..");
const CONTENT_DIR = path.join(SITE_ROOT, "content");
const CONFIG_PATH = path.join(__dirname, "config", "config.json");
@@ -29,9 +25,8 @@ const DEFAULT_CONFIG = {
  cacheTtlServerErrorDays: 1,
  cacheTtlTimeoutDays: 7,
  maxConcurrentHosts: 4,
+ maxRedirects: 5,
  userAgent: null,
- enableCookies: true,
- cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
};

function loadConfig() {
@@ -60,26 +55,24 @@ const CACHE_DIR = path.isAbsolute(settings.cacheDir)
const REPORT_PATH = path.isAbsolute(settings.cacheFile)
  ? settings.cacheFile
  : path.join(CACHE_DIR, settings.cacheFile);
-const COOKIE_JAR = settings.cookieJar
-  ? path.isAbsolute(settings.cookieJar)
-    ? settings.cookieJar
-    : path.resolve(SITE_ROOT, settings.cookieJar)
-  : path.join(CACHE_DIR, "curl_cookies.txt");
const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
+const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000;
const MAX_CONCURRENT_HOSTS = Math.max(
  1,
  Number.isFinite(Number(settings.maxConcurrentHosts))
    ? Number(settings.maxConcurrentHosts)
    : DEFAULT_CONFIG.maxConcurrentHosts
);
-const DEFAULT_USER_AGENT =
-  typeof settings.userAgent === "string" && settings.userAgent.trim()
-    ? settings.userAgent.trim()
-    : new UserAgent().toString();
-const ENABLE_COOKIES = settings.enableCookies !== false;
+const MAX_REDIRECTS = Math.max(
+  0,
+  Number.isFinite(Number(settings.maxRedirects))
+    ? Number(settings.maxRedirects)
+    : DEFAULT_CONFIG.maxRedirects
+);
+const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);

const CACHE_TTL_SUCCESS_MS = daysToMs(
  pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
@@ -95,12 +88,12 @@ const CACHE_TTL_TIMEOUT_MS = daysToMs(
);

fs.mkdirSync(CACHE_DIR, { recursive: true });
-if (ENABLE_COOKIES) {
-  fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
-  if (!fs.existsSync(COOKIE_JAR)) {
-    fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
-  }
-}
+const BASE_HTTP_OPTIONS = {
+  userAgent: DEFAULT_USER_AGENT,
+  timeoutMs: REQUEST_TIMEOUT_MS,
+  maxRedirects: MAX_REDIRECTS,
+};

function pickNumber(value, fallback) {
  const parsed = Number(value);
@@ -536,59 +529,6 @@ function extractHost(url) {
  }
}

-async function curlRequest(url, method, hostHeader) {
-  const args = [
-    "--silent",
-    "--location",
-    "--fail",
-    "--max-time",
-    `${REQUEST_TIMEOUT_SECONDS}`,
-    "--output",
-    "/dev/null",
-    "--write-out",
-    "%{http_code}",
-    "--user-agent",
-    DEFAULT_USER_AGENT,
-    "--request",
-    method,
-  ];
-  if (ENABLE_COOKIES) {
-    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
-  }
-  if (hostHeader) {
-    args.push("-H", `Host: ${hostHeader}`);
-  }
-  args.push(url);
-  try {
-    const { stdout } = await execFileAsync("curl", args);
-    const status = parseInt(stdout.trim(), 10);
-    return {
-      status: Number.isNaN(status) ? null : status,
-      errorType: null,
-      method: method.toUpperCase(),
-    };
-  } catch (error) {
-    const rawStatus = error?.stdout?.toString().trim();
-    const status = rawStatus ? parseInt(rawStatus, 10) : null;
-    const errorCode = Number(error?.code);
-    const timeout = errorCode === 28 ? "timeout" : null;
-    return {
-      status: Number.isNaN(status) ? null : status,
-      errorType: timeout,
-      method: method.toUpperCase(),
-    };
-  }
-}
-
-function shouldRetryWithGet(result) {
-  if (!result) return true;
-  if (result.errorType) return true;
-  if (typeof result.status !== "number") return true;
-  return result.status >= 400;
-}

function getTtlMs(entry) {
  if (!entry) return 0;
  if (entry.errorType === "timeout" || entry.status === 0 || entry.status === null) {
@@ -748,17 +688,16 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
    if (host) {
      await applyHostDelay(host);
    }
-    const hostHeader = host || extractHost(entry.url);
-    let result = await curlRequest(entry.url, "HEAD", hostHeader);
+    let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
    recordHostCheck(host);
-    if (shouldRetryWithGet(result)) {
+    if (shouldRetry(result)) {
      if (RETRY_DELAY_MS > 0) {
        await delay(RETRY_DELAY_MS);
      }
      if (host) {
        await applyHostDelay(host);
      }
-      result = await curlRequest(entry.url, "GET", hostHeader);
+      result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
      recordHostCheck(host);
    }
    updateEntryWithResult(entries[entry.url], result);
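Condensed, the HEAD→GET fallback above amounts to the following sketch (hypothetical helper name; the real script also applies per-host delays and reads `RETRY_DELAY_MS` from its configuration):

```javascript
// Illustration only: probe with HEAD, retry once with GET when shouldRetry() says so.
const { probeUrl, shouldRetry } = require("./lib/http");

async function probeWithFallback(url, baseOptions, retryDelayMs) {
  let result = await probeUrl(url, { ...baseOptions, method: "HEAD" });
  if (shouldRetry(result)) {
    // Error, missing status, or status >= 400: wait, then retry with a full GET.
    await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
    result = await probeUrl(url, { ...baseOptions, method: "GET" });
  }
  return result;
}
```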

tools/lib/http.js (new file; 133 lines)

@@ -0,0 +1,133 @@
const { fetch } = require("undici");
const UserAgent = require("user-agents");

const DEFAULT_ACCEPT =
  "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
const DEFAULT_ACCEPT_LANGUAGE = "fr-FR,fr;q=0.9,en;q=0.7";
const DEFAULT_TIMEOUT_MS = 5000;
const DEFAULT_MAX_REDIRECTS = 5;

function buildUserAgent(preferred) {
  if (typeof preferred === "string" && preferred.trim()) {
    return preferred.trim();
  }
  const ua = new UserAgent();
  return ua.toString();
}

async function fetchWithRedirects(targetUrl, options, maxRedirects) {
  let currentUrl = targetUrl;
  let response = null;
  let redirects = 0;
  while (redirects <= maxRedirects) {
    response = await fetch(currentUrl, { ...options, redirect: "manual" });
    const location = response.headers.get("location");
    if (
      response.status >= 300 &&
      response.status < 400 &&
      location &&
      redirects < maxRedirects
    ) {
      if (response.body && typeof response.body.cancel === "function") {
        try {
          await response.body.cancel();
        } catch (_) {
          // Ignore cancellation errors; we're moving to the next hop.
        }
      }
      currentUrl = new URL(location, currentUrl).toString();
      redirects += 1;
      continue;
    }
    break;
  }
  return response;
}

async function probeUrl(url, options = {}) {
  const method = typeof options.method === "string" ? options.method.toUpperCase() : "HEAD";
  const timeoutMs = Number.isFinite(options.timeoutMs) ? options.timeoutMs : DEFAULT_TIMEOUT_MS;
  const maxRedirects = Number.isFinite(options.maxRedirects)
    ? options.maxRedirects
    : DEFAULT_MAX_REDIRECTS;
  const userAgent = buildUserAgent(options.userAgent);
  const headers = {
    "user-agent": userAgent,
    "accept-language": DEFAULT_ACCEPT_LANGUAGE,
    "accept": DEFAULT_ACCEPT,
    ...(options.headers || {}),
  };
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetchWithRedirects(
      url,
      {
        method,
        headers,
        signal: controller.signal,
      },
      maxRedirects
    );
    const status = response ? response.status : null;
    const finalUrl = response?.url || url;
    if (response?.body && typeof response.body.cancel === "function") {
      try {
        await response.body.cancel();
      } catch (_) {
        // Ignore cancellation errors; the status is all we needed.
      }
    }
    return {
      status,
      finalUrl,
      method,
      errorType: null,
    };
  } catch (error) {
    if (error.name === "AbortError") {
      return {
        status: null,
        finalUrl: url,
        method,
        errorType: "timeout",
      };
    }
    return {
      status: null,
      finalUrl: url,
      method,
      errorType: "network",
      message: error.message,
    };
  } finally {
    clearTimeout(timer);
  }
}

function shouldRetry(result) {
  if (!result) return true;
  if (result.errorType) return true;
  if (typeof result.status !== "number") return true;
  return result.status >= 400;
}

async function checkUrl(url, options = {}) {
  const firstMethod = options.firstMethod || "HEAD";
  let result = await probeUrl(url, { ...options, method: firstMethod });
  if (options.retryWithGet !== false && shouldRetry(result)) {
    result = await probeUrl(url, { ...options, method: "GET" });
  }
  return result;
}

module.exports = {
  buildUserAgent,
  checkUrl,
  probeUrl,
  shouldRetry,
};
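For reference, a minimal, hypothetical call of the lower-level `probeUrl` exported above (the URL is a placeholder and the `require` path assumes the caller sits in `tools/`):

```javascript
// Illustration only: a single probe, reporting how the request ended.
const { probeUrl } = require("./lib/http");

(async () => {
  const probe = await probeUrl("https://example.com/", { method: "HEAD", timeoutMs: 5000 });
  if (probe.errorType === "timeout") {
    console.log("aborted after the timeout");
  } else if (probe.errorType === "network") {
    console.log("connection or TLS problem:", probe.message);
  } else {
    console.log(`HTTP ${probe.status} at ${probe.finalUrl}`);
  }
})();
```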


@@ -1,6 +1,6 @@
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
-const UserAgent = require("user-agents");
+const { buildUserAgent } = require("./http");

puppeteer.use(StealthPlugin());
@@ -8,9 +8,11 @@ puppeteer.use(StealthPlugin());
 * Scrape a webpage to extract metadata and take a screenshot.
 * @param {string} url - The URL of the page to scrape.
 * @param {string} screenshotPath - Path where the screenshot should be saved.
+ * @param {object} options
+ * @param {string} [options.userAgent] - Optional user agent to use for the session.
 * @returns {Promise<object>} - Metadata including title, description, keywords, language, and HTTP status.
 */
-async function scrapePage(url, screenshotPath) {
+async function scrapePage(url, screenshotPath, options = {}) {
  console.log(`🔍 Scraping: ${url}`);

  const browser = await puppeteer.launch({
@@ -35,9 +37,8 @@ async function scrapePage(url, screenshotPath) {
  const page = await browser.newPage();

-  // Generate a fresh, realistic user-agent
-  const userAgent = new UserAgent();
-  await page.setUserAgent(userAgent.toString());
+  const userAgent = buildUserAgent(options.userAgent);
+  await page.setUserAgent(userAgent);

  // Add headers to simulate a real browser
  await page.setExtraHTTPHeaders({
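A hypothetical call site for the updated signature (URL and paths are placeholders; `scrapePage` is required from `./lib/puppeteer` elsewhere in this commit):

```javascript
// Illustration only: reuse the same user-agent for the HTTP probe and the Puppeteer session.
const { buildUserAgent } = require("./lib/http");
const { scrapePage } = require("./lib/puppeteer");

(async () => {
  const userAgent = buildUserAgent();
  const metadata = await scrapePage("https://example.com/", "/tmp/example.png", { userAgent });
  console.log(metadata ? metadata.httpStatus : "scraping failed");
})();
```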