
Standardize link checking

2025-11-28 23:43:07 +01:00
parent 5e846aa4b4
commit 0260c1ab4e
7 changed files with 2145 additions and 1157 deletions

View File

@@ -15,29 +15,14 @@ I do it for the pleasure of a job well done and out of respect for my visitors,
## Methodology
I created a script using [cURL](https://curl.se/docs/) with the following parameters:
The script now uses a home-grown HTTP fetch (based on [`undici`](https://github.com/nodejs/undici)) that:
```javascript
const args = [
"--silent",
"--location",
"--fail",
"--max-time",
`${REQUEST_TIMEOUT_SECONDS}`,
"--output",
"/dev/null",
"--write-out",
"%{http_code}",
"--user-agent",
DEFAULT_USER_AGENT,
"--request",
method,
url,
];
```
`DEFAULT_USER_AGENT` is a valid and regularly updated UA.
I first send a request with the [`HEAD`](https://developer.mozilla.org/fr/docs/Web/HTTP/Reference/Methods/HEAD) method, and if that request fails, I send another one with the `GET` method after a 5 s delay.
- generates a realistic _user-agent_ on every run (`user-agents` library)
- first sends a [`HEAD`](https://developer.mozilla.org/fr/docs/Web/HTTP/Reference/Methods/HEAD) request, then, on failure or a status code ≥ 400, a `GET` request after a 5 s delay (see the sketch below)
- follows redirects manually (up to 5 hops) and gives up beyond that
- applies a 5 s _timeout_ per request
- sends typical browser headers (`Accept`, `Accept-Language`)
- does not store cookies
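As a reading aid, here is a condensed sketch of that HEAD-then-GET logic. The full implementation (manual redirect handling capped at 5 hops, realistic user-agent, default headers) is the `tools/lib/http.js` file added by this commit; the helper names `probe` and `checkLink` below are illustrative only.
```javascript
// Condensed sketch only; the real helpers live in tools/lib/http.js (probeUrl/checkUrl).
const { fetch } = require("undici");

async function probe(url, method, timeoutMs = 5000) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, { method, signal: controller.signal });
    // Discard the body; only the status code matters for the check.
    if (response.body) await response.body.cancel().catch(() => {});
    return { status: response.status, errorType: null };
  } catch (error) {
    return { status: null, errorType: error.name === "AbortError" ? "timeout" : "network" };
  } finally {
    clearTimeout(timer);
  }
}

async function checkLink(url) {
  let result = await probe(url, "HEAD");
  if (result.errorType || result.status === null || result.status >= 400) {
    await new Promise((resolve) => setTimeout(resolve, 5000)); // 5 s pause before the GET retry
    result = await probe(url, "GET");
  }
  return result;
}
```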
Three scenarios can arise at this point.
@@ -91,9 +76,9 @@ Requests that ended in a _timeout_ are not retried for 1 week
### Other cases
Sometimes cURL returns an HTTP 0 error (which does not actually exist).
Examining the detailed logs of these requests shows that in general (though not always) the problem essentially comes down to the server's certificates (expired, mismatched domain name, and so on).
Sometimes the fetch returns a null/0 status (which does not actually exist).
In most cases the problem is related to the server's certificates (expired, mismatched domain name, and so on) or to a refused connection.
Requests that end with an HTTP 0 code are not retried for 1 week.
Requests that end with an HTTP 0 code or a network error are not retried for 1 week.
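For reference, a minimal sketch of how that retry window can be derived from a probe result. The timeout and server-error day counts mirror the `cacheTtlTimeoutDays` and `cacheTtlServerErrorDays` defaults visible further down in this commit; the success TTL value and the `ttlForResult` helper name are assumptions for illustration, the actual script uses its own `getTtlMs(entry)`.
```javascript
// Illustrative only: picks a cache TTL (in ms) from a probe result.
// Timeout/server-error day counts follow the config defaults shown below; the success value is assumed.
const DAY_MS = 24 * 60 * 60 * 1000;

function ttlForResult(result, config = { cacheTtlTimeoutDays: 7, cacheTtlServerErrorDays: 1, cacheTtlSuccessDays: 7 }) {
  // Timeouts, HTTP 0 and plain network errors wait a full week before being re-checked.
  if (result.errorType || !result.status) {
    return config.cacheTtlTimeoutDays * DAY_MS;
  }
  // Server errors (5xx) are re-checked sooner.
  if (result.status >= 500) {
    return config.cacheTtlServerErrorDays * DAY_MS;
  }
  // Successful checks keep their result for the configured success TTL.
  return config.cacheTtlSuccessDays * DAY_MS;
}
```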
## Report

package-lock.json (generated, 2998 lines changed)

File diff suppressed because it is too large.

View File

@@ -11,7 +11,7 @@
"luxon": "^3.7.2",
"postcss-import": "^16.1.0",
"postcss-nested": "^7.0.2",
"puppeteer": "^24.4.0",
"puppeteer": "^23.11.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"sharp": "^0.33.5",
@@ -19,9 +19,13 @@
"user-agents": "^1.1.480"
},
"devDependencies": {
"@mermaid-js/mermaid-cli": "^10.9.1",
"@mermaid-js/mermaid-cli": "^11.12.0",
"autoprefixer": "^10.4.21",
"postcss": "^8.5.3",
"postcss-cli": "^11.0.1"
},
"overrides": {
"js-yaml": "^4.1.1",
"tar-fs": "^3.1.1"
}
}

View File

@@ -6,6 +6,7 @@ const crypto = require("crypto");
const path = require("path");
const os = require("os");
const YAML = require("yaml");
const { buildUserAgent, checkUrl } = require("./lib/http");
const { getArchiveUrl, saveToArchive } = require("./lib/archive");
const { scrapePage } = require("./lib/puppeteer");
@@ -59,6 +60,14 @@ if (duplicateBundlePath) {
// Check URL accessibility and Archive.org availability
(async () => {
const userAgent = buildUserAgent();
const initialCheck = await checkUrl(url, { userAgent, timeoutMs: 8000 });
if (initialCheck.errorType || (typeof initialCheck.status === "number" && initialCheck.status >= 400)) {
console.warn(`⚠ Vérification HTTP avant scraping: ${initialCheck.errorType || initialCheck.status || "indéterminé"}`);
} else {
console.log(`🌐 Vérification HTTP avant scraping: ${initialCheck.status ?? "inconnue"}`);
}
let archiveUrl = await getArchiveUrl(url);
// If the URL is not archived, attempt to save it
@@ -106,13 +115,16 @@ if (duplicateBundlePath) {
// Scrape the page and capture a screenshot
console.log(`🔍 Scraping page and capturing screenshot...`);
const metadata = await scrapePage(url, tempScreenshotPath);
const metadata = await scrapePage(url, tempScreenshotPath, { userAgent });
// If Puppeteer failed, do not proceed
if (!metadata || !fs.existsSync(tempScreenshotPath)) {
console.error(`❌ Scraping failed. No bundle will be created.`);
process.exit(1);
}
if (!metadata.httpStatus && typeof initialCheck.status === "number") {
metadata.httpStatus = initialCheck.status;
}
// Create Hugo bundle only if scraping succeeded
console.log(`📦 Creating Hugo bundle for ${url}...`);

View File

@@ -2,17 +2,13 @@
const fs = require("fs");
const path = require("path");
const util = require("util");
const yaml = require("js-yaml");
const UserAgent = require("user-agents");
const { execFile } = require("child_process");
const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http");
const {
collectMarkdownLinksFromFile,
extractLinksFromText,
} = require("./lib/markdown_links");
const execFileAsync = util.promisify(execFile);
const SITE_ROOT = path.resolve(__dirname, "..");
const CONTENT_DIR = path.join(SITE_ROOT, "content");
const CONFIG_PATH = path.join(__dirname, "config", "config.json");
@@ -29,9 +25,8 @@ const DEFAULT_CONFIG = {
cacheTtlServerErrorDays: 1,
cacheTtlTimeoutDays: 7,
maxConcurrentHosts: 4,
maxRedirects: 5,
userAgent: null,
enableCookies: true,
cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
};
function loadConfig() {
@@ -60,26 +55,24 @@ const CACHE_DIR = path.isAbsolute(settings.cacheDir)
const REPORT_PATH = path.isAbsolute(settings.cacheFile)
? settings.cacheFile
: path.join(CACHE_DIR, settings.cacheFile);
const COOKIE_JAR = settings.cookieJar
? path.isAbsolute(settings.cookieJar)
? settings.cookieJar
: path.resolve(SITE_ROOT, settings.cookieJar)
: path.join(CACHE_DIR, "curl_cookies.txt");
const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000;
const MAX_CONCURRENT_HOSTS = Math.max(
1,
Number.isFinite(Number(settings.maxConcurrentHosts))
? Number(settings.maxConcurrentHosts)
: DEFAULT_CONFIG.maxConcurrentHosts
);
const DEFAULT_USER_AGENT =
typeof settings.userAgent === "string" && settings.userAgent.trim()
? settings.userAgent.trim()
: new UserAgent().toString();
const ENABLE_COOKIES = settings.enableCookies !== false;
const MAX_REDIRECTS = Math.max(
0,
Number.isFinite(Number(settings.maxRedirects))
? Number(settings.maxRedirects)
: DEFAULT_CONFIG.maxRedirects
);
const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);
const CACHE_TTL_SUCCESS_MS = daysToMs(
pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
@@ -95,12 +88,12 @@ const CACHE_TTL_TIMEOUT_MS = daysToMs(
);
fs.mkdirSync(CACHE_DIR, { recursive: true });
if (ENABLE_COOKIES) {
fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
if (!fs.existsSync(COOKIE_JAR)) {
fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
}
}
const BASE_HTTP_OPTIONS = {
userAgent: DEFAULT_USER_AGENT,
timeoutMs: REQUEST_TIMEOUT_MS,
maxRedirects: MAX_REDIRECTS,
};
function pickNumber(value, fallback) {
const parsed = Number(value);
@@ -536,59 +529,6 @@ function extractHost(url) {
}
}
async function curlRequest(url, method, hostHeader) {
const args = [
"--silent",
"--location",
"--fail",
"--max-time",
`${REQUEST_TIMEOUT_SECONDS}`,
"--output",
"/dev/null",
"--write-out",
"%{http_code}",
"--user-agent",
DEFAULT_USER_AGENT,
"--request",
method,
];
if (ENABLE_COOKIES) {
args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
}
if (hostHeader) {
args.push("-H", `Host: ${hostHeader}`);
}
args.push(url);
try {
const { stdout } = await execFileAsync("curl", args);
const status = parseInt(stdout.trim(), 10);
return {
status: Number.isNaN(status) ? null : status,
errorType: null,
method: method.toUpperCase(),
};
} catch (error) {
const rawStatus = error?.stdout?.toString().trim();
const status = rawStatus ? parseInt(rawStatus, 10) : null;
const errorCode = Number(error?.code);
const timeout = errorCode === 28 ? "timeout" : null;
return {
status: Number.isNaN(status) ? null : status,
errorType: timeout,
method: method.toUpperCase(),
};
}
}
function shouldRetryWithGet(result) {
if (!result) return true;
if (result.errorType) return true;
if (typeof result.status !== "number") return true;
return result.status >= 400;
}
function getTtlMs(entry) {
if (!entry) return 0;
if (entry.errorType === "timeout" || entry.status === 0 || entry.status === null) {
@@ -748,17 +688,16 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
if (host) {
await applyHostDelay(host);
}
const hostHeader = host || extractHost(entry.url);
let result = await curlRequest(entry.url, "HEAD", hostHeader);
let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
recordHostCheck(host);
if (shouldRetryWithGet(result)) {
if (shouldRetry(result)) {
if (RETRY_DELAY_MS > 0) {
await delay(RETRY_DELAY_MS);
}
if (host) {
await applyHostDelay(host);
}
result = await curlRequest(entry.url, "GET", hostHeader);
result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
recordHostCheck(host);
}
updateEntryWithResult(entries[entry.url], result);

tools/lib/http.js (new file, 133 lines)
View File

@@ -0,0 +1,133 @@
const { fetch } = require("undici");
const UserAgent = require("user-agents");
const DEFAULT_ACCEPT =
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
const DEFAULT_ACCEPT_LANGUAGE = "fr-FR,fr;q=0.9,en;q=0.7";
const DEFAULT_TIMEOUT_MS = 5000;
const DEFAULT_MAX_REDIRECTS = 5;
function buildUserAgent(preferred) {
if (typeof preferred === "string" && preferred.trim()) {
return preferred.trim();
}
const ua = new UserAgent();
return ua.toString();
}
async function fetchWithRedirects(targetUrl, options, maxRedirects) {
let currentUrl = targetUrl;
let response = null;
let redirects = 0;
while (redirects <= maxRedirects) {
response = await fetch(currentUrl, { ...options, redirect: "manual" });
const location = response.headers.get("location");
if (
response.status >= 300 &&
response.status < 400 &&
location &&
redirects < maxRedirects
) {
if (response.body && typeof response.body.cancel === "function") {
try {
await response.body.cancel();
} catch (_) {
// Ignore cancellation errors; we're moving to the next hop.
}
}
currentUrl = new URL(location, currentUrl).toString();
redirects += 1;
continue;
}
break;
}
return response;
}
async function probeUrl(url, options = {}) {
const method = typeof options.method === "string" ? options.method.toUpperCase() : "HEAD";
const timeoutMs = Number.isFinite(options.timeoutMs) ? options.timeoutMs : DEFAULT_TIMEOUT_MS;
const maxRedirects = Number.isFinite(options.maxRedirects)
? options.maxRedirects
: DEFAULT_MAX_REDIRECTS;
const userAgent = buildUserAgent(options.userAgent);
const headers = {
"user-agent": userAgent,
"accept-language": DEFAULT_ACCEPT_LANGUAGE,
"accept": DEFAULT_ACCEPT,
...(options.headers || {}),
};
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetchWithRedirects(
url,
{
method,
headers,
signal: controller.signal,
},
maxRedirects
);
const status = response ? response.status : null;
const finalUrl = response?.url || url;
if (response?.body && typeof response.body.cancel === "function") {
try {
await response.body.cancel();
} catch (_) {
// Ignore cancellation errors; the status is all we needed.
}
}
return {
status,
finalUrl,
method,
errorType: null,
};
} catch (error) {
if (error.name === "AbortError") {
return {
status: null,
finalUrl: url,
method,
errorType: "timeout",
};
}
return {
status: null,
finalUrl: url,
method,
errorType: "network",
message: error.message,
};
} finally {
clearTimeout(timer);
}
}
function shouldRetry(result) {
if (!result) return true;
if (result.errorType) return true;
if (typeof result.status !== "number") return true;
return result.status >= 400;
}
async function checkUrl(url, options = {}) {
const firstMethod = options.firstMethod || "HEAD";
let result = await probeUrl(url, { ...options, method: firstMethod });
if (options.retryWithGet !== false && shouldRetry(result)) {
result = await probeUrl(url, { ...options, method: "GET" });
}
return result;
}
module.exports = {
buildUserAgent,
checkUrl,
probeUrl,
shouldRetry,
};
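For context, here is a usage sketch of this new module as it could be called from another script under `tools/` (the sample URL is a placeholder; option names and the result shape match the exported API above):
```javascript
// Illustrative usage of tools/lib/http.js from a sibling script.
const { buildUserAgent, checkUrl } = require("./lib/http");

(async () => {
  const userAgent = buildUserAgent(); // fresh realistic UA, or pass a preferred string to reuse one
  const result = await checkUrl("https://example.org/", { userAgent, timeoutMs: 8000 });
  // result has the shape { status, finalUrl, method, errorType } (errorType: "timeout", "network" or null).
  console.log(result.status ?? result.errorType ?? "unknown");
})();
```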

View File

@@ -1,6 +1,6 @@
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const UserAgent = require("user-agents");
const { buildUserAgent } = require("./http");
puppeteer.use(StealthPlugin());
@@ -8,9 +8,11 @@ puppeteer.use(StealthPlugin());
* Scrape a webpage to extract metadata and take a screenshot.
* @param {string} url - The URL of the page to scrape.
* @param {string} screenshotPath - Path where the screenshot should be saved.
* @param {object} options
* @param {string} [options.userAgent] - Optional user agent to use for the session.
* @returns {Promise<object>} - Metadata including title, description, keywords, language, and HTTP status.
*/
async function scrapePage(url, screenshotPath) {
async function scrapePage(url, screenshotPath, options = {}) {
console.log(`🔍 Scraping: ${url}`);
const browser = await puppeteer.launch({
@@ -35,9 +37,8 @@ async function scrapePage(url, screenshotPath) {
const page = await browser.newPage();
// Generate a fresh, realistic user-agent
const userAgent = new UserAgent();
await page.setUserAgent(userAgent.toString());
const userAgent = buildUserAgent(options.userAgent);
await page.setUserAgent(userAgent);
// Add headers to simulate a real browser
await page.setExtraHTTPHeaders({
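To tie the pieces together, a sketch of how the updated `scrapePage` signature is meant to be used, mirroring the call site shown earlier in this commit (the URL and file paths here are placeholders):
```javascript
// Sharing one user-agent between the HTTP pre-check and the Puppeteer session.
const { buildUserAgent, checkUrl } = require("./lib/http");
const { scrapePage } = require("./lib/puppeteer");

(async () => {
  const userAgent = buildUserAgent();
  const check = await checkUrl("https://example.org/", { userAgent, timeoutMs: 8000 });
  const metadata = await scrapePage("https://example.org/", "/tmp/screenshot.png", { userAgent });
  // Fall back to the pre-check status when Puppeteer did not capture one.
  if (metadata && !metadata.httpStatus && typeof check.status === "number") {
    metadata.httpStatus = check.status;
  }
})();
```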