Unify link checking
@@ -6,6 +6,7 @@ const crypto = require("crypto");
 const path = require("path");
 const os = require("os");
 const YAML = require("yaml");
+const { buildUserAgent, checkUrl } = require("./lib/http");
 const { getArchiveUrl, saveToArchive } = require("./lib/archive");
 const { scrapePage } = require("./lib/puppeteer");
 
@@ -59,6 +60,14 @@ if (duplicateBundlePath) {
 
 // Check URL accessibility and Archive.org availability
 (async () => {
+  const userAgent = buildUserAgent();
+  const initialCheck = await checkUrl(url, { userAgent, timeoutMs: 8000 });
+  if (initialCheck.errorType || (typeof initialCheck.status === "number" && initialCheck.status >= 400)) {
+    console.warn(`⚠ Vérification HTTP avant scraping: ${initialCheck.errorType || initialCheck.status || "indéterminé"}`);
+  } else {
+    console.log(`🌐 Vérification HTTP avant scraping: ${initialCheck.status ?? "inconnue"}`);
+  }
+
   let archiveUrl = await getArchiveUrl(url);
 
   // If the URL is not archived, attempt to save it
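For reference, a minimal sketch of how this pre-flight check can be exercised on its own, based on the `checkUrl` contract defined in `tools/lib/http.js` below (the sample URL is illustrative):

```js
// Sketch: pre-flight probe before scraping. checkUrl tries HEAD first and
// falls back to GET when the probe errors, times out, or reports >= 400.
const { buildUserAgent, checkUrl } = require("./lib/http");

(async () => {
  const userAgent = buildUserAgent();
  const check = await checkUrl("https://example.com", { userAgent, timeoutMs: 8000 });
  // check: { status, finalUrl, method, errorType } with errorType one of
  // null, "timeout", or "network".
  console.log(check.status, check.errorType);
})();
```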
@@ -106,13 +115,16 @@ if (duplicateBundlePath) {
 
   // Scrape the page and capture a screenshot
   console.log(`🔍 Scraping page and capturing screenshot...`);
-  const metadata = await scrapePage(url, tempScreenshotPath);
+  const metadata = await scrapePage(url, tempScreenshotPath, { userAgent });
 
   // If Puppeteer failed, do not proceed
   if (!metadata || !fs.existsSync(tempScreenshotPath)) {
     console.error(`❌ Scraping failed. No bundle will be created.`);
     process.exit(1);
   }
+  if (!metadata.httpStatus && typeof initialCheck.status === "number") {
+    metadata.httpStatus = initialCheck.status;
+  }
 
   // Create Hugo bundle only if scraping succeeded
   console.log(`📦 Creating Hugo bundle for ${url}...`);
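Note on the added backfill: when Puppeteer returns metadata without an `httpStatus` (for example when the response object was not captured), the status observed by the pre-flight `checkUrl` probe is reused, so the bundle still records a numeric HTTP status.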
@@ -2,17 +2,13 @@
 
 const fs = require("fs");
 const path = require("path");
-const util = require("util");
 const yaml = require("js-yaml");
-const UserAgent = require("user-agents");
-const { execFile } = require("child_process");
+const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http");
 const {
   collectMarkdownLinksFromFile,
   extractLinksFromText,
 } = require("./lib/markdown_links");
 
-const execFileAsync = util.promisify(execFile);
-
 const SITE_ROOT = path.resolve(__dirname, "..");
 const CONTENT_DIR = path.join(SITE_ROOT, "content");
 const CONFIG_PATH = path.join(__dirname, "config", "config.json");
@@ -29,9 +25,8 @@ const DEFAULT_CONFIG = {
   cacheTtlServerErrorDays: 1,
   cacheTtlTimeoutDays: 7,
   maxConcurrentHosts: 4,
+  maxRedirects: 5,
   userAgent: null,
-  enableCookies: true,
-  cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
 };
 
 function loadConfig() {
@@ -60,26 +55,24 @@ const CACHE_DIR = path.isAbsolute(settings.cacheDir)
 const REPORT_PATH = path.isAbsolute(settings.cacheFile)
   ? settings.cacheFile
   : path.join(CACHE_DIR, settings.cacheFile);
-const COOKIE_JAR = settings.cookieJar
-  ? path.isAbsolute(settings.cookieJar)
-    ? settings.cookieJar
-    : path.resolve(SITE_ROOT, settings.cookieJar)
-  : path.join(CACHE_DIR, "curl_cookies.txt");
 
 const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
 const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
 const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
 const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000;
 const MAX_CONCURRENT_HOSTS = Math.max(
   1,
   Number.isFinite(Number(settings.maxConcurrentHosts))
     ? Number(settings.maxConcurrentHosts)
     : DEFAULT_CONFIG.maxConcurrentHosts
 );
-const DEFAULT_USER_AGENT =
-  typeof settings.userAgent === "string" && settings.userAgent.trim()
-    ? settings.userAgent.trim()
-    : new UserAgent().toString();
-const ENABLE_COOKIES = settings.enableCookies !== false;
+const MAX_REDIRECTS = Math.max(
+  0,
+  Number.isFinite(Number(settings.maxRedirects))
+    ? Number(settings.maxRedirects)
+    : DEFAULT_CONFIG.maxRedirects
+);
+const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);
 
 const CACHE_TTL_SUCCESS_MS = daysToMs(
   pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
@@ -95,12 +88,12 @@ const CACHE_TTL_TIMEOUT_MS = daysToMs(
 );
 
 fs.mkdirSync(CACHE_DIR, { recursive: true });
-if (ENABLE_COOKIES) {
-  fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
-  if (!fs.existsSync(COOKIE_JAR)) {
-    fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
-  }
-}
 
+const BASE_HTTP_OPTIONS = {
+  userAgent: DEFAULT_USER_AGENT,
+  timeoutMs: REQUEST_TIMEOUT_MS,
+  maxRedirects: MAX_REDIRECTS,
+};
+
 function pickNumber(value, fallback) {
   const parsed = Number(value);
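`BASE_HTTP_OPTIONS` is spread into every probe later in this file, so all requests share the same user agent, timeout, and redirect budget while only the HTTP method varies per call:

```js
// Call shape used below in checkEntries(): shared defaults, per-call method.
const result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
```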
@@ -536,59 +529,6 @@ function extractHost(url) {
   }
 }
 
-async function curlRequest(url, method, hostHeader) {
-  const args = [
-    "--silent",
-    "--location",
-    "--fail",
-    "--max-time",
-    `${REQUEST_TIMEOUT_SECONDS}`,
-    "--output",
-    "/dev/null",
-    "--write-out",
-    "%{http_code}",
-    "--user-agent",
-    DEFAULT_USER_AGENT,
-    "--request",
-    method,
-  ];
-
-  if (ENABLE_COOKIES) {
-    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
-  }
-  if (hostHeader) {
-    args.push("-H", `Host: ${hostHeader}`);
-  }
-  args.push(url);
-
-  try {
-    const { stdout } = await execFileAsync("curl", args);
-    const status = parseInt(stdout.trim(), 10);
-    return {
-      status: Number.isNaN(status) ? null : status,
-      errorType: null,
-      method: method.toUpperCase(),
-    };
-  } catch (error) {
-    const rawStatus = error?.stdout?.toString().trim();
-    const status = rawStatus ? parseInt(rawStatus, 10) : null;
-    const errorCode = Number(error?.code);
-    const timeout = errorCode === 28 ? "timeout" : null;
-    return {
-      status: Number.isNaN(status) ? null : status,
-      errorType: timeout,
-      method: method.toUpperCase(),
-    };
-  }
-}
-
-function shouldRetryWithGet(result) {
-  if (!result) return true;
-  if (result.errorType) return true;
-  if (typeof result.status !== "number") return true;
-  return result.status >= 400;
-}
-
 function getTtlMs(entry) {
   if (!entry) return 0;
   if (entry.errorType === "timeout" || entry.status === 0 || entry.status === null) {
@@ -748,17 +688,16 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
       if (host) {
         await applyHostDelay(host);
       }
-      const hostHeader = host || extractHost(entry.url);
-      let result = await curlRequest(entry.url, "HEAD", hostHeader);
+      let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
       recordHostCheck(host);
-      if (shouldRetryWithGet(result)) {
+      if (shouldRetry(result)) {
         if (RETRY_DELAY_MS > 0) {
           await delay(RETRY_DELAY_MS);
         }
         if (host) {
           await applyHostDelay(host);
         }
-        result = await curlRequest(entry.url, "GET", hostHeader);
+        result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
         recordHostCheck(host);
       }
       updateEntryWithResult(entries[entry.url], result);
 
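Condensed, the new flow is: probe with HEAD, then retry once with GET when the result is not trustworthy. `shouldRetry` (from `tools/lib/http.js`) treats any error type, a missing numeric status, or a status >= 400 as retryable:

```js
// HEAD first, then a single GET retry when the HEAD result looks unreliable.
let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
if (shouldRetry(result)) {
  // Pacing between attempts (retryDelayMs, host delay) is applied by the caller.
  result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
}
```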
tools/lib/http.js (new file, 133 lines)
@@ -0,0 +1,133 @@
+const { fetch } = require("undici");
+const UserAgent = require("user-agents");
+
+const DEFAULT_ACCEPT =
+  "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
+const DEFAULT_ACCEPT_LANGUAGE = "fr-FR,fr;q=0.9,en;q=0.7";
+const DEFAULT_TIMEOUT_MS = 5000;
+const DEFAULT_MAX_REDIRECTS = 5;
+
+function buildUserAgent(preferred) {
+  if (typeof preferred === "string" && preferred.trim()) {
+    return preferred.trim();
+  }
+  const ua = new UserAgent();
+  return ua.toString();
+}
+
+async function fetchWithRedirects(targetUrl, options, maxRedirects) {
+  let currentUrl = targetUrl;
+  let response = null;
+  let redirects = 0;
+
+  while (redirects <= maxRedirects) {
+    response = await fetch(currentUrl, { ...options, redirect: "manual" });
+    const location = response.headers.get("location");
+    if (
+      response.status >= 300 &&
+      response.status < 400 &&
+      location &&
+      redirects < maxRedirects
+    ) {
+      if (response.body && typeof response.body.cancel === "function") {
+        try {
+          await response.body.cancel();
+        } catch (_) {
+          // Ignore cancellation errors; we're moving to the next hop.
+        }
+      }
+      currentUrl = new URL(location, currentUrl).toString();
+      redirects += 1;
+      continue;
+    }
+    break;
+  }
+
+  return response;
+}
+
+async function probeUrl(url, options = {}) {
+  const method = typeof options.method === "string" ? options.method.toUpperCase() : "HEAD";
+  const timeoutMs = Number.isFinite(options.timeoutMs) ? options.timeoutMs : DEFAULT_TIMEOUT_MS;
+  const maxRedirects = Number.isFinite(options.maxRedirects)
+    ? options.maxRedirects
+    : DEFAULT_MAX_REDIRECTS;
+  const userAgent = buildUserAgent(options.userAgent);
+  const headers = {
+    "user-agent": userAgent,
+    "accept-language": DEFAULT_ACCEPT_LANGUAGE,
+    "accept": DEFAULT_ACCEPT,
+    ...(options.headers || {}),
+  };
+
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetchWithRedirects(
+      url,
+      {
+        method,
+        headers,
+        signal: controller.signal,
+      },
+      maxRedirects
+    );
+    const status = response ? response.status : null;
+    const finalUrl = response?.url || url;
+    if (response?.body && typeof response.body.cancel === "function") {
+      try {
+        await response.body.cancel();
+      } catch (_) {
+        // Ignore cancellation errors; the status is all we needed.
+      }
+    }
+    return {
+      status,
+      finalUrl,
+      method,
+      errorType: null,
+    };
+  } catch (error) {
+    if (error.name === "AbortError") {
+      return {
+        status: null,
+        finalUrl: url,
+        method,
+        errorType: "timeout",
+      };
+    }
+    return {
+      status: null,
+      finalUrl: url,
+      method,
+      errorType: "network",
+      message: error.message,
+    };
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+function shouldRetry(result) {
+  if (!result) return true;
+  if (result.errorType) return true;
+  if (typeof result.status !== "number") return true;
+  return result.status >= 400;
+}
+
+async function checkUrl(url, options = {}) {
+  const firstMethod = options.firstMethod || "HEAD";
+  let result = await probeUrl(url, { ...options, method: firstMethod });
+  if (options.retryWithGet !== false && shouldRetry(result)) {
+    result = await probeUrl(url, { ...options, method: "GET" });
+  }
+  return result;
+}
+
+module.exports = {
+  buildUserAgent,
+  checkUrl,
+  probeUrl,
+  shouldRetry,
+};
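The module is self-contained, so it can be exercised directly; a minimal usage sketch (the sample URL is illustrative):

```js
// Standalone usage of tools/lib/http.js (sample URL is illustrative).
const { probeUrl, checkUrl } = require("./tools/lib/http");

(async () => {
  // Single HEAD probe: manual redirect following, 5 s default timeout.
  const head = await probeUrl("https://example.com", { method: "HEAD" });

  // HEAD with automatic GET fallback; disable it with retryWithGet: false.
  const checked = await checkUrl("https://example.com", { timeoutMs: 8000 });

  console.log(head.status, checked.method, checked.errorType);
})();
```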
@@ -1,6 +1,6 @@
 const puppeteer = require("puppeteer-extra");
 const StealthPlugin = require("puppeteer-extra-plugin-stealth");
-const UserAgent = require("user-agents");
+const { buildUserAgent } = require("./http");
 
 puppeteer.use(StealthPlugin());
 
@@ -8,9 +8,11 @@ puppeteer.use(StealthPlugin());
  * Scrape a webpage to extract metadata and take a screenshot.
  * @param {string} url - The URL of the page to scrape.
  * @param {string} screenshotPath - Path where the screenshot should be saved.
+ * @param {object} options
+ * @param {string} [options.userAgent] - Optional user agent to use for the session.
  * @returns {Promise<object>} - Metadata including title, description, keywords, language, and HTTP status.
  */
-async function scrapePage(url, screenshotPath) {
+async function scrapePage(url, screenshotPath, options = {}) {
   console.log(`🔍 Scraping: ${url}`);
 
   const browser = await puppeteer.launch({
@@ -35,9 +37,8 @@ async function scrapePage(url, screenshotPath) {
 
   const page = await browser.newPage();
 
-  // Generate a fresh, realistic user-agent
-  const userAgent = new UserAgent();
-  await page.setUserAgent(userAgent.toString());
+  const userAgent = buildUserAgent(options.userAgent);
+  await page.setUserAgent(userAgent);
 
   // Add headers to simulate a real browser
   await page.setExtraHTTPHeaders({
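With this change the caller generates one user agent and reuses it across the HTTP pre-check and the Puppeteer session; a minimal sketch (URL and screenshot path are illustrative):

```js
// Sketch: share one user agent across the HTTP probe and the browser session.
const { buildUserAgent, checkUrl } = require("./lib/http");
const { scrapePage } = require("./lib/puppeteer");

(async () => {
  const userAgent = buildUserAgent(); // random realistic UA unless one is passed in
  await checkUrl("https://example.com", { userAgent });
  const metadata = await scrapePage("https://example.com", "/tmp/shot.png", { userAgent });
  console.log(metadata?.title);
})();
```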