Utilisation de Playwright pour la vérification des liens externes
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const yaml = require("js-yaml");
|
||||
const { buildUserAgent, checkUrl } = require("./lib/http");
|
||||
const { buildUserAgent, checkUrl, checkWithPlaywright } = require("./lib/http");
|
||||
const {
|
||||
collectMarkdownLinksFromFile,
|
||||
extractLinksFromText,
|
||||
@@ -27,6 +27,9 @@ const DEFAULT_CONFIG = {
|
||||
maxRedirects: 5,
|
||||
userAgent: null,
|
||||
ignoreHosts: [],
|
||||
usePlaywright: false,
|
||||
playwrightTimeoutSeconds: 10,
|
||||
playwrightExecutablePath: null,
|
||||
};
|
||||
|
||||
function loadConfig() {
|
||||
@@ -73,6 +76,19 @@ const MAX_REDIRECTS = Math.max(
|
||||
);
|
||||
const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);
|
||||
const IGNORE_HOSTS = parseIgnoreHosts(settings.ignoreHosts);
|
||||
// Playwright re-checks are opt-in: only a literal `true` in the settings enables them.
const PLAYWRIGHT_ENABLED = settings.usePlaywright === true;

// Navigation timeout handed to Playwright, in milliseconds (never below 1000 ms).
// Only real numbers and non-blank numeric strings are accepted as a configured
// value: Number(null), Number("") and Number(false) all coerce to 0, which is
// finite and would silently turn an "unset" setting into the 1 s minimum instead
// of the documented default.
const PLAYWRIGHT_TIMEOUT_MS = (() => {
  const raw = settings.playwrightTimeoutSeconds;
  const isCandidate =
    typeof raw === "number" || (typeof raw === "string" && raw.trim() !== "");
  const seconds =
    isCandidate && Number.isFinite(Number(raw))
      ? Number(raw)
      : DEFAULT_CONFIG.playwrightTimeoutSeconds;
  return Math.max(1000, seconds * 1000);
})();

// Optional path to the browser binary; blank or non-string values mean "not set" (null).
const PLAYWRIGHT_EXECUTABLE =
  typeof settings.playwrightExecutablePath === "string" &&
  settings.playwrightExecutablePath.trim().length > 0
    ? settings.playwrightExecutablePath.trim()
    : null;

// HTTP statuses for which a plain HTTP result is re-verified with Playwright.
const PLAYWRIGHT_RECHECK_STATUSES = new Set([403, 426, 429, 502]);
|
||||
|
||||
const CACHE_TTL_SUCCESS_MS = daysToMs(
|
||||
pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
|
||||
@@ -432,6 +448,23 @@ function filterIgnoredHosts(occurrences, ignoreHosts) {
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/**
 * Tells whether the outcome of the plain HTTP check should be verified again
 * with Playwright.
 *
 * @param {object|null|undefined} result - Result of the HTTP check.
 * @returns {boolean} `false` whenever Playwright is disabled; otherwise `true`
 *   for a missing result, a "timeout"/"network" error, a non-numeric status,
 *   or a status listed in PLAYWRIGHT_RECHECK_STATUSES.
 */
function shouldRecheckWithPlaywright(result) {
  if (!PLAYWRIGHT_ENABLED) {
    return false;
  }

  // No usable answer at all, or a transport-level failure: always worth a second look.
  const transportFailed =
    !result || result.errorType === "timeout" || result.errorType === "network";
  if (transportFailed) {
    return true;
  }

  // A result without a numeric status is treated like a failed check.
  return typeof result.status !== "number" || PLAYWRIGHT_RECHECK_STATUSES.has(result.status);
}
|
||||
|
||||
function recordOccurrence(map, filePath, line, url) {
|
||||
if (!map.has(url)) {
|
||||
map.set(url, []);
|
||||
@@ -795,12 +828,24 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
|
||||
if (host) {
|
||||
await applyHostDelay(host);
|
||||
}
|
||||
const result = await checkUrl(entry.url, {
|
||||
let result = await checkUrl(entry.url, {
|
||||
...BASE_HTTP_OPTIONS,
|
||||
firstMethod: "GET",
|
||||
retryWithGet: false,
|
||||
});
|
||||
recordHostCheck(host);
|
||||
if (shouldRecheckWithPlaywright(result)) {
|
||||
if (host) {
|
||||
await applyHostDelay(host);
|
||||
}
|
||||
const playResult = await checkWithPlaywright(entry.url, {
|
||||
userAgent: DEFAULT_USER_AGENT,
|
||||
timeoutMs: PLAYWRIGHT_TIMEOUT_MS,
|
||||
executablePath: PLAYWRIGHT_EXECUTABLE,
|
||||
});
|
||||
recordHostCheck(host);
|
||||
result = playResult;
|
||||
}
|
||||
updateEntryWithResult(entries[entry.url], result);
|
||||
persistEntriesSnapshot(entries, snapshotMeta);
|
||||
processed += 1;
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
"cacheDir": "tools/cache",
|
||||
"cacheFile": "external_links.yaml",
|
||||
"hostDelayMs": 2000,
|
||||
"retryDelayMs": 5000,
|
||||
"requestTimeoutSeconds": 5,
|
||||
"cacheTtlSuccessDays": 30,
|
||||
"cacheTtlClientErrorDays": 7,
|
||||
@@ -15,6 +14,9 @@
|
||||
"userAgent": null,
|
||||
"enableCookies": true,
|
||||
"cookieJar": "tools/cache/curl_cookies.txt",
|
||||
"usePlaywright": true,
|
||||
"playwrightTimeoutSeconds": 10,
|
||||
"playwrightExecutablePath": "/nix/store/jaf9gnbln0cbs2vspfdblc4ff6vv1kk5-chromium-142.0.7444.175/bin/chromium",
|
||||
"ignoreHosts": [
|
||||
"10.0.2.1",
|
||||
"web.archive.org",
|
||||
@@ -107,4 +109,4 @@
|
||||
"goaccess": {
|
||||
"url": null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,10 @@ const DEFAULT_USER_AGENTS = [
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
];
|
||||
const DEFAULT_VIEWPORT = { width: 1366, height: 768 };
|
||||
const DEFAULT_PLAYWRIGHT_ARGS = ["--disable-blink-features=AutomationControlled"];
|
||||
|
||||
let playwrightModule = null;
|
||||
|
||||
function buildUserAgent(preferred) {
|
||||
if (typeof preferred === "string" && preferred.trim()) {
|
||||
@@ -112,6 +116,71 @@ function buildNavigationHeaders(url, userAgent, extraHeaders = {}) {
|
||||
return baseHeaders;
|
||||
}
|
||||
|
||||
/**
 * Returns the Playwright module, requiring it on first use and reusing the
 * module-level `playwrightModule` cache on later calls.
 *
 * @returns {object} The value produced by `require("playwright")`.
 */
function loadPlaywright() {
  // Lazy load: the dependency is only resolved when a Playwright check is requested.
  if (!playwrightModule) {
    playwrightModule = require("playwright");
  }
  return playwrightModule;
}
|
||||
|
||||
/**
 * Checks a URL through Playwright, getting close to what a real browser
 * navigation looks like.
 *
 * A headless Chromium is launched for the call, the URL is opened in a fresh
 * context, and both are closed again before returning.
 *
 * @param {string} url - URL to visit.
 * @param {object} [options]
 * @param {string} [options.userAgent] - Preferred user agent, passed to buildUserAgent().
 * @param {number} [options.timeoutMs] - Navigation timeout in milliseconds;
 *   DEFAULT_TIMEOUT_MS is used when this is not a finite number.
 * @param {string} [options.executablePath] - Browser binary to launch; blank or
 *   non-string values let Playwright pick its own.
 * @returns {Promise<{status: (number|null), finalUrl: string, method: string,
 *   errorType: (string|null), message: (string|null|undefined)}>}
 *   On a completed navigation: the response status (null when Playwright returns
 *   no response), the page URL after navigation, and `errorType: null`.
 *   On a failed navigation: `status: null`, `errorType` "timeout" (TimeoutError)
 *   or "network" (anything else), plus the error message.
 * @throws Any error raised while loading Playwright, launching the browser or
 *   creating the context/page: these must interrupt the script instead of being
 *   recorded as a link failure.
 */
async function checkWithPlaywright(url, options = {}) {
  const userAgent = buildUserAgent(options.userAgent);
  const timeoutMs = Number.isFinite(options.timeoutMs) ? options.timeoutMs : DEFAULT_TIMEOUT_MS;
  const executablePath =
    typeof options.executablePath === "string" && options.executablePath.trim()
      ? options.executablePath.trim()
      : null;
  const playwright = loadPlaywright();

  const browser = await playwright.chromium.launch({
    headless: true,
    executablePath: executablePath || undefined,
    args: DEFAULT_PLAYWRIGHT_ARGS,
  });

  // From here on the browser process exists: every exit path, including setup
  // failures in newContext()/newPage(), must go through browser.close().
  try {
    const context = await browser.newContext({
      viewport: { ...DEFAULT_VIEWPORT },
      userAgent,
      extraHTTPHeaders: buildNavigationHeaders(url, userAgent),
    });

    try {
      const page = await context.newPage();

      try {
        const response = await page.goto(url, { waitUntil: "domcontentloaded", timeout: timeoutMs });
        return {
          status: response ? response.status() : null,
          finalUrl: page.url() || url,
          method: "GET",
          errorType: null,
        };
      } catch (error) {
        // Only navigation failures are turned into a result object.
        return {
          status: null,
          finalUrl: url,
          method: "GET",
          errorType: error?.name === "TimeoutError" ? "timeout" : "network",
          message: error?.message || null,
        };
      }
    } finally {
      await context.close();
    }
  } finally {
    // Runs even when context.close() itself throws.
    await browser.close();
  }
}
|
||||
|
||||
async function fetchWithRedirects(targetUrl, options, maxRedirects) {
|
||||
let currentUrl = targetUrl;
|
||||
let response = null;
|
||||
@@ -226,4 +295,5 @@ module.exports = {
|
||||
checkUrl,
|
||||
probeUrl,
|
||||
shouldRetry,
|
||||
checkWithPlaywright,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user