Amélioration de la détection de liens morts
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const yaml = require("js-yaml");
|
||||
const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http");
|
||||
const { buildUserAgent, checkUrl } = require("./lib/http");
|
||||
const {
|
||||
collectMarkdownLinksFromFile,
|
||||
extractLinksFromText,
|
||||
@@ -18,7 +18,6 @@ const DEFAULT_CONFIG = {
|
||||
cacheDir: path.join(__dirname, "cache"),
|
||||
cacheFile: "external_links.yaml",
|
||||
hostDelayMs: 2000,
|
||||
retryDelayMs: 5000,
|
||||
requestTimeoutSeconds: 5,
|
||||
cacheTtlSuccessDays: 30,
|
||||
cacheTtlClientErrorDays: 7,
|
||||
@@ -27,6 +26,7 @@ const DEFAULT_CONFIG = {
|
||||
maxConcurrentHosts: 4,
|
||||
maxRedirects: 5,
|
||||
userAgent: null,
|
||||
ignoreHosts: [],
|
||||
};
|
||||
|
||||
function loadConfig() {
|
||||
@@ -57,7 +57,6 @@ const REPORT_PATH = path.isAbsolute(settings.cacheFile)
|
||||
: path.join(CACHE_DIR, settings.cacheFile);
|
||||
|
||||
const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
|
||||
const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
|
||||
const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
|
||||
const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000;
|
||||
const MAX_CONCURRENT_HOSTS = Math.max(
|
||||
@@ -73,6 +72,7 @@ const MAX_REDIRECTS = Math.max(
|
||||
: DEFAULT_CONFIG.maxRedirects
|
||||
);
|
||||
const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);
|
||||
const IGNORE_HOSTS = parseIgnoreHosts(settings.ignoreHosts);
|
||||
|
||||
const CACHE_TTL_SUCCESS_MS = daysToMs(
|
||||
pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
|
||||
@@ -103,6 +103,68 @@ function pickNumber(value, fallback) {
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/**
 * Normalizes a host name so that two hosts can be compared for equality.
 *
 * @param {*} value - Candidate host; anything that is not a string is rejected.
 * @returns {string|null} The trimmed, lower-cased host, or null when the
 *   input is not a string or is empty once trimmed.
 */
function normalizeHost(value) {
  if (typeof value !== "string") {
    return null;
  }
  const normalized = value.trim().toLowerCase();
  // An empty string carries no host information: report it as "no host".
  return normalized || null;
}
|
||||
|
||||
/**
 * Removes the optional port from a string describing a host.
 *
 * Supported shapes:
 *   - "[v6-literal]" or "[v6-literal]:port" -> the literal without brackets
 *   - "host:port" (exactly one colon)       -> "host"
 *   - anything else (no colon, or several colons without brackets, e.g. a
 *     bare IPv6 literal) is returned unchanged apart from lower-casing.
 *
 * @param {*} hostValue - Host string, possibly followed by a port.
 * @returns {string|null} The lower-cased host without its port, or null when
 *   the input is not a string, is blank, or holds nothing before the port
 *   separator (e.g. ":8080").
 */
function stripPort(hostValue) {
  if (typeof hostValue !== "string") {
    return null;
  }
  const trimmed = hostValue.trim();
  if (!trimmed) {
    return null;
  }

  // Bracketed literal, optionally followed by a numeric port.
  const bracketMatch = trimmed.match(/^\[([^\]]+)\](?::\d+)?$/);
  if (bracketMatch) {
    return bracketMatch[1].toLowerCase();
  }

  // A single colon separates host and port; several colons mean an
  // unbracketed IPv6 literal, which must be left untouched.
  const colonIndex = trimmed.lastIndexOf(":");
  if (colonIndex > -1 && trimmed.indexOf(":") === colonIndex) {
    const hostPart = trimmed.slice(0, colonIndex).trim();
    // Nothing in front of the colon means there is no host at all.
    return hostPart ? hostPart.toLowerCase() : null;
  }

  return trimmed.toLowerCase();
}
|
||||
|
||||
/**
 * Cleans one "host to ignore" configuration entry down to a bare host name.
 *
 * The entry may be written as a plain host, a host with a port, or a full
 * URL; the scheme, a protocol-relative "//" prefix, the path, the query, the
 * fragment, any "user:password@" credentials and the port are all removed.
 *
 * IPv6 literals are returned in the bracketed form produced by
 * `URL.hostname` (e.g. "[::1]"), because that is the representation the
 * hosts extracted from checked URLs are compared against.
 *
 * @param {*} value - Raw configuration entry.
 * @returns {string|null} The lower-cased host, or null when the entry is not
 *   a string or contains no host.
 */
function normalizeIgnoreHostEntry(value) {
  if (typeof value !== "string") {
    return null;
  }
  let candidate = value.trim().toLowerCase();
  if (!candidate) {
    return null;
  }

  candidate = candidate.replace(/^[a-z][a-z0-9+.-]*:\/\//, ""); // scheme://
  candidate = candidate.replace(/^\/\//, ""); // protocol-relative prefix
  candidate = candidate.replace(/\/.*$/, ""); // path
  candidate = candidate.replace(/[?#].*$/, ""); // query / fragment

  // Drop credentials; otherwise "user:pw@host" would be read as host "user"
  // followed by a port.
  const atIndex = candidate.lastIndexOf("@");
  if (atIndex > -1) {
    candidate = candidate.slice(atIndex + 1);
  }

  const host = stripPort(candidate);
  // Fewer than two colons: a regular host name (or nothing) - keep as is.
  if (!host || host.indexOf(":") === host.lastIndexOf(":")) {
    return host;
  }

  // Two or more colons: an IPv6 literal. Let the URL parser canonicalise it
  // so the result matches what URL.hostname yields for a checked link.
  try {
    return new URL(`http://[${host}]/`).hostname;
  } catch (_) {
    // Not parseable as an address (e.g. zone identifiers): keep the
    // bracketed text so the entry is at least stored consistently.
    return `[${host}]`;
  }
}
|
||||
|
||||
/**
 * Builds the set of hosts to ignore from the raw configuration value.
 *
 * @param {*} raw - Configuration value: an array of entries, or a single
 *   string entry (treated as a one-element list). Any other value yields an
 *   empty set.
 * @returns {Set<string>} Normalized hosts; entries that normalize to nothing
 *   are skipped and duplicates collapse naturally.
 */
function parseIgnoreHosts(raw) {
  // A scalar string is accepted as shorthand for a single-entry list so it
  // is not silently discarded.
  const entries = typeof raw === "string" ? [raw] : raw;
  const hosts = new Set();
  if (!Array.isArray(entries)) {
    return hosts;
  }
  for (const entry of entries) {
    const host = normalizeIgnoreHostEntry(entry);
    if (host) {
      hosts.add(host);
    }
  }
  return hosts;
}
|
||||
|
||||
function daysToMs(days) {
|
||||
if (!Number.isFinite(days) || days <= 0) {
|
||||
return 0;
|
||||
@@ -354,6 +416,22 @@ function filterReportLinks(links, activeUrls) {
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/**
 * Removes from the collected occurrences every URL whose host is ignored.
 *
 * @param {Map<string, *>} occurrences - Collected occurrences keyed by URL.
 * @param {Set<string>|null|undefined} ignoreHosts - Hosts to ignore.
 * @returns {Map<string, *>} The original map itself when there is nothing to
 *   ignore; otherwise a new map, in the same iteration order, holding only
 *   the entries to keep. URLs for which no host can be extracted are kept.
 */
function filterIgnoredHosts(occurrences, ignoreHosts) {
  if (!ignoreHosts || ignoreHosts.size === 0) {
    return occurrences;
  }

  const kept = new Map();
  for (const [url, urlOccurrences] of occurrences.entries()) {
    const host = extractHost(url);
    const isIgnored = Boolean(host) && ignoreHosts.has(host);
    if (!isIgnored) {
      kept.set(url, urlOccurrences);
    }
  }
  return kept;
}
|
||||
|
||||
function recordOccurrence(map, filePath, line, url) {
|
||||
if (!map.has(url)) {
|
||||
map.set(url, []);
|
||||
@@ -551,7 +629,8 @@ function recordHostCheck(host) {
|
||||
|
||||
function extractHost(url) {
|
||||
try {
|
||||
return new URL(url).hostname;
|
||||
const hostname = new URL(url).hostname;
|
||||
return normalizeHost(hostname);
|
||||
} catch (_) {
|
||||
return null;
|
||||
}
|
||||
@@ -716,18 +795,12 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
|
||||
if (host) {
|
||||
await applyHostDelay(host);
|
||||
}
|
||||
let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
|
||||
const result = await checkUrl(entry.url, {
|
||||
...BASE_HTTP_OPTIONS,
|
||||
firstMethod: "GET",
|
||||
retryWithGet: false,
|
||||
});
|
||||
recordHostCheck(host);
|
||||
if (shouldRetry(result)) {
|
||||
if (RETRY_DELAY_MS > 0) {
|
||||
await delay(RETRY_DELAY_MS);
|
||||
}
|
||||
if (host) {
|
||||
await applyHostDelay(host);
|
||||
}
|
||||
result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
|
||||
recordHostCheck(host);
|
||||
}
|
||||
updateEntryWithResult(entries[entry.url], result);
|
||||
persistEntriesSnapshot(entries, snapshotMeta);
|
||||
processed += 1;
|
||||
@@ -740,7 +813,8 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const occurrences = await collectOccurrences();
|
||||
let occurrences = await collectOccurrences();
|
||||
occurrences = filterIgnoredHosts(occurrences, IGNORE_HOSTS);
|
||||
if (occurrences.size === 0) {
|
||||
const emptyState = { generatedAt: new Date().toISOString(), links: [], entries: {} };
|
||||
saveState(emptyState);
|
||||
|
||||
Reference in New Issue
Block a user