1

Amélioration de la détection de liens morts

This commit is contained in:
2025-12-08 00:23:22 +01:00
parent 9f1a81f8f5
commit cb7cd917d7
3 changed files with 209 additions and 30 deletions

View File

@@ -3,7 +3,7 @@
const fs = require("fs");
const path = require("path");
const yaml = require("js-yaml");
const { buildUserAgent, probeUrl, shouldRetry } = require("./lib/http");
const { buildUserAgent, checkUrl } = require("./lib/http");
const {
collectMarkdownLinksFromFile,
extractLinksFromText,
@@ -18,7 +18,6 @@ const DEFAULT_CONFIG = {
cacheDir: path.join(__dirname, "cache"),
cacheFile: "external_links.yaml",
hostDelayMs: 2000,
retryDelayMs: 5000,
requestTimeoutSeconds: 5,
cacheTtlSuccessDays: 30,
cacheTtlClientErrorDays: 7,
@@ -27,6 +26,7 @@ const DEFAULT_CONFIG = {
maxConcurrentHosts: 4,
maxRedirects: 5,
userAgent: null,
ignoreHosts: [],
};
function loadConfig() {
@@ -57,7 +57,6 @@ const REPORT_PATH = path.isAbsolute(settings.cacheFile)
: path.join(CACHE_DIR, settings.cacheFile);
const HOST_DELAY_MS = Math.max(0, Number(settings.hostDelayMs) || 0);
const RETRY_DELAY_MS = Math.max(0, Number(settings.retryDelayMs) || 0);
const REQUEST_TIMEOUT_SECONDS = Math.max(1, Number(settings.requestTimeoutSeconds) || 5);
const REQUEST_TIMEOUT_MS = REQUEST_TIMEOUT_SECONDS * 1000;
const MAX_CONCURRENT_HOSTS = Math.max(
@@ -73,6 +72,7 @@ const MAX_REDIRECTS = Math.max(
: DEFAULT_CONFIG.maxRedirects
);
const DEFAULT_USER_AGENT = buildUserAgent(settings.userAgent);
const IGNORE_HOSTS = parseIgnoreHosts(settings.ignoreHosts);
const CACHE_TTL_SUCCESS_MS = daysToMs(
pickNumber(settings.cacheTtlSuccessDays, DEFAULT_CONFIG.cacheTtlSuccessDays)
@@ -103,6 +103,68 @@ function pickNumber(value, fallback) {
return fallback;
}
// Normalizes a host for comparison: surrounding whitespace removed, lowercased.
// Returns null when the value is not a string or is empty after trimming.
function normalizeHost(value) {
  const isText = typeof value === "string";
  const cleaned = isText ? value.trim().toLowerCase() : "";
  return cleaned.length > 0 ? cleaned : null;
}
// Removes an optional port from a string describing a host.
// - "[addr]" or "[addr]:port" yields the lowercased address without brackets.
// - A value with exactly one colon yields the lowercased part before the colon
//   (when that part is not empty).
// - Anything else is returned trimmed and lowercased.
// Returns null when the value is not a string or is blank.
function stripPort(hostValue) {
  const text = typeof hostValue === "string" ? hostValue.trim() : "";
  if (text === "") {
    return null;
  }

  // Bracketed literal, optionally followed by ":<digits>".
  const bracketed = /^\[([^\]]+)\](?::\d+)?$/.exec(text);
  if (bracketed !== null) {
    return bracketed[1].toLowerCase();
  }

  // Exactly one colon means "host:port"; several colons are left untouched.
  const segments = text.split(":");
  if (segments.length === 2) {
    const hostPart = segments[0].trim();
    if (hostPart !== "") {
      return hostPart.toLowerCase();
    }
  }

  return text.toLowerCase();
}
// Cleans an ignore-host configuration entry down to a host string.
// Removes, in order: a leading scheme ("https://"), a protocol-relative "//",
// the path, the query/fragment, any userinfo ("user:pass@"), then hands the
// remainder to stripPort to drop the port.
// Returns null when the value is not a string or is blank; otherwise returns
// whatever stripPort produces for the cleaned candidate.
function normalizeIgnoreHostEntry(value) {
  if (typeof value !== "string") {
    return null;
  }
  let candidate = value.trim().toLowerCase();
  if (!candidate) {
    return null;
  }
  candidate = candidate.replace(/^[a-z][a-z0-9+.-]*:\/\//, "");
  candidate = candidate.replace(/^\/\//, "");
  candidate = candidate.replace(/\/.*$/, "");
  candidate = candidate.replace(/[?#].*$/, "");
  // Drop credentials: once path, query and fragment are gone, everything up to
  // the last "@" is userinfo. Without this, "user:pw@host:port" would survive
  // as-is and could never match a URL hostname.
  candidate = candidate.replace(/^.*@/, "");
  return stripPort(candidate);
}
// Builds the set of hosts to ignore from the configuration value.
// Anything that is not an array yields an empty set; entries that
// normalizeIgnoreHostEntry cannot turn into a host are dropped.
function parseIgnoreHosts(raw) {
  const entries = Array.isArray(raw) ? raw : [];
  const hosts = entries
    .map((entry) => normalizeIgnoreHostEntry(entry))
    .filter(Boolean);
  return new Set(hosts);
}
function daysToMs(days) {
if (!Number.isFinite(days) || days <= 0) {
return 0;
@@ -354,6 +416,22 @@ function filterReportLinks(links, activeUrls) {
return filtered;
}
// Removes from the collected occurrences every URL whose host is ignored.
// `occurrences` is a Map keyed by URL; `ignoreHosts` is a Set of host strings.
// Returns the input Map untouched when there is nothing to ignore, otherwise a
// new Map containing only the entries that are kept. URLs whose host cannot be
// extracted are kept.
function filterIgnoredHosts(occurrences, ignoreHosts) {
  if (!ignoreHosts || ignoreHosts.size === 0) {
    return occurrences;
  }
  const filtered = new Map();
  for (const [url, urlOccurrences] of occurrences.entries()) {
    const host = extractHost(url);
    if (host) {
      // URL hostnames keep the brackets around IPv6 literals ("[::1]"), while
      // ignore entries are stored without them, so compare both forms.
      const isBracketed = host.startsWith("[") && host.endsWith("]");
      const bareHost = isBracketed ? host.slice(1, -1) : host;
      if (ignoreHosts.has(host) || ignoreHosts.has(bareHost)) {
        continue;
      }
    }
    filtered.set(url, urlOccurrences);
  }
  return filtered;
}
function recordOccurrence(map, filePath, line, url) {
if (!map.has(url)) {
map.set(url, []);
@@ -551,7 +629,8 @@ function recordHostCheck(host) {
function extractHost(url) {
try {
return new URL(url).hostname;
const hostname = new URL(url).hostname;
return normalizeHost(hostname);
} catch (_) {
return null;
}
@@ -716,18 +795,12 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
if (host) {
await applyHostDelay(host);
}
let result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "HEAD" });
const result = await checkUrl(entry.url, {
...BASE_HTTP_OPTIONS,
firstMethod: "GET",
retryWithGet: false,
});
recordHostCheck(host);
if (shouldRetry(result)) {
if (RETRY_DELAY_MS > 0) {
await delay(RETRY_DELAY_MS);
}
if (host) {
await applyHostDelay(host);
}
result = await probeUrl(entry.url, { ...BASE_HTTP_OPTIONS, method: "GET" });
recordHostCheck(host);
}
updateEntryWithResult(entries[entry.url], result);
persistEntriesSnapshot(entries, snapshotMeta);
processed += 1;
@@ -740,7 +813,8 @@ async function checkEntries(entriesToCheck, entries, snapshotMeta) {
}
async function main() {
const occurrences = await collectOccurrences();
let occurrences = await collectOccurrences();
occurrences = filterIgnoredHosts(occurrences, IGNORE_HOSTS);
if (occurrences.size === 0) {
const emptyState = { generatedAt: new Date().toISOString(), links: [], entries: {} };
saveState(emptyState);