1

Correction semi-automatique des liens morts

This commit is contained in:
2025-11-01 15:40:40 +01:00
parent f8b824c540
commit 890c95a450
36 changed files with 405 additions and 105 deletions

View File

@@ -285,7 +285,10 @@ async function checkLink(url) {
let info = runResults.get(url);
if (!info) {
const cachedInfo = cache[url];
if (!isCacheValid(cachedInfo)) {
if (cachedInfo?.manually_killed === true) {
// Do not re-test manually killed links
info = cachedInfo;
} else if (!isCacheValid(cachedInfo)) {
const host = extractHost(url);
if (host) {
await applyHostDelay(host);
@@ -304,12 +307,13 @@ async function checkLink(url) {
}
info = {
...(cachedInfo || {}),
status: result.status ?? null,
errorType: result.errorType || null,
method: result.method,
checked: new Date().toISOString(),
};
cache[url] = info;
cache[url] = info; // preserves files, manual flags, etc.
cacheDirty = true;
persistCache();
} else if (cachedInfo) {
@@ -540,10 +544,43 @@ function writeReport(entries) {
if (cachePruned) {
cacheDirty = true;
}
// Update file paths, line numbers and ensure manual flags exist
for (const entry of uniqueEntries) {
const files = Array.from(
new Set(entry.occurrences.map((o) => path.relative(SITE_ROOT, o.file)))
).sort((a, b) => a.localeCompare(b));
const locations = Array.from(
new Set(
entry.occurrences.map(
(o) => `${path.relative(SITE_ROOT, o.file)}:${o.line}`
)
)
).sort((a, b) => a.localeCompare(b));
const existing = cache[entry.url] || {};
cache[entry.url] = {
...existing,
manually_validated: existing.manually_validated === true,
manually_killed: existing.manually_killed === true,
files,
locations,
};
cacheDirty = true;
}
if (cacheDirty) {
ensureDirectoryExists(CACHE_PATH);
fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
cacheDirty = false;
}
// Exclude manually killed from re-checking and reporting
const entriesToCheck = uniqueEntries.filter(
(e) => !(cache[e.url] && cache[e.url].manually_killed === true)
);
ensureDirectoryExists(PROGRESS_FILE);
fs.writeFileSync(PROGRESS_FILE, `"url","locations","status"\n`, "utf8");
const total = uniqueEntries.length;
const total = entriesToCheck.length;
if (total === 0) {
process.stdout.write("No external links found.\n");
ensureDirectoryExists(CACHE_PATH);
@@ -552,7 +589,7 @@ function writeReport(entries) {
return;
}
const hostGroups = groupEntriesByHost(uniqueEntries);
const hostGroups = groupEntriesByHost(entriesToCheck);
const concurrency = Math.max(1, Math.min(MAX_CONCURRENT_HOSTS, hostGroups.length || 1));
let processed = 0;
await runWithConcurrency(

View File

@@ -8,12 +8,12 @@
"hostDelayMs": 2000,
"retryDelayMs": 5000,
"requestTimeoutSeconds": 5,
"cacheTtlSuccessDays": 7,
"cacheTtlClientErrorDays": 0,
"cacheTtlSuccessDays": 30,
"cacheTtlClientErrorDays": 7,
"outputFormat": "markdown",
"outputFile": "tools/cache/external_links_report.md",
"userAgent": null,
"enableCookies": true,
"cookieJar": "tools/cache/curl_cookies.txt"
}
}
}

View File

@@ -15,6 +15,14 @@ function trimUnbalancedTrailing(value, openChar, closeChar) {
return result;
}
/**
 * Removes the whole run of punctuation and quote characters found at the end
 * of a string.
 *
 * Stripped characters: . , ; : ! ? ' " and the curly quotes U+2018, U+2019,
 * U+201C and U+201D.
 *
 * @param {string} value - Text to clean.
 * @returns {string} The input without its trailing punctuation run.
 */
function stripTrailingPunctuation(value) {
  const trailingRun = /[.,;:!?'"\u2018\u2019\u201C\u201D]+$/;
  if (!trailingRun.test(value)) {
    return value;
  }
  return value.replace(trailingRun, "");
}
function sanitizeUrlCandidate(raw, options = {}) {
if (typeof raw !== "string") return null;
let candidate = raw.trim();
@@ -24,9 +32,7 @@ function sanitizeUrlCandidate(raw, options = {}) {
candidate = candidate.slice(1, -1).trim();
}
while (/[.,;:!?'"\u2018\u2019\u201C\u201D]+$/.test(candidate)) {
candidate = candidate.slice(0, -1);
}
candidate = stripTrailingPunctuation(candidate);
if (!options.keepTrailingParens) {
candidate = trimUnbalancedTrailing(candidate, "(", ")");
@@ -39,9 +45,15 @@ function sanitizeUrlCandidate(raw, options = {}) {
}
candidate = trimUnbalancedTrailing(candidate, "[", "]");
candidate = trimUnbalancedTrailing(candidate, "{", "}");
candidate = stripTrailingPunctuation(candidate);
candidate = candidate.replace(/[)]+$/g, (suffix) => {
const toTrim = !options.keepTrailingParens ? suffix.length : Math.max(0, suffix.length - 1);
return ")".repeat(suffix.length - toTrim);
});
candidate = candidate.replace(/[*_]+$/, "");
candidate = candidate.replace(/\[\^[^\]]*\]$/, "");
candidate = stripTrailingPunctuation(candidate);
if (!options.keepTrailingParens) {
candidate = trimUnbalancedTrailing(candidate, "(", ")");
}
@@ -201,6 +213,7 @@ async function collectMarkdownLinksFromStream(stream) {
lineNumber++;
const trimmed = line.trim();
// Skip YAML front matter entirely; only scan Markdown content
if (lineNumber === 1 && trimmed === "---") {
inFrontMatter = true;
continue;
@@ -208,11 +221,8 @@ async function collectMarkdownLinksFromStream(stream) {
if (inFrontMatter) {
if (trimmed === "---") {
inFrontMatter = false;
continue;
}
if (trimmed.startsWith("#")) {
continue;
}
continue;
}
for (const url of extractLinksFromText(line)) {

View File

@@ -24,7 +24,8 @@ test("collectMarkdownLinksFromStream preserves line numbers", async () => {
"Markdown [link](https://docs.example.org/page(with more valid content)).",
"Le **[baume du Canada](https://fr.wikipedia.org/wiki/Baume_du_Canada)**",
"(_Theropoda [incertae sedis](https://fr.wikipedia.org/wiki/Incertae_sedis)_)",
"[CDN](https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu)[^2]."
"[CDN](https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu)[^2].",
"(heu... [oui](https://github.com/opencart/opencart/tree/master/upload/system/storage/vendor)...)"
].join("\n");
const stream = Readable.from([content]);
const links = await collectMarkdownLinksFromStream(stream);
@@ -36,10 +37,11 @@ test("collectMarkdownLinksFromStream preserves line numbers", async () => {
{ url: "https://fr.wikipedia.org/wiki/Baume_du_Canada", line: 6 },
{ url: "https://fr.wikipedia.org/wiki/Incertae_sedis", line: 7 },
{ url: "https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu", line: 8 },
{ url: "https://github.com/opencart/opencart/tree/master/upload/system/storage/vendor", line: 9 },
]);
});
test("collectMarkdownLinksFromStream ignores URLs in front matter comments", async () => {
test("collectMarkdownLinksFromStream ignores URLs in front matter entirely", async () => {
const content = [
"---",
"links:",
@@ -51,7 +53,6 @@ test("collectMarkdownLinksFromStream ignores URLs in front matter comments", asy
const stream = Readable.from([content]);
const links = await collectMarkdownLinksFromStream(stream);
assert.deepStrictEqual(links, [
{ url: "https://included.example.com", line: 4 },
{ url: "https://body.example.com", line: 6 },
]);
});

View File

@@ -0,0 +1,254 @@
const fs = require("fs");
const path = require("path");
const util = require("util");
const yaml = require("js-yaml");
const readline = require("readline");
const { execFile } = require("child_process");
const execFileAsync = util.promisify(execFile);
// Site root: the parent of the directory that contains this script.
const SITE_ROOT = path.resolve(__dirname, "..");
// Optional JSON configuration stored next to this script.
const CONFIG_PATH = path.join(__dirname, "config.json");

// Parsed configuration; stays empty when the file is missing or cannot be parsed.
let config = {};
if (fs.existsSync(CONFIG_PATH)) {
  try {
    const rawConfig = fs.readFileSync(CONFIG_PATH, "utf8");
    config = JSON.parse(rawConfig);
  } catch (error) {
    // A broken config file is not fatal: warn and keep the defaults.
    console.warn(
      `Impossible de parser ${path.relative(
        SITE_ROOT,
        CONFIG_PATH
      )}. Valeurs par défaut utilisées. (${error.message})`
    );
  }
}

// Defaults come first so that any key of the "externalLinks" section overrides them.
const externalConfig = Object.assign(
  {
    cacheDir: path.join(__dirname, "cache"),
    cacheFile: "external_links.yaml",
  },
  config.externalLinks || {}
);

// A relative cache directory is resolved against the site root.
const CACHE_DIR = !path.isAbsolute(externalConfig.cacheDir)
  ? path.resolve(SITE_ROOT, externalConfig.cacheDir)
  : externalConfig.cacheDir;

// A relative cache file name is placed inside the cache directory.
const CACHE_PATH = !path.isAbsolute(externalConfig.cacheFile)
  ? path.join(CACHE_DIR, externalConfig.cacheFile)
  : externalConfig.cacheFile;
/**
 * Creates the directory that will contain `targetFile`, including any missing
 * ancestors. The file itself is not created.
 *
 * @param {string} targetFile - Path of a file whose parent directory must exist.
 * @returns {void}
 */
function ensureDirectoryExists(targetFile) {
  const parentDirectory = path.dirname(targetFile);
  fs.mkdirSync(parentDirectory, { recursive: true });
}
/**
 * Reads the YAML cache file at CACHE_PATH.
 *
 * @returns {object} The parsed cache, or an empty object when the file does
 *   not exist, parses to a falsy value, or cannot be read/parsed (the error
 *   message is logged in that last case).
 */
function loadCache() {
  if (!fs.existsSync(CACHE_PATH)) {
    return {};
  }
  let parsed;
  try {
    const yamlText = fs.readFileSync(CACHE_PATH, "utf8");
    parsed = yaml.load(yamlText);
  } catch (e) {
    console.error("Erreur de lecture du cache YAML:", e.message);
    return {};
  }
  // An empty document parses to a falsy value; normalise it to an empty cache.
  return parsed || {};
}
/**
 * Serialises the cache to YAML and writes it to CACHE_PATH, creating the
 * parent directory first when needed.
 *
 * @param {object} cache - Cache object to persist.
 * @returns {void}
 */
function saveCache(cache) {
  ensureDirectoryExists(CACHE_PATH);
  const serialized = yaml.dump(cache);
  fs.writeFileSync(CACHE_PATH, serialized, "utf8");
}
/**
 * Builds a small prompt helper on top of a readline interface bound to
 * process.stdin / process.stdout.
 *
 * @returns {{ask: function(string): Promise<string>, close: function(): void}}
 *   `ask` prints the question and resolves with the trimmed answer;
 *   `close` closes the underlying readline interface.
 */
function promptFactory() {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });

  return {
    ask(q) {
      return new Promise((resolve) => {
        rl.question(q, (answer) => {
          resolve(answer.trim());
        });
      });
    },
    close() {
      rl.close();
    },
  };
}
/**
 * Runs the link checker once when no cache file exists yet, so that there is
 * something to triage. Does nothing when CACHE_PATH already exists.
 *
 * @returns {Promise<void>} Resolves when the checker has finished (or
 *   immediately when the cache is present). Rejects if the child process
 *   fails.
 */
async function ensureCheckRanIfNeeded() {
  if (fs.existsSync(CACHE_PATH)) return;
  console.log(
    "Cache introuvable. Exécution préalable de tools/check_external_links.js..."
  );
  const checkerScript = path.join(__dirname, "check_external_links.js");
  // Use the Node binary that is running this script rather than whichever
  // "node" happens to be first on PATH (which may be missing or a different version).
  await execFileAsync(process.execPath, [checkerScript], {
    cwd: SITE_ROOT,
    env: process.env,
    // execFile buffers the child's output and terminates the child once the
    // default 1 MiB limit is exceeded; leave generous room for the checker's output.
    maxBuffer: 64 * 1024 * 1024,
  });
}
/**
 * Selects the cache entries that still need manual attention.
 *
 * An entry is kept when its `status` is a number that is either 0 or >= 400,
 * and it is flagged neither `manually_killed` nor `manually_validated`.
 *
 * @param {object} cache - Map of URL to cached check information.
 * @returns {Array<{url: string, info: object}>} Matching entries, in cache order.
 */
function listBrokenUrls(cache) {
  const needsAttention = (info) => {
    if (!info) return false;
    // Links already triaged by hand are never offered again.
    if (info.manually_killed === true) return false;
    if (info.manually_validated === true) return false;
    if (typeof info.status !== "number") return false;
    return info.status >= 400 || info.status === 0;
  };

  return Object.entries(cache)
    .filter(([, info]) => needsAttention(info))
    .map(([url, info]) => ({ url, info }));
}
/**
 * Returns the absolute paths of the files recorded for a cache entry.
 *
 * Uses `info.files` when it is a non-empty array; otherwise derives the file
 * list from `info.locations` by keeping the text before the first ":" of each
 * location and removing duplicates.
 *
 * @param {object} [info] - Cache entry for one URL.
 * @returns {string[]} Paths resolved against SITE_ROOT (empty when nothing is recorded).
 */
function getFilesForUrl(info) {
  const isNonEmptyArray = (list) => Array.isArray(list) && list.length > 0;

  let relativePaths = [];
  if (isNonEmptyArray(info?.files)) {
    relativePaths = info.files;
  } else if (isNonEmptyArray(info?.locations)) {
    const unique = new Set();
    for (const location of info.locations) {
      unique.add(String(location).split(":")[0]);
    }
    relativePaths = [...unique];
  }
  return relativePaths.map((relativePath) => path.resolve(SITE_ROOT, relativePath));
}
/**
 * Replaces every occurrence of `from` with `to` in a UTF-8 text file.
 *
 * The match is a plain substring match, so an occurrence of `from` that is
 * the prefix of a longer URL is replaced as well.
 *
 * @param {string} filePath - File to rewrite in place.
 * @param {string} from - Text to look for. An empty string is rejected,
 *   because splitting on it would insert `to` between every character.
 * @param {string} to - Replacement text, inserted literally.
 * @returns {{changed: boolean}} `changed` is true only when the file was rewritten.
 */
function replaceInFile(filePath, from, to) {
  // Guard: an empty search string "matches" everywhere and would mangle the file.
  if (from === "") return { changed: false };
  if (!fs.existsSync(filePath)) return { changed: false };
  const original = fs.readFileSync(filePath, "utf8");
  if (!original.includes(from)) return { changed: false };
  // split/join keeps `to` literal (no "$&"-style replacement patterns).
  const updated = original.split(from).join(to);
  if (updated === original) {
    return { changed: false };
  }
  fs.writeFileSync(filePath, updated, "utf8");
  return { changed: true };
}
/**
 * Interactive triage of the broken links recorded in the cache.
 *
 * For each URL returned by listBrokenUrls, prints its status and recorded
 * locations, then asks for an action:
 *   - i (default): skip the URL;
 *   - c: mark it as manually validated (status forced to 200, method "MANUAL");
 *   - m: mark it as manually killed;
 *   - r: replace the URL in the recorded files and move the cache entry to
 *        the new URL;
 *   - q: stop the loop.
 * The cache is saved after every action that modifies it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await ensureCheckRanIfNeeded();
  const cache = loadCache();
  const broken = listBrokenUrls(cache);
  if (broken.length === 0) {
    console.log("Aucun lien en erreur (>= 400) à traiter.");
    return;
  }
  const p = promptFactory();
  try {
    for (const { url, info } of broken) {
      const statusLabel = typeof info.status === "number" ? String(info.status) : "inconnu";
      const locations = Array.isArray(info.locations) ? info.locations : [];
      // Without a recorded file list, derive it from the "file:line" locations.
      const files = Array.isArray(info.files)
        ? info.files
        : Array.from(new Set(locations.map((s) => String(s).split(":")[0])));
      console.log("\nURL: ", url);
      console.log("Statut: ", statusLabel);
      if (locations.length > 0) {
        console.log("Emplacements:");
        for (const loc of locations) console.log(" - ", loc);
      } else if (files.length > 0) {
        console.log("Emplacements:");
        for (const f of files) console.log(" - ", `${f}:?`);
      } else {
        console.log("Fichiers: (aucun chemin enregistré)");
      }
      // An empty answer falls back to "i" (ignore).
      const choice = (
        await p.ask(
          "Action ? [i]gnorer, [c]onfirmer, [r]emplacer, [m]ort, [q]uitter (défaut: i) : "
        )
      ).toLowerCase() || "i";
      if (choice === "q") {
        console.log("Arrêt demandé.");
        break;
      }
      if (choice === "i") {
        // Ignore: leave the entry untouched.
        continue;
      }
      if (choice === "c") {
        const nowIso = new Date().toISOString();
        cache[url] = {
          ...(cache[url] || {}),
          manually_validated: true,
          manually_killed: cache[url]?.manually_killed === true,
          status: 200,
          errorType: null,
          method: "MANUAL",
          checked: nowIso,
        };
        saveCache(cache);
        console.log("Marqué comme validé manuellement.");
        continue;
      }
      if (choice === "m") {
        // Keep the last observed status/error/method; only the flag changes.
        cache[url] = {
          ...(cache[url] || {}),
          manually_killed: true,
          manually_validated: cache[url]?.manually_validated === true,
          status: cache[url]?.status ?? null,
          errorType: cache[url]?.errorType ?? null,
          method: cache[url]?.method ?? null,
        };
        saveCache(cache);
        console.log("Marqué comme mort (plus jamais retesté).");
        continue;
      }
      if (choice === "r") {
        if (!(Array.isArray(files) && files.length > 0)) {
          console.log(
            "Impossible de remplacer: aucun fichier enregistré pour cet URL. Relancez d'abord tools/check_external_links.js."
          );
          continue;
        }
        const newUrl = await p.ask("Nouvel URL: ");
        if (!newUrl || !newUrl.includes("://")) {
          console.log("URL invalide, opération annulée.");
          continue;
        }
        if (newUrl === url) {
          // Moving an entry onto itself would write cache[newUrl] and then
          // delete that very key, losing the entry altogether.
          console.log("Le nouvel URL est identique à l'ancien, opération annulée.");
          continue;
        }
        // Replace the URL in every listed file.
        let changedFiles = 0;
        for (const rel of files) {
          const abs = path.resolve(SITE_ROOT, rel);
          const { changed } = replaceInFile(abs, url, newUrl);
          if (changed) changedFiles++;
        }
        console.log(`Remplacements effectués dans ${changedFiles} fichier(s).`);
        // Update the cache: move the entry to the new key and reset its
        // check results so that the new URL gets tested on the next run.
        const oldEntry = cache[url] || {};
        const newEntryExisting = cache[newUrl] || {};
        cache[newUrl] = {
          ...newEntryExisting,
          files: Array.isArray(oldEntry.files) ? [...oldEntry.files] : files,
          locations: Array.isArray(oldEntry.locations)
            ? [...oldEntry.locations]
            : Array.isArray(oldEntry.files)
            ? oldEntry.files.map((f) => `${f}:?`)
            : Array.isArray(locations)
            ? [...locations]
            : [],
          manually_validated: false,
          manually_killed: false,
          status: null,
          errorType: null,
          method: newEntryExisting.method || null,
          checked: null,
        };
        delete cache[url];
        saveCache(cache);
        console.log("Base mise à jour pour le nouvel URL.");
        continue;
      }
      console.log("Choix non reconnu. Ignoré.");
    }
  } finally {
    // Always release stdin, even when an action throws.
    p.close();
  }
  console.log("\nTerminé. Vous pouvez relancer 'node tools/check_external_links.js' pour mettre à jour les statuts.");
}
// Script entry point: report any unhandled failure and make the process exit with a non-zero code.
main().catch((failure) => {
  process.exitCode = 1;
  console.error("Erreur:", failure);
});