Improved detection of dead external links
File diff suppressed because it is too large
@@ -1,73 +1,337 @@
const fs = require("fs");
const path = require("path");
const yaml = require("js-yaml");
const { scrapePage } = require("./lib/puppeteer");
const readline = require("readline");
const util = require("util");
const { execFile } = require("child_process");
const UserAgent = require("user-agents");
const {
  collectMarkdownLinksFromFile,
  extractLinksFromText,
} = require("./lib/markdown_links");

const CONTENT_DIR = path.join(__dirname, "..", "content");
const DATA_DIR = path.join(__dirname, "..", "data");
const SITE_ROOT = path.resolve(__dirname, "..");
const CACHE_PATH = path.join(DATA_DIR, "external_links.yaml");
const CACHE_TTL_DAYS = 7;
const CONFIG_PATH = path.join(__dirname, "config.json");

let config = {};
if (fs.existsSync(CONFIG_PATH)) {
  try {
    config = JSON.parse(fs.readFileSync(CONFIG_PATH, "utf8"));
  } catch (error) {
    console.warn(
      `Failed to parse ${path.relative(SITE_ROOT, CONFIG_PATH)}. Using defaults. (${error.message})`
    );
  }
}

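// Built-in defaults for the external link checker; any values set under the
// "externalLinks" key of config.json override them via the spread below.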
const externalConfig = {
  cacheDir: path.join(__dirname, "cache"),
  cacheFile: "external_links.yaml",
  hostDelayMs: 2000,
  retryDelayMs: 5000,
  requestTimeoutSeconds: 5,
  cacheTtlSuccessDays: 7,
  cacheTtlClientErrorDays: 0,
  outputFormat: "markdown",
  outputFile: path.join(__dirname, "cache", "external_links_report.md"),
  userAgent: null,
  enableCookies: true,
  cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
  ...(config.externalLinks || {}),
};

const CONTENT_DIR = path.join(SITE_ROOT, "content");
const CACHE_DIR = path.isAbsolute(externalConfig.cacheDir)
  ? externalConfig.cacheDir
  : path.resolve(SITE_ROOT, externalConfig.cacheDir);
const CACHE_PATH = path.isAbsolute(externalConfig.cacheFile)
  ? externalConfig.cacheFile
  : path.join(CACHE_DIR, externalConfig.cacheFile);
const OUTPUT_FILE = path.isAbsolute(externalConfig.outputFile)
  ? externalConfig.outputFile
  : path.resolve(SITE_ROOT, externalConfig.outputFile);
const COOKIE_JAR = externalConfig.cookieJar
  ? path.isAbsolute(externalConfig.cookieJar)
    ? externalConfig.cookieJar
    : path.resolve(SITE_ROOT, externalConfig.cookieJar)
  : path.join(CACHE_DIR, "curl_cookies.txt");

const CACHE_TTL_SUCCESS_DAYS = Number(externalConfig.cacheTtlSuccessDays) || 0;
const CACHE_TTL_CLIENT_ERROR_DAYS = Number(externalConfig.cacheTtlClientErrorDays) || 0;
const HOST_DELAY_MS = Number(externalConfig.hostDelayMs) || 0;
const RETRY_DELAY_MS = Number(externalConfig.retryDelayMs) || 0;
const REQUEST_TIMEOUT_SECONDS = Number(externalConfig.requestTimeoutSeconds) || 0;
const maxConcurrentConfig = Number(externalConfig.maxConcurrentHosts);
const MAX_CONCURRENT_HOSTS =
  Number.isFinite(maxConcurrentConfig) && maxConcurrentConfig > 0
    ? maxConcurrentConfig
    : 4;
const DEFAULT_USER_AGENT =
  typeof externalConfig.userAgent === "string" && externalConfig.userAgent.trim()
    ? externalConfig.userAgent.trim()
    : new UserAgent().toString();
const ENABLE_COOKIES = externalConfig.enableCookies !== false;
const PROGRESS_FILE = path.join(__dirname, "cache", "external_links_progress.csv");
const execFileAsync = util.promisify(execFile);

fs.mkdirSync(CACHE_DIR, { recursive: true });
if (ENABLE_COOKIES) {
  fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
  if (!fs.existsSync(COOKIE_JAR)) {
    fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
  }
}

try {
  if (fs.existsSync(PROGRESS_FILE)) {
    fs.unlinkSync(PROGRESS_FILE);
  }
} catch (error) {
  console.warn(`Unable to remove existing progress file: ${error.message}`);
}

let cache = {};
if (fs.existsSync(CACHE_PATH)) {
  cache = yaml.load(fs.readFileSync(CACHE_PATH, "utf8")) || {};
}
let cacheDirty = false;

const now = new Date();
const BAD_LINKS = [];
const lastHostChecks = new Map();
const runResults = new Map();

function isExternalLink(link) {
  return typeof link === "string" && link.includes("://");
function updateProgress(processed, total) {
  process.stdout.write(`\rURL ${processed}/${total}`);
}

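// Cache validity now depends on the recorded HTTP status: successes (< 400) and
// client errors (4xx) each have their own TTL, while anything else (5xx,
// missing or non-numeric status) is always re-checked.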
function isCacheValid(entry) {
  if (!entry?.checked) return false;
  const date = new Date(entry.checked);
  return (now - date) / (1000 * 60 * 60 * 24) < CACHE_TTL_DAYS;
  const ttlDays = (() => {
    const status = entry.status;
    if (typeof status === "number") {
      if (status < 400) return CACHE_TTL_SUCCESS_DAYS;
      if (status < 500) return CACHE_TTL_CLIENT_ERROR_DAYS;
      return 0;
    }
    return 0;
  })();
  if (ttlDays <= 0) return false;
  return (now - date) / (1000 * 60 * 60 * 24) < ttlDays;
}

function extractLinksFromText(text) {
  const regex = /\bhttps?:\/\/[^\s)"'>]+/g;
  return text.match(regex) || [];
}

async function checkLink(file, line, url) {
  if (isCacheValid(cache[url])) return;

  const meta = await scrapePage(url, null, { screenshot: false });
  cache[url] = {
    status: meta.httpStatus || null,
    checked: new Date().toISOString(),
  };

  const bundle = path.relative(SITE_ROOT, file);

  if (!meta.httpStatus || meta.httpStatus >= 400) {
    BAD_LINKS.push({ bundle, url, line, status: meta.httpStatus });
    process.stdout.write("❌");
  } else {
    process.stdout.write("✔");
async function collectMarkdownLinks(filePath, occurrencesMap) {
  const entries = await collectMarkdownLinksFromFile(filePath);
  for (const { url, line } of entries) {
    recordOccurrence(occurrencesMap, filePath, line, url);
  }
}

async function processMarkdown(filePath) {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber++;
    const links = extractLinksFromText(line);
    for (const link of links) {
      await checkLink(filePath, lineNumber, link);
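
// Registers one file:line occurrence of a URL, skipping exact duplicates.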
function recordOccurrence(occurrencesMap, filePath, lineNumber, url) {
  if (!occurrencesMap.has(url)) {
    occurrencesMap.set(url, { url, occurrences: [] });
  }
  const entry = occurrencesMap.get(url);
  const alreadyRecorded = entry.occurrences.some(
    (item) => item.file === filePath && item.line === lineNumber
  );
  if (!alreadyRecorded) {
    entry.occurrences.push({ file: filePath, line: lineNumber });
  }
}

function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

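// Per-host politeness delay: waits until HOST_DELAY_MS has elapsed since the
// previous request to the same hostname.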
async function applyHostDelay(host) {
  if (!host) return;
  const last = lastHostChecks.get(host);
  if (last) {
    const elapsed = Date.now() - last;
    const waitTime = HOST_DELAY_MS - elapsed;
    if (waitTime > 0) {
      await delay(waitTime);
    }
  }
}

function processYamlRecursively(obj, links = []) {
  if (typeof obj === "string" && isExternalLink(obj)) {
    links.push(obj);
function recordHostCheck(host) {
  if (host) {
    lastHostChecks.set(host, Date.now());
  }
}

function extractHost(url) {
  try {
    return new URL(url).hostname;
  } catch (_) {
    return null;
  }
}

function persistCache() {
  if (!cacheDirty) return;
  ensureDirectoryExists(CACHE_PATH);
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
  cacheDirty = false;
}

function formatLocations(occurrences) {
  return occurrences
    .map(({ file, line }) => `${path.relative(SITE_ROOT, file)}:${line}`)
    .join("; ");
}

function escapeCsvField(value) {
  const stringValue = String(value);
  if (/[",\n]/.test(stringValue)) {
    return `"${stringValue.replace(/"/g, '""')}"`;
  }
  return stringValue;
}

function appendProgress(url, occurrences, status) {
  const locationText = formatLocations(occurrences);
  const statusText =
    typeof status === "number" && status < 400 && status !== null ? "" : status ?? "";
  const line = [
    escapeCsvField(url),
    escapeCsvField(locationText),
    escapeCsvField(statusText),
  ].join(",");
  fs.appendFileSync(PROGRESS_FILE, `${line}\n`);
}

function groupEntriesByHost(entries) {
  const result = new Map();
  for (const entry of entries) {
    const host = extractHost(entry.url);
    const key = host || `__invalid__:${entry.url}`;
    if (!result.has(key)) {
      result.set(key, { host, entries: [] });
    }
    result.get(key).entries.push(entry);
  }
  return Array.from(result.values());
}

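// Small concurrency limiter: schedules one promise per item but never keeps
// more than `concurrency` of them in flight at once.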
async function runWithConcurrency(items, worker, concurrency) {
  const executing = new Set();
  const promises = [];
  for (const item of items) {
    const promise = Promise.resolve().then(() => worker(item));
    promises.push(promise);
    executing.add(promise);
    const clean = () => executing.delete(promise);
    promise.then(clean).catch(clean);
    if (executing.size >= concurrency) {
      await Promise.race(executing);
    }
  }
  return Promise.all(promises);
}

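// Issues a single curl request (HEAD or GET). Thanks to --write-out, the HTTP
// code is available on stdout even when --fail makes curl exit non-zero; curl
// exit code 28 is mapped to a "timeout" error type.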
async function curlRequest(url, method) {
  const args = [
    "--silent",
    "--location",
    "--fail",
    "--max-time",
    `${REQUEST_TIMEOUT_SECONDS}`,
    "--output",
    "/dev/null",
    "--write-out",
    "%{http_code}",
    "--user-agent",
    DEFAULT_USER_AGENT,
    "--request",
    method,
    url,
  ];

  if (ENABLE_COOKIES) {
    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
  }

  try {
    const { stdout } = await execFileAsync("curl", args);
    const status = parseInt(stdout.trim(), 10);
    return {
      status: Number.isNaN(status) ? null : status,
      errorType: null,
      method: method.toUpperCase(),
    };
  } catch (error) {
    const rawStatus = error?.stdout?.toString().trim();
    const status = rawStatus ? parseInt(rawStatus, 10) : null;
    const errorCode = Number(error?.code);
    const errorType = errorCode === 28 ? "timeout" : null;
    return {
      status: Number.isNaN(status) ? null : status,
      errorType,
      method: method.toUpperCase(),
    };
  }
}

function shouldRetryWithGet(result) {
  if (result.errorType) return true;
  if (result.status === null) return true;
  return result.status >= 400;
}

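// Resolves a URL's status: first from this run's memoized results, then from
// the YAML cache, and finally via a live HEAD request with a GET fallback when
// the HEAD attempt fails or returns an error status.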
async function checkLink(url) {
  let info = runResults.get(url);
  if (!info) {
    const cachedInfo = cache[url];
    if (!isCacheValid(cachedInfo)) {
      const host = extractHost(url);
      if (host) {
        await applyHostDelay(host);
      }

      let result = await curlRequest(url, "HEAD");
      recordHostCheck(host);

      if (shouldRetryWithGet(result)) {
        await delay(RETRY_DELAY_MS);
        if (host) {
          await applyHostDelay(host);
        }
        result = await curlRequest(url, "GET");
        recordHostCheck(host);
      }

      info = {
        status: result.status ?? null,
        errorType: result.errorType || null,
        method: result.method,
        checked: new Date().toISOString(),
      };
      cache[url] = info;
      cacheDirty = true;
      persistCache();
    } else if (cachedInfo) {
      info = cachedInfo;
    } else {
      info = {
        status: null,
        errorType: "unknown",
        method: "HEAD",
        checked: new Date().toISOString(),
      };
    }
    runResults.set(url, info);
  }
  return info;
}

function processYamlRecursively(obj, links = new Set()) {
  if (typeof obj === "string") {
    for (const link of extractLinksFromText(obj)) {
      links.add(link);
    }
  } else if (Array.isArray(obj)) {
    for (const item of obj) processYamlRecursively(item, links);
  } else if (typeof obj === "object" && obj !== null) {
@@ -76,15 +340,117 @@ function processYamlRecursively(obj, links = []) {
  return links;
}

async function processYaml(filePath) {
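
// Removes a trailing "# comment" from a YAML line while honouring single- and
// double-quote rules, so "#" characters inside quoted strings are preserved.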
function stripYamlInlineComment(line) {
  let inSingle = false;
  let inDouble = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (ch === "'" && !inDouble) {
      const next = line[i + 1];
      if (inSingle && next === "'") {
        i++;
        continue;
      }
      inSingle = !inSingle;
    } else if (ch === '"' && !inSingle) {
      if (!inDouble) {
        inDouble = true;
      } else if (line[i - 1] !== "\\") {
        inDouble = false;
      }
    } else if (ch === "#" && !inSingle && !inDouble) {
      return line.slice(0, i);
    } else if (ch === "\\" && inDouble) {
      i++;
    }
  }
  return line;
}

function isYamlCommentLine(line) {
  return line.trim().startsWith("#");
}

function isBlockScalarIndicator(line) {
  const cleaned = stripYamlInlineComment(line).trim();
  return /:\s*[>|][0-9+-]*\s*$/.test(cleaned);
}

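// Collects external links from a parsed YAML document, then re-scans the raw
// lines (skipping comments and tracking block scalars) to attribute each link
// to a line number; links that cannot be located fall back to line "?".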
async function collectYamlLinks(filePath, occurrencesMap) {
  let linkSet = new Set();
  try {
    const doc = yaml.load(fs.readFileSync(filePath, "utf8"));
    const links = processYamlRecursively(doc);
    for (const link of links) {
      await checkLink(filePath, "?", link);
    }
    linkSet = processYamlRecursively(doc);
  } catch (e) {
    console.error(`Failed to parse YAML file: ${filePath}`);
    return;
  }

  if (linkSet.size === 0) return;

  const recorded = new Map();
  const rawLines = fs.readFileSync(filePath, "utf8").split(/\r?\n/);
  let inBlockScalar = false;
  let blockIndent = 0;

  const markRecorded = (url, lineNumber) => {
    if (!recorded.has(url)) {
      recorded.set(url, new Set());
    }
    const lines = recorded.get(url);
    if (lines.has(lineNumber)) return;
    lines.add(lineNumber);
    recordOccurrence(occurrencesMap, filePath, lineNumber, url);
  };

  for (let index = 0; index < rawLines.length; index++) {
    const lineNumber = index + 1;
    const line = rawLines[index];
    const indent = line.match(/^\s*/)?.[0].length ?? 0;
    const trimmed = line.trim();

    if (inBlockScalar) {
      if (trimmed === "" && indent < blockIndent) {
        inBlockScalar = false;
        continue;
      }
      if (trimmed === "" || indent >= blockIndent) {
        if (isYamlCommentLine(line)) {
          continue;
        }
        for (const link of extractLinksFromText(line)) {
          if (linkSet.has(link)) {
            markRecorded(link, lineNumber);
          }
        }
        continue;
      }
      inBlockScalar = false;
    }

    const withoutComment = stripYamlInlineComment(line);
    const trimmedWithoutComment = withoutComment.trim();

    if (isBlockScalarIndicator(line)) {
      inBlockScalar = true;
      blockIndent = indent + 1;
    }

    if (isYamlCommentLine(line)) continue;

    if (!trimmedWithoutComment) continue;

    for (const link of extractLinksFromText(withoutComment)) {
      if (linkSet.has(link)) {
        markRecorded(link, lineNumber);
      }
    }
  }

  for (const link of linkSet) {
    if (!recorded.has(link) || recorded.get(link).size === 0) {
      recordOccurrence(occurrencesMap, filePath, "?", link);
    }
  }
}

@@ -103,24 +469,155 @@ function walk(dir, exts) {
  return results;
}

(async () => {
  const mdFiles = walk(CONTENT_DIR, [".md"]);
  const yamlFiles = walk(DATA_DIR, [".yaml", ".yml"]);
  console.log(`Scanning ${mdFiles.length} Markdown and ${yamlFiles.length} YAML files...`);
function ensureDirectoryExists(targetFile) {
  fs.mkdirSync(path.dirname(targetFile), { recursive: true });
}

function escapeMarkdownCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

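// Renders the broken-link entries as a Markdown table (URL / Location / Status).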
function generateMarkdownReport(entries) {
  const header = [
    "# Broken External Links",
    "",
    `Generated: ${new Date().toISOString()}`,
    "",
  ];
  if (entries.length === 0) {
    return header.concat(["No broken external links found."]).join("\n");
  }
  const rows = entries.map((entry) => {
    const url = escapeMarkdownCell(entry.url);
    const location = escapeMarkdownCell(entry.location);
    const status = escapeMarkdownCell(entry.status);
    return `| ${url} | ${location} | ${status} |`;
  });
  return header
    .concat(["| URL | Location | Status |", "| --- | --- | --- |", ...rows])
    .join("\n");
}

function generateCsvReport(entries) {
  const lines = [`"url","location","status"`];
  for (const entry of entries) {
    const line = [entry.url, entry.location, entry.status]
      .map((field) => `"${String(field).replace(/"/g, '""')}"`)
      .join(",");
    lines.push(line);
  }
  return lines.join("\n");
}

function writeReport(entries) {
  const format = String(externalConfig.outputFormat || "markdown").toLowerCase();
  const content =
    format === "csv" ? generateCsvReport(entries) : generateMarkdownReport(entries);
  ensureDirectoryExists(OUTPUT_FILE);
  fs.writeFileSync(OUTPUT_FILE, content, "utf8");
}

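// Entry point: collect link occurrences from Markdown and YAML content, prune
// cache entries for URLs no longer referenced, check each unique URL grouped by
// host under a concurrency cap, then write the progress CSV and final report.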
(async () => {
  const occurrencesByUrl = new Map();
  const mdFiles = walk(CONTENT_DIR, [".md", ".markdown"]);
  const yamlFiles = walk(CONTENT_DIR, [".yaml", ".yml"]);
  for (const file of mdFiles) {
    await processMarkdown(file);
    await collectMarkdownLinks(file, occurrencesByUrl);
  }
  for (const file of yamlFiles) {
    await processYaml(file);
    await collectYamlLinks(file, occurrencesByUrl);
  }

  const uniqueEntries = Array.from(occurrencesByUrl.values());
  const activeUrls = new Set(uniqueEntries.map((entry) => entry.url));
  let cachePruned = false;
  for (const url of Object.keys(cache)) {
    if (!activeUrls.has(url)) {
      delete cache[url];
      cachePruned = true;
    }
  }
  if (cachePruned) {
    cacheDirty = true;
  }
  ensureDirectoryExists(PROGRESS_FILE);
  fs.writeFileSync(PROGRESS_FILE, `"url","locations","status"\n`, "utf8");

  const total = uniqueEntries.length;
  if (total === 0) {
    process.stdout.write("No external links found.\n");
    ensureDirectoryExists(CACHE_PATH);
    fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
    writeReport([]);
    return;
  }

  const hostGroups = groupEntriesByHost(uniqueEntries);
  const concurrency = Math.max(1, Math.min(MAX_CONCURRENT_HOSTS, hostGroups.length || 1));
  let processed = 0;
  await runWithConcurrency(
    hostGroups,
    async ({ entries }) => {
      for (const entry of entries) {
        const info = await checkLink(entry.url);
        const status = typeof info?.status === "number" ? info.status : null;
        const errorType = info?.errorType || null;
        const hasHttpError = status !== null && status >= 400;
        const isTimeout = errorType === "timeout";
        const statusLabel = isTimeout ? "timeout" : status ?? "error";

        if (status === null || hasHttpError || isTimeout) {
          BAD_LINKS.push({
            location: formatLocations(entry.occurrences),
            url: entry.url,
            status: statusLabel,
          });
        }

        appendProgress(entry.url, entry.occurrences, hasHttpError || isTimeout || status === null ? statusLabel : status);
        processed += 1;
        updateProgress(processed, total);
      }
    },
    concurrency
  );
  process.stdout.write("\n");

  ensureDirectoryExists(CACHE_PATH);
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));

  console.log("\n\n=== Broken External Links Report ===");
  if (BAD_LINKS.length === 0) {
    console.log("✅ No broken external links found.");
  } else {
    console.table(BAD_LINKS);
    writeReport([]);
    console.log(
      `No broken external links detected. Report saved to ${path.relative(
        SITE_ROOT,
        OUTPUT_FILE
      )}.`
    );
    return;
  }

  const sorted = BAD_LINKS.sort((a, b) => {
    const rank = (entry) => {
      if (entry.status === "timeout") return 2;
      if (typeof entry.status === "number") {
        return entry.status === 404 ? 0 : 1;
      }
      return 1;
    };
    const diff = rank(a) - rank(b);
    if (diff !== 0) return diff;
    if (typeof a.status === "number" && typeof b.status === "number") {
      return a.status - b.status;
    }
    return a.url.localeCompare(b.url);
  });

  writeReport(sorted);
  console.log(
    `Found ${sorted.length} broken external link(s). Report saved to ${path.relative(
      SITE_ROOT,
      OUTPUT_FILE
    )}.`
  );
})();

@@ -1,5 +1,19 @@
{
  "rebrickable": {
    "apiKey": ""
  },
  "externalLinks": {
    "cacheDir": "tools/cache",
    "cacheFile": "external_links.yaml",
    "hostDelayMs": 2000,
    "retryDelayMs": 5000,
    "requestTimeoutSeconds": 5,
    "cacheTtlSuccessDays": 7,
    "cacheTtlClientErrorDays": 0,
    "outputFormat": "markdown",
    "outputFile": "tools/cache/external_links_report.md",
    "userAgent": null,
    "enableCookies": true,
    "cookieJar": "tools/cache/curl_cookies.txt"
  }
}
}

246
tools/lib/markdown_links.js
Normal file
@@ -0,0 +1,246 @@
const fs = require("fs");
const readline = require("readline");

function trimUnbalancedTrailing(value, openChar, closeChar) {
  let result = value;
  while (result.endsWith(closeChar)) {
    const openCount = (result.match(new RegExp(`\\${openChar}`, "g")) || []).length;
    const closeCount = (result.match(new RegExp(`\\${closeChar}`, "g")) || []).length;
    if (closeCount > openCount) {
      result = result.slice(0, -1);
    } else {
      break;
    }
  }
  return result;
}

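// Normalizes a raw URL candidate: strips surrounding angle brackets, trailing
// punctuation, unbalanced closing brackets and Markdown footnote markers, and
// rejects candidates whose brackets remain unbalanced after cleanup.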
function sanitizeUrlCandidate(raw, options = {}) {
  if (typeof raw !== "string") return null;
  let candidate = raw.trim();
  if (!candidate) return null;

  if (candidate.startsWith("<") && candidate.endsWith(">")) {
    candidate = candidate.slice(1, -1).trim();
  }

  while (/[.,;:!?'"\u2018\u2019\u201C\u201D]+$/.test(candidate)) {
    candidate = candidate.slice(0, -1);
  }

  if (!options.keepTrailingParens) {
    candidate = trimUnbalancedTrailing(candidate, "(", ")");
  } else if (candidate.endsWith(")")) {
    const openCount = (candidate.match(/\(/g) || []).length;
    const closeCount = (candidate.match(/\)/g) || []).length;
    if (closeCount > openCount) {
      candidate = trimUnbalancedTrailing(candidate, "(", ")");
    }
  }
  candidate = trimUnbalancedTrailing(candidate, "[", "]");
  candidate = trimUnbalancedTrailing(candidate, "{", "}");

  candidate = candidate.replace(/[*_]+$/, "");
  candidate = candidate.replace(/\[\^[^\]]*\]$/, "");
  if (!options.keepTrailingParens) {
    candidate = trimUnbalancedTrailing(candidate, "(", ")");
  }

  if ((candidate.match(/\(/g) || []).length > (candidate.match(/\)/g) || []).length) {
    return null;
  }
  if ((candidate.match(/\[/g) || []).length > (candidate.match(/]/g) || []).length) {
    return null;
  }
  if ((candidate.match(/{/g) || []).length > (candidate.match(/}/g) || []).length) {
    return null;
  }

  return candidate || null;
}

function findMatchingPair(text, startIndex, openChar, closeChar) {
  let depth = 0;
  for (let i = startIndex; i < text.length; i++) {
    const ch = text[i];
    if (ch === "\\") {
      i++;
      continue;
    }
    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}

function parseLinkDestination(raw) {
  if (typeof raw !== "string") return null;
  let candidate = raw.trim();
  if (!candidate) return null;

  if (candidate.startsWith("<")) {
    const closeIndex = candidate.indexOf(">");
    if (closeIndex > 0) {
      return candidate.slice(1, closeIndex).trim();
    }
  }

  let result = "";
  let escaping = false;
  let parenDepth = 0;
  for (let i = 0; i < candidate.length; i++) {
    const ch = candidate[i];
    if (escaping) {
      result += ch;
      escaping = false;
      continue;
    }
    if (ch === "\\") {
      escaping = true;
      continue;
    }
    if (ch === "(") {
      parenDepth++;
    } else if (ch === ")" && parenDepth > 0) {
      parenDepth--;
    } else if (/\s/.test(ch) && parenDepth === 0) {
      break;
    }
    result += ch;
  }
  return result;
}

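// Scans a line for [text](destination) and ![alt](destination) constructs,
// using bracket matching so nested parentheses inside destinations survive.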
function extractMarkdownDestinations(text) {
  const urls = [];
  for (let i = 0; i < text.length; i++) {
    if (text[i] === "!") {
      if (text[i + 1] !== "[") continue;
      i += 1;
    }
    if (text[i] !== "[") continue;

    const closeBracket = findMatchingPair(text, i, "[", "]");
    if (closeBracket === -1) continue;

    let pointer = closeBracket + 1;
    while (pointer < text.length && /\s/.test(text[pointer])) pointer++;
    if (pointer >= text.length || text[pointer] !== "(") {
      i = closeBracket;
      continue;
    }

    const openParen = pointer;
    const closeParen = findMatchingPair(text, openParen, "(", ")");
    if (closeParen === -1) {
      break;
    }

    const rawDestination = text.slice(openParen + 1, closeParen);
    const candidate = parseLinkDestination(rawDestination);
    if (candidate) {
      urls.push(candidate);
    }
    i = closeParen;
  }
  return urls;
}

function isExternalLink(link) {
  return typeof link === "string" && link.includes("://");
}

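// Extracts external URLs from a line of text in three passes (Markdown
// destinations, <autolinks>, then bare http(s) URLs), sanitizing each
// candidate and deduplicating in order of first appearance.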
function extractLinksFromText(text) {
  if (typeof text !== "string" || !text.includes("http")) {
    return [];
  }

  const results = [];
  const seen = new Set();

  function addCandidate(candidate, options = {}) {
    const sanitized = sanitizeUrlCandidate(candidate, options);
    if (!sanitized) return;
    if (!isExternalLink(sanitized)) return;
    if (seen.has(sanitized)) return;
    seen.add(sanitized);
    results.push(sanitized);
  }

  for (const url of extractMarkdownDestinations(text)) {
    addCandidate(url, { keepTrailingParens: true });
  }

  const angleRegex = /<\s*(https?:\/\/[^>\s]+)\s*>/gi;
  let match;
  while ((match = angleRegex.exec(text)) !== null) {
    addCandidate(match[1]);
  }

  const autoRegex = /https?:\/\/[^\s<>"`]+/gi;
  while ((match = autoRegex.exec(text)) !== null) {
    addCandidate(match[0]);
  }

  return results;
}

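// Reads a stream line by line, skipping commented-out lines inside YAML front
// matter, and returns { url, line } records for every external link found.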
async function collectMarkdownLinksFromStream(stream) {
  const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
  const results = [];
  let lineNumber = 0;
  let inFrontMatter = false;
  try {
    for await (const line of rl) {
      lineNumber++;
      const trimmed = line.trim();

      if (lineNumber === 1 && trimmed === "---") {
        inFrontMatter = true;
        continue;
      }
      if (inFrontMatter) {
        if (trimmed === "---") {
          inFrontMatter = false;
          continue;
        }
        if (trimmed.startsWith("#")) {
          continue;
        }
      }

      for (const url of extractLinksFromText(line)) {
        results.push({ url, line: lineNumber });
      }
    }
  } finally {
    rl.close();
    if (typeof stream.close === "function") {
      stream.close();
    }
  }
  return results;
}

async function collectMarkdownLinksFromFile(filePath) {
  const stream = fs.createReadStream(filePath, { encoding: "utf8" });
  try {
    return await collectMarkdownLinksFromStream(stream);
  } catch (error) {
    stream.destroy();
    throw error;
  }
}

module.exports = {
  collectMarkdownLinksFromFile,
  collectMarkdownLinksFromStream,
  extractLinksFromText,
  sanitizeUrlCandidate,
};
@@ -1,4 +1,4 @@
const { getArchiveUrl, saveToArchive } = require("./lib/archive");
const { getArchiveUrl, saveToArchive } = require("../lib/archive");

(async () => {
  const testUrl = "https://richard-dern.fr";
68
tools/tests/markdown_links.test.js
Normal file
@@ -0,0 +1,68 @@
const test = require("node:test");
const assert = require("node:assert/strict");
const { Readable } = require("node:stream");
const {
  collectMarkdownLinksFromStream,
  extractLinksFromText,
  sanitizeUrlCandidate,
} = require("../lib/markdown_links");

test("extractLinksFromText returns sanitized external URLs only once", () => {
  const input =
    "See [example](https://example.com) and <https://foo.com>. " +
    "Autolink https://bar.com/path).\nDuplicate https://example.com!";
  const urls = extractLinksFromText(input);
  assert.deepStrictEqual(urls, ["https://example.com", "https://foo.com", "https://bar.com/path"]);
});

test("collectMarkdownLinksFromStream preserves line numbers", async () => {
  const content = [
    "Intro line with no link",
    "Markdown [link](https://docs.example.org/page).",
    "Plain link https://news.example.net/article.",
    "Trailing <https://portal.example.com/path> punctuation.",
    "Markdown [link](https://docs.example.org/page(with more valid content)).",
    "Le **[baume du Canada](https://fr.wikipedia.org/wiki/Baume_du_Canada)**",
    "(_Theropoda [incertae sedis](https://fr.wikipedia.org/wiki/Incertae_sedis)_)",
    "[CDN](https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu)[^2]."
  ].join("\n");
  const stream = Readable.from([content]);
  const links = await collectMarkdownLinksFromStream(stream);
  assert.deepStrictEqual(links, [
    { url: "https://docs.example.org/page", line: 2 },
    { url: "https://news.example.net/article", line: 3 },
    { url: "https://portal.example.com/path", line: 4 },
    { url: "https://docs.example.org/page(with more valid content)", line: 5 },
    { url: "https://fr.wikipedia.org/wiki/Baume_du_Canada", line: 6 },
    { url: "https://fr.wikipedia.org/wiki/Incertae_sedis", line: 7 },
    { url: "https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu", line: 8 },
  ]);
});

test("collectMarkdownLinksFromStream ignores URLs in front matter comments", async () => {
  const content = [
    "---",
    "links:",
    " # url: https://ignored.example.com",
    " - url: https://included.example.com",
    "---",
    "Body with https://body.example.com link.",
  ].join("\n");
  const stream = Readable.from([content]);
  const links = await collectMarkdownLinksFromStream(stream);
  assert.deepStrictEqual(links, [
    { url: "https://included.example.com", line: 4 },
    { url: "https://body.example.com", line: 6 },
  ]);
});

test("sanitizeUrlCandidate removes spurious trailing punctuation", () => {
  const cases = [
    ["https://example.com).", "https://example.com"],
    ["https://example.com!\"", "https://example.com"],
    ["<https://example.com>", "https://example.com"],
  ];
  for (const [input, expected] of cases) {
    assert.equal(sanitizeUrlCandidate(input), expected);
  }
});
@@ -1,4 +1,4 @@
const { scrapePage } = require("./lib/puppeteer");
const { scrapePage } = require("../lib/puppeteer");
const path = require("path");

(async () => {