1

Amélioration de la détection de liens externes morts

This commit is contained in:
2025-10-31 12:41:34 +01:00
parent 7442622c74
commit f8b824c540
7 changed files with 885 additions and 3474 deletions

View File

@@ -0,0 +1,21 @@
const { getArchiveUrl, saveToArchive } = require("../lib/archive");

// Manual smoke test: look up an existing Archive.org snapshot for a URL,
// and request a fresh archive when none is found.
(async () => {
  const testUrl = "https://richard-dern.fr";
  console.log(`🔍 Checking Archive.org for: ${testUrl}`);
  let archiveUrl = await getArchiveUrl(testUrl);
  if (archiveUrl) {
    console.log(`✔ Archive found: ${archiveUrl}`);
  } else {
    console.log(`❌ No archive found, requesting a new one...`);
    archiveUrl = await saveToArchive(testUrl);
    if (archiveUrl) {
      console.log(`✔ URL successfully archived: ${archiveUrl}`);
    } else {
      console.log(`❌ Failed to archive the URL.`);
    }
  }
})().catch((err) => {
  // Without this handler a network or module failure surfaces as an
  // unhandled promise rejection instead of a clean non-zero exit.
  console.error("❌ Unexpected error:", err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,68 @@
const test = require("node:test");
const assert = require("node:assert/strict");
const { Readable } = require("node:stream");
const {
collectMarkdownLinksFromStream,
extractLinksFromText,
sanitizeUrlCandidate,
} = require("../lib/markdown_links");
test("extractLinksFromText returns sanitized external URLs only once", () => {
  // One sample mixing markdown links, autolinks, a plain URL with trailing
  // punctuation, and a duplicate that must be collapsed.
  const sample =
    "See [example](https://example.com) and <https://foo.com>. " +
    "Autolink https://bar.com/path).\nDuplicate https://example.com!";
  const expected = ["https://example.com", "https://foo.com", "https://bar.com/path"];
  assert.deepStrictEqual(extractLinksFromText(sample), expected);
});
test("collectMarkdownLinksFromStream preserves line numbers", async () => {
  // One link per line (after the intro) so each result's line number is known.
  const fixtureLines = [
    "Intro line with no link",
    "Markdown [link](https://docs.example.org/page).",
    "Plain link https://news.example.net/article.",
    "Trailing <https://portal.example.com/path> punctuation.",
    "Markdown [link](https://docs.example.org/page(with more valid content)).",
    "Le **[baume du Canada](https://fr.wikipedia.org/wiki/Baume_du_Canada)**",
    "(_Theropoda [incertae sedis](https://fr.wikipedia.org/wiki/Incertae_sedis)_)",
    "[CDN](https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu)[^2].",
  ];
  const source = Readable.from([fixtureLines.join("\n")]);
  const collected = await collectMarkdownLinksFromStream(source);
  const expected = [
    { url: "https://docs.example.org/page", line: 2 },
    { url: "https://news.example.net/article", line: 3 },
    { url: "https://portal.example.com/path", line: 4 },
    { url: "https://docs.example.org/page(with more valid content)", line: 5 },
    { url: "https://fr.wikipedia.org/wiki/Baume_du_Canada", line: 6 },
    { url: "https://fr.wikipedia.org/wiki/Incertae_sedis", line: 7 },
    { url: "https://fr.wikipedia.org/wiki/Réseau_de_diffusion_de_contenu", line: 8 },
  ];
  assert.deepStrictEqual(collected, expected);
});
test("collectMarkdownLinksFromStream ignores URLs in front matter comments", async () => {
  // The YAML front matter holds one commented-out URL (must be skipped) and
  // one active URL (must be kept), followed by a normal body link.
  const document = [
    "---",
    "links:",
    "  # url: https://ignored.example.com",
    "  - url: https://included.example.com",
    "---",
    "Body with https://body.example.com link.",
  ].join("\n");
  const found = await collectMarkdownLinksFromStream(Readable.from([document]));
  assert.deepStrictEqual(found, [
    { url: "https://included.example.com", line: 4 },
    { url: "https://body.example.com", line: 6 },
  ]);
});
test("sanitizeUrlCandidate removes spurious trailing punctuation", () => {
  // Each entry maps a raw candidate to its expected cleaned-up form.
  const fixtures = new Map([
    ["https://example.com).", "https://example.com"],
    ["https://example.com!\"", "https://example.com"],
    ["<https://example.com>", "https://example.com"],
  ]);
  for (const [raw, cleaned] of fixtures) {
    assert.equal(sanitizeUrlCandidate(raw), cleaned);
  }
});

View File

@@ -0,0 +1,13 @@
const { scrapePage } = require("../lib/puppeteer");
const path = require("path");
(async () => {
const testUrl = "https://richard-dern.fr";
const screenshotPath = path.join(__dirname, "test_screenshot.png");
console.log(`🔍 Testing Puppeteer module on: ${testUrl}`);
const metadata = await scrapePage(testUrl, screenshotPath);
console.log("📄 Page metadata:");
console.log(metadata);
})();