Ajoute un outil de gestion interactive des liens morts

2026-03-25 23:06:23 +01:00
parent dc262bdd97
commit 3cb735333a
7 changed files with 1115 additions and 21 deletions
--- a/tools/lib/archive.js
+++ b/tools/lib/archive.js
@@ -1,37 +1,167 @@
-const ARCHIVE_API_URL = "https://archive.org/wayback/available?url=";
+const { fetch } = require("undici");
+
+// Wayback CDX search endpoint queried by listArchiveCaptures.
+const ARCHIVE_CDX_URL = "https://web.archive.org/cdx/search/cdx";
+// Prefix of the endpoint called by saveToArchive to request a capture.
 const ARCHIVE_SAVE_URL = "https://web.archive.org/save/";
+// Delay, in milliseconds, after which an Archive.org request is aborted.
+const ARCHIVE_REQUEST_TIMEOUT_MS = 15000;

 /**
- * Check if a given URL exists in Archive.org.
- * @param {string} url - The URL to check.
- * @returns {Promise<string|null>} - The archive URL if found, otherwise null.
+ * Construit l'URL publique d'une capture Wayback.
+ * @param {string} originalUrl URL d'origine.
+ * @param {string} timestamp Horodatage Wayback.
+ * @returns {string} URL archive.org utilisable directement.
+ */
+function buildArchiveCaptureUrl(originalUrl, timestamp) {
+    // The capture URL is the Wayback prefix, then the timestamp, then the untouched original URL.
+    const captureBase = "https://web.archive.org/web/";
+    return `${captureBase}${timestamp}/${originalUrl}`;
+}
+
+/**
+ * Coerces a value to a strictly positive integer.
+ * @param {unknown} value Candidate value; it is stringified and parsed in base 10.
+ * @param {number} fallback Value returned when parsing fails or the result is not positive.
+ * @returns {number} The parsed integer when it is greater than zero, otherwise `fallback`.
+ */
+function normalizePositiveInteger(value, fallback) {
+    const candidate = Number.parseInt(String(value), 10);
+    const isUsable = !Number.isNaN(candidate) && candidate > 0;
+    return isUsable ? candidate : fallback;
+}
+
+/**
+ * Loads a JSON document from Archive.org, bounded by a maximum delay.
+ *
+ * The abort timer stays armed until the body has been read and decoded, so a
+ * response whose headers arrive quickly but whose body stalls is still aborted
+ * after ARCHIVE_REQUEST_TIMEOUT_MS.
+ * @param {string|URL} url URL to call.
+ * @returns {Promise<unknown>} Decoded JSON document.
+ * @throws {Error} When the response status is not OK, when the request is aborted, or when the body is not valid JSON.
+ */
+async function fetchArchiveJson(url) {
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);
+    try {
+        const response = await fetch(url, { signal: controller.signal });
+        if (!response.ok) {
+            throw new Error(`Erreur de l'API Archive.org (${response.status})`);
+        }
+        // Await inside the try block: the timer must only be cleared once the body is fully consumed.
+        return await response.json();
+    } finally {
+        clearTimeout(timer);
+    }
+}
+
+/**
+ * Lists the recent Wayback captures available for a URL.
+ *
+ * Queries the CDX endpoint with a `statuscode:200` filter, a `digest` collapse
+ * and `-limit` as the `limit` parameter. Rows that are not arrays, or that lack
+ * a string timestamp or original URL, are ignored.
+ * @param {string} url Original URL to look up.
+ * @param {{ limit?: number }} options Query options; `limit` falls back to 10 when it is not a positive integer.
+ * @returns {Promise<Array<{ timestamp: string, originalUrl: string, statusCode: number|null, mimetype: string|null, url: string }>>}
+ *   Captures sorted by descending timestamp, at most `limit` entries.
+ */
+async function listArchiveCaptures(url, options = {}) {
+    const limit = normalizePositiveInteger(options.limit, 10);
+
+    const requestUrl = new URL(ARCHIVE_CDX_URL);
+    const queryParameters = [
+        ["url", url],
+        ["output", "json"],
+        ["fl", "timestamp,original,statuscode,mimetype,digest"],
+        ["filter", "statuscode:200"],
+        ["collapse", "digest"],
+        ["fastLatest", "true"],
+        ["limit", `-${limit}`],
+    ];
+    for (const [name, value] of queryParameters) {
+        requestUrl.searchParams.set(name, value);
+    }
+
+    const payload = await fetchArchiveJson(requestUrl);
+    // The first row is the column header; a payload without data rows yields nothing.
+    if (!Array.isArray(payload) || payload.length <= 1) {
+        return [];
+    }
+
+    const [columns, ...records] = payload;
+    if (!Array.isArray(columns)) {
+        return [];
+    }
+
+    const timestampColumn = columns.indexOf("timestamp");
+    const originalColumn = columns.indexOf("original");
+    const statusColumn = columns.indexOf("statuscode");
+    const mimetypeColumn = columns.indexOf("mimetype");
+
+    const captures = records
+        .filter(
+            (record) =>
+                Array.isArray(record) &&
+                typeof record[timestampColumn] === "string" &&
+                typeof record[originalColumn] === "string",
+        )
+        .map((record) => {
+            const timestamp = record[timestampColumn];
+            const originalUrl = record[originalColumn];
+
+            // Optional columns degrade to null when absent or unusable.
+            const numericStatus = statusColumn > -1 ? Number.parseInt(record[statusColumn], 10) : Number.NaN;
+            const rawMimetype = mimetypeColumn > -1 ? record[mimetypeColumn] : null;
+            const hasMimetype = typeof rawMimetype === "string" && Boolean(rawMimetype.trim());
+
+            return {
+                timestamp,
+                originalUrl,
+                statusCode: Number.isNaN(numericStatus) ? null : numericStatus,
+                mimetype: hasMimetype ? rawMimetype.trim() : null,
+                url: buildArchiveCaptureUrl(originalUrl, timestamp),
+            };
+        });
+
+    // Newest capture first.
+    captures.sort((first, second) => second.timestamp.localeCompare(first.timestamp));
+    return captures.slice(0, limit);
+}
+
+/**
+ * Returns the most recent capture available for a URL.
+ * @param {string} url Original URL.
+ * @returns {Promise<string|null>} archive.org URL, or null when no capture exists.
 */
 async function getArchiveUrl(url) {
-    try {
-        const response = await fetch(`${ARCHIVE_API_URL}${encodeURIComponent(url)}`);
-        if (!response.ok) throw new Error(`HTTP error! Status: ${response.status}`);
-        const data = await response.json();
-        return data.archived_snapshots?.closest?.url || null;
-    } catch (error) {
-        console.error(`❌ Archive.org API error: ${error.message}`);
+    // A single result is requested; errors from listArchiveCaptures are not caught here.
+    const captures = await listArchiveCaptures(url, { limit: 1 });
+    if (captures.length === 0) {
        return null;
    }
+    return captures[0].url;
 }

 /**
- * Request Archive.org to save the given URL.
- * @param {string} url - The URL to archive.
- * @returns {Promise<string|null>} - The permalink of the archived page if successful, otherwise null.
+ * Asks Archive.org to archive a URL.
+ * @param {string} url URL to archive.
+ * @returns {Promise<string|null>} Final capture URL when available, otherwise null.
 */
 async function saveToArchive(url) {
-    try {
-        const response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, { method: "POST" });
-        if (!response.ok) throw new Error(`HTTP error! Status: ${response.status}`);
-        return response.url.includes("/save/") ? null : response.url;
-    } catch (error) {
-        console.error(`❌ Failed to save URL to Archive.org: ${error.message}`);
+    // Abort the request once ARCHIVE_REQUEST_TIMEOUT_MS has elapsed.
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);
+    const response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, {
+        method: "POST",
+        signal: controller.signal,
+    }).finally(() => clearTimeout(timer));
+    if (!response.ok) {
+        throw new Error(`Erreur de sauvegarde Archive.org (${response.status})`);
+    }
+    // A final response URL that still contains "/save/" is treated as "no capture URL available".
+    if (response.url.includes("/save/")) {
        return null;
    }
+    return response.url;
 }

-module.exports = { getArchiveUrl, saveToArchive };
+// Public API of the Archive.org helper module.
+module.exports = {
+    buildArchiveCaptureUrl,
+    listArchiveCaptures,
+    getArchiveUrl,
+    saveToArchive,
+};
--- a/tools/lib/content.js
+++ b/tools/lib/content.js
@@ -92,6 +92,77 @@ async function resolveMarkdownTargets(inputs, { rootDir = process.cwd(), skipInd
    return Array.from(targets);
 }

+/**
+ * Recursively collects every file whose extension belongs to a given list.
+ *
+ * Extensions are trimmed and compared case-insensitively. A missing leading
+ * dot is added ("md" is treated as ".md"), because path.extname() always
+ * reports the dot and such an entry could otherwise never match.
+ * Directories named ".git" or "node_modules", plus any name listed in
+ * `options.skipDirs`, are not entered. Entries that are neither directories
+ * nor regular files are ignored.
+ * @param {string} rootDir Root directory to walk.
+ * @param {string[]} extensions Expected extensions, with or without the leading dot.
+ * @param {{ skipDirs?: string[] }} options Walk options.
+ * @returns {Promise<string[]>} Matching file paths, sorted with localeCompare.
+ */
+async function collectFilesByExtensions(rootDir, extensions, options = {}) {
+    const normalizedExtensions = new Set();
+    for (const extension of extensions) {
+        if (typeof extension !== "string") {
+            continue;
+        }
+        const candidate = extension.trim().toLowerCase();
+        if (!candidate) {
+            continue;
+        }
+        // Align with path.extname(), which always includes the leading dot.
+        normalizedExtensions.add(candidate.startsWith(".") ? candidate : `.${candidate}`);
+    }
+
+    // Nothing to look for: skip the directory walk entirely.
+    if (normalizedExtensions.size === 0) {
+        return [];
+    }
+
+    const skipDirs = new Set([".git", "node_modules"]);
+    if (Array.isArray(options.skipDirs)) {
+        for (const directoryName of options.skipDirs) {
+            if (typeof directoryName !== "string") {
+                continue;
+            }
+            const candidate = directoryName.trim();
+            if (!candidate) {
+                continue;
+            }
+            skipDirs.add(candidate);
+        }
+    }
+
+    const files = [];
+    await walk(rootDir);
+    files.sort((a, b) => a.localeCompare(b));
+    return files;
+
+    // Depth-first traversal; hoisted function declaration, so it can be called above.
+    async function walk(currentDir) {
+        const entries = await fs.readdir(currentDir, { withFileTypes: true });
+
+        for (const entry of entries) {
+            const fullPath = path.join(currentDir, entry.name);
+
+            if (entry.isDirectory()) {
+                // Skipped directories are matched by name at any depth.
+                if (skipDirs.has(entry.name)) {
+                    continue;
+                }
+                await walk(fullPath);
+                continue;
+            }
+
+            if (!entry.isFile()) {
+                continue;
+            }
+
+            const extension = path.extname(entry.name).toLowerCase();
+            if (!normalizedExtensions.has(extension)) {
+                continue;
+            }
+
+            files.push(fullPath);
+        }
+    }
+}
+
 async function collectBundles(rootDir) {
    const bundles = [];
    await walk(rootDir, rootDir, bundles);
@@ -140,5 +211,6 @@ module.exports = {
    collectMarkdownFiles,
    collectSectionIndexDirs,
    resolveMarkdownTargets,
+    collectFilesByExtensions,
    collectBundles,
 };
--- a/tools/lib/external_links_report.js
+++ b/tools/lib/external_links_report.js
@@ -0,0 +1,198 @@
+const fs = require("node:fs");
+const path = require("node:path");
+const yaml = require("js-yaml");
+const { loadToolsConfig } = require("./config");
+
+// Fallback cache directory and report file name, used by resolveExternalLinksReportPath
+// when the tools configuration does not provide non-empty string values for them.
+const DEFAULT_CACHE_DIR = "tools/cache";
+const DEFAULT_CACHE_FILE = "external_links.yaml";
+
+/**
+ * Resolves the location of the external links report from the tools configuration.
+ *
+ * Reads `externalLinks.cacheDir` and `externalLinks.cacheFile` from
+ * `<siteRoot>/tools/config/config.json`, falling back to the module defaults
+ * when a value is missing, not a string, or blank. An absolute `cacheFile`
+ * is returned as is; otherwise it is joined to the cache directory, itself
+ * resolved against the project root when relative.
+ * @param {string} siteRoot Project root.
+ * @returns {Promise<string>} Path of the YAML report.
+ */
+async function resolveExternalLinksReportPath(siteRoot) {
+  const projectRoot = path.resolve(siteRoot);
+  const config = await loadToolsConfig(path.join(projectRoot, "tools", "config", "config.json"));
+  const settings = config.externalLinks;
+
+  // Returns the trimmed setting when it is a non-blank string, otherwise the fallback.
+  const readSetting = (key, fallback) => {
+    if (!settings) {
+      return fallback;
+    }
+    const rawValue = settings[key];
+    const isUsable = typeof rawValue === "string" && Boolean(rawValue.trim());
+    return isUsable ? rawValue.trim() : fallback;
+  };
+
+  const cacheDir = readSetting("cacheDir", DEFAULT_CACHE_DIR);
+  const cacheFile = readSetting("cacheFile", DEFAULT_CACHE_FILE);
+
+  // An absolute file name wins over any cache directory.
+  if (path.isAbsolute(cacheFile)) {
+    return cacheFile;
+  }
+
+  const baseDir = path.isAbsolute(cacheDir) ? cacheDir : path.join(projectRoot, cacheDir);
+  return path.join(baseDir, cacheFile);
+}
+
+/**
+ * Normalizes the list of locations attached to a link.
+ *
+ * Entries that are not objects, or whose `file` is not a non-blank string,
+ * are dropped. `line` is kept only when it is a finite number and `page`
+ * only when it is a non-blank string; both default to null.
+ * @param {unknown[]} rawLocations Raw locations.
+ * @returns {Array<{ file: string, line: number|null, page: string|null }>}
+ */
+function normalizeLocations(rawLocations) {
+  if (!Array.isArray(rawLocations)) {
+    return [];
+  }
+
+  // Trimmed text, or null when the value is not a string or is blank.
+  const cleanText = (value) => {
+    if (typeof value !== "string") {
+      return null;
+    }
+    const trimmed = value.trim();
+    return trimmed ? trimmed : null;
+  };
+
+  return rawLocations.flatMap((candidate) => {
+    if (!candidate || typeof candidate !== "object") {
+      return [];
+    }
+
+    const file = cleanText(candidate.file);
+    if (!file) {
+      return [];
+    }
+
+    const hasLine = typeof candidate.line === "number" && Number.isFinite(candidate.line);
+    return [{ file, line: hasLine ? candidate.line : null, page: cleanText(candidate.page) }];
+  });
+}
+
+/**
+ * Normalizes one report entry.
+ *
+ * Returns null when the entry is not an object or has no non-blank string
+ * `url`. A numeric `status` is kept when finite; a string `status` is parsed
+ * in base 10; anything else yields null.
+ * @param {unknown} rawLink Raw entry.
+ * @returns {{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }|null}
+ */
+function normalizeLink(rawLink) {
+  const isRecord = Boolean(rawLink) && typeof rawLink === "object";
+  if (!isRecord || typeof rawLink.url !== "string" || !rawLink.url.trim()) {
+    return null;
+  }
+
+  const rawStatus = rawLink.status;
+  let status = null;
+  switch (typeof rawStatus) {
+    case "number":
+      if (Number.isFinite(rawStatus)) {
+        status = rawStatus;
+      }
+      break;
+    case "string": {
+      const parsedStatus = Number.parseInt(rawStatus, 10);
+      if (rawStatus.trim() && !Number.isNaN(parsedStatus)) {
+        status = parsedStatus;
+      }
+      break;
+    }
+    default:
+      break;
+  }
+
+  return {
+    url: rawLink.url.trim(),
+    status,
+    locations: normalizeLocations(rawLink.locations),
+  };
+}
+
+/**
+ * Rebuilds a list of links from the `entries` section of the cache.
+ *
+ * Each key is used as the link URL; `status` and `locations` are read from
+ * the value when it is an object and are null otherwise. Entries rejected by
+ * normalizeLink are left out.
+ * @param {Record<string, unknown>} entries Raw entries keyed by URL.
+ * @returns {Array<{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }>}
+ */
+function buildLinksFromEntries(entries) {
+  return Object.entries(entries)
+    .map(([url, entry]) => {
+      const isRecord = Boolean(entry) && typeof entry === "object";
+      return normalizeLink({
+        url,
+        status: isRecord ? entry.status : null,
+        locations: isRecord ? entry.locations : null,
+      });
+    })
+    .filter((link) => Boolean(link));
+}
+
+/**
+ * Loads the external links report.
+ *
+ * A missing file yields an empty report. When the document has a `links`
+ * array, each item is normalized; otherwise an object-valued `entries`
+ * section is converted through buildLinksFromEntries.
+ * @param {string} reportPath Absolute or relative path of the YAML report.
+ * @returns {{ generatedAt: string|null, links: Array<{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }> }}
+ */
+function loadExternalLinksReport(reportPath) {
+  const absolutePath = path.resolve(reportPath);
+  if (!fs.existsSync(absolutePath)) {
+    return { generatedAt: null, links: [] };
+  }
+
+  // An empty YAML document parses to a falsy value; treat it as an empty object.
+  const document = yaml.load(fs.readFileSync(absolutePath, "utf8")) || {};
+
+  let links;
+  if (Array.isArray(document.links)) {
+    links = document.links.map((item) => normalizeLink(item)).filter((item) => Boolean(item));
+  } else if (document.entries && typeof document.entries === "object") {
+    links = buildLinksFromEntries(document.entries);
+  } else {
+    links = [];
+  }
+
+  return {
+    generatedAt: document.generatedAt || null,
+    links,
+  };
+}
+
+/**
+ * Filters the links of a report by HTTP status code.
+ * @param {{ links?: Array<{ status: number|null }> }} report Loaded report.
+ * @param {number} statusCode Status to keep; compared with strict equality.
+ * @returns {Array<{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }>}
+ *   New array holding the matching link objects; empty when the report has no `links` array.
+ */
+function getLinksByStatus(report, statusCode) {
+  const candidates = report && Array.isArray(report.links) ? report.links : [];
+  return candidates.filter(
+    (entry) => Boolean(entry) && typeof entry === "object" && entry.status === statusCode,
+  );
+}
+
+// Public API of this module.
+module.exports = {
+  resolveExternalLinksReportPath,
+  loadExternalLinksReport,
+  getLinksByStatus,
+};
--- a/tools/lib/url_replacements.js
+++ b/tools/lib/url_replacements.js
@@ -0,0 +1,92 @@
+const fs = require("node:fs/promises");
+const path = require("node:path");
+const { collectFilesByExtensions } = require("./content");
+
+// Extensions scanned by findUrlOccurrences when the caller does not supply its own list.
+// Frozen so the shared default cannot be mutated.
+const DEFAULT_URL_TEXT_EXTENSIONS = Object.freeze([
+  ".json",
+  ".markdown",
+  ".md",
+  ".yaml",
+  ".yml",
+]);
+
+/**
+ * Counts the exact, non-overlapping occurrences of a string within a text.
+ * @param {string} text Text to scan.
+ * @param {string} needle String to look for.
+ * @returns {number} Number of occurrences; 0 when either argument is not a string or the needle is empty.
+ */
+function countOccurrences(text, needle) {
+  const isSearchable = typeof text === "string" && typeof needle === "string" && needle !== "";
+  if (!isSearchable) {
+    return 0;
+  }
+
+  let total = 0;
+  let cursor = text.indexOf(needle);
+  while (cursor !== -1) {
+    total += 1;
+    // Resume after the match so overlapping candidates are not counted twice.
+    cursor = text.indexOf(needle, cursor + needle.length);
+  }
+  return total;
+}
+
+/**
+ * Lists the text files that contain a given URL.
+ *
+ * Files are read one after another as UTF-8; only files with at least one
+ * exact occurrence are reported.
+ * @param {string} rootDir Root directory to walk.
+ * @param {string} targetUrl URL to look for.
+ * @param {{ extensions?: string[] }} options Search options; `extensions` replaces the default list when it is an array.
+ * @returns {Promise<Array<{ filePath: string, occurrences: number }>>}
+ */
+async function findUrlOccurrences(rootDir, targetUrl, options = {}) {
+  const extensions = Array.isArray(options.extensions) ? options.extensions : DEFAULT_URL_TEXT_EXTENSIONS;
+  const candidates = await collectFilesByExtensions(rootDir, extensions);
+
+  const results = [];
+  for (const filePath of candidates) {
+    const occurrences = countOccurrences(await fs.readFile(filePath, "utf8"), targetUrl);
+    if (occurrences > 0) {
+      results.push({ filePath, occurrences });
+    }
+  }
+  return results;
+}
+
+/**
+ * Replaces every exact occurrence of a URL in a list of files.
+ *
+ * Occurrences are recounted from each file's current content rather than
+ * taken from `match.occurrences`, so a stale `matches` list cannot inflate
+ * the totals. Files that no longer contain the URL are neither rewritten nor
+ * reported. An empty or non-string `targetUrl` is a no-op, because splitting
+ * on an empty string would insert the replacement between every character.
+ * @param {string} rootDir Search root, used only when `options.matches` is not provided.
+ * @param {string} targetUrl URL to replace.
+ * @param {string} replacementUrl Replacement URL.
+ * @param {{ extensions?: string[], matches?: Array<{ filePath: string, occurrences: number }> }} options
+ *   Write options; `matches` skips the search and targets the listed files directly.
+ * @returns {Promise<{ changedFiles: string[], totalOccurrences: number }>}
+ *   Absolute paths of the rewritten files and the number of occurrences actually replaced.
+ * @throws {TypeError} When `replacementUrl` is not a string.
+ */
+async function replaceUrlInFiles(rootDir, targetUrl, replacementUrl, options = {}) {
+  if (typeof replacementUrl !== "string") {
+    // Array.prototype.join(undefined) would silently use "," as the replacement.
+    throw new TypeError("replacementUrl must be a string");
+  }
+  if (typeof targetUrl !== "string" || !targetUrl) {
+    return { changedFiles: [], totalOccurrences: 0 };
+  }
+
+  let matches = [];
+  if (Array.isArray(options.matches)) {
+    matches = options.matches;
+  } else {
+    matches = await findUrlOccurrences(rootDir, targetUrl, options);
+  }
+
+  const changedFiles = [];
+  let totalOccurrences = 0;
+
+  for (const match of matches) {
+    const filePath = path.resolve(match.filePath);
+    const content = await fs.readFile(filePath, "utf8");
+    const segments = content.split(targetUrl);
+    const occurrences = segments.length - 1;
+    if (occurrences === 0) {
+      // The file changed since it was matched; leave it untouched.
+      continue;
+    }
+    await fs.writeFile(filePath, segments.join(replacementUrl), "utf8");
+    changedFiles.push(filePath);
+    totalOccurrences += occurrences;
+  }
+
+  return { changedFiles, totalOccurrences };
+}
+
+// Public API of this module.
+module.exports = {
+  DEFAULT_URL_TEXT_EXTENSIONS,
+  countOccurrences,
+  findUrlOccurrences,
+  replaceUrlInFiles,
+};