1

Ajoute un outil de gestion interactive des liens morts

This commit is contained in:
2026-03-25 23:06:23 +01:00
parent dc262bdd97
commit 3cb735333a
7 changed files with 1115 additions and 21 deletions

View File

@@ -1,37 +1,167 @@
// Wayback availability endpoint; the looked-up URL is appended as the `url` query value.
const ARCHIVE_API_URL = "https://archive.org/wayback/available?url=";
const { fetch } = require("undici");
// Wayback CDX search endpoint, queried with search parameters to list captures.
const ARCHIVE_CDX_URL = "https://web.archive.org/cdx/search/cdx";
// Archive.org save endpoint prefix; the URL to archive is appended to it.
const ARCHIVE_SAVE_URL = "https://web.archive.org/save/";
// Delay, in milliseconds, after which a pending Archive.org request is aborted.
const ARCHIVE_REQUEST_TIMEOUT_MS = 15000;
/**
 * Builds the public URL of a Wayback capture.
 * @param {string} originalUrl Original URL of the captured page.
 * @param {string} timestamp Wayback timestamp of the capture.
 * @returns {string} web.archive.org URL that can be used as-is.
 */
function buildArchiveCaptureUrl(originalUrl, timestamp) {
  const prefix = "https://web.archive.org/web/";
  // The original URL is appended verbatim (not percent-encoded).
  return prefix.concat(timestamp, "/", originalUrl);
}
/**
 * Coerces a value to a strictly positive integer.
 * @param {unknown} value Value to check; it is stringified and parsed in base 10.
 * @param {number} fallback Value returned when parsing fails or the result is not positive.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function normalizePositiveInteger(value, fallback) {
  const candidate = Number.parseInt(String(value), 10);
  const isUsable = !Number.isNaN(candidate) && candidate > 0;
  return isUsable ? candidate : fallback;
}
/**
 * Loads a JSON document from Archive.org with an overall time limit.
 *
 * The abort timer stays armed until the body has been fully read and decoded,
 * so a response whose headers arrive quickly but whose body stalls is still
 * aborted after ARCHIVE_REQUEST_TIMEOUT_MS.
 * @param {string|URL} url URL to request.
 * @returns {Promise<unknown>} Decoded JSON document.
 * @throws {Error} When the response status is not OK, when the request is
 * aborted by the timeout, or when the body is not valid JSON.
 */
async function fetchArchiveJson(url) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);
  try {
    const response = await fetch(url, { signal: controller.signal });
    if (!response.ok) {
      throw new Error(`Erreur de l'API Archive.org (${response.status})`);
    }
    // Await here (instead of returning the promise) so the timer is only
    // cleared once the body has been consumed.
    return await response.json();
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Lists the recent Wayback captures available for a URL.
 *
 * Queries the CDX endpoint for JSON rows restricted to status 200 and
 * collapsed by digest, then maps each row through the header row it came with.
 * Rows without a string timestamp or original URL are dropped.
 * @param {string} url Original URL to look up.
 * @param {{ limit?: number }} options Query options; `limit` defaults to 10.
 * @returns {Promise<Array<{ timestamp: string, originalUrl: string, statusCode: number|null, mimetype: string|null, url: string }>>}
 * Captures sorted from newest to oldest, at most `limit` entries.
 */
async function listArchiveCaptures(url, options = {}) {
  const limit = normalizePositiveInteger(options.limit, 10);

  const requestUrl = new URL(ARCHIVE_CDX_URL);
  const query = [
    ["url", url],
    ["output", "json"],
    ["fl", "timestamp,original,statuscode,mimetype,digest"],
    ["filter", "statuscode:200"],
    ["collapse", "digest"],
    ["fastLatest", "true"],
    // NOTE(review): a negative limit presumably asks CDX for the last N rows — confirm against the CDX docs.
    ["limit", `-${limit}`],
  ];
  for (const [name, value] of query) {
    requestUrl.searchParams.set(name, value);
  }

  const rows = await fetchArchiveJson(requestUrl);
  if (!Array.isArray(rows) || rows.length <= 1 || !Array.isArray(rows[0])) {
    return [];
  }

  // The first row names the columns; the remaining rows are the captures.
  const [header, ...dataRows] = rows;
  const timestampIndex = header.indexOf("timestamp");
  const originalIndex = header.indexOf("original");
  const statusCodeIndex = header.indexOf("statuscode");
  const mimetypeIndex = header.indexOf("mimetype");

  const readStatusCode = (row) => {
    if (statusCodeIndex < 0) {
      return null;
    }
    const code = Number.parseInt(row[statusCodeIndex], 10);
    return Number.isNaN(code) ? null : code;
  };

  const readMimetype = (row) => {
    const raw = mimetypeIndex < 0 ? undefined : row[mimetypeIndex];
    if (typeof raw !== "string") {
      return null;
    }
    const trimmed = raw.trim();
    return trimmed === "" ? null : trimmed;
  };

  const captures = dataRows
    .filter(
      (row) =>
        Array.isArray(row) &&
        typeof row[timestampIndex] === "string" &&
        typeof row[originalIndex] === "string",
    )
    .map((row) => {
      const timestamp = row[timestampIndex];
      const originalUrl = row[originalIndex];
      return {
        timestamp,
        originalUrl,
        statusCode: readStatusCode(row),
        mimetype: readMimetype(row),
        url: buildArchiveCaptureUrl(originalUrl, timestamp),
      };
    });

  // Newest first, based on the timestamp strings.
  captures.sort((left, right) => right.timestamp.localeCompare(left.timestamp));
  return captures.slice(0, limit);
}
/**
 * Returns the most recent capture available for a URL.
 *
 * Delegates to listArchiveCaptures with a limit of one. Lookup failures are
 * not swallowed here: any error raised by the lookup propagates to the caller.
 * @param {string} url Original URL.
 * @returns {Promise<string|null>} archive.org URL, or null when no capture exists.
 */
async function getArchiveUrl(url) {
  const captures = await listArchiveCaptures(url, { limit: 1 });
  if (captures.length === 0) {
    return null;
  }
  return captures[0].url;
}
/**
 * Asks Archive.org to archive a URL.
 *
 * Sends a POST to the save endpoint and aborts the request after
 * ARCHIVE_REQUEST_TIMEOUT_MS.
 * @param {string} url URL to archive.
 * @returns {Promise<string|null>} Final response URL, or null when that URL
 * still contains "/save/" (presumably no redirect to a capture happened —
 * confirm against the save endpoint's behavior).
 * @throws {Error} When the response status is not OK or the request is aborted.
 */
async function saveToArchive(url) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);
  let response;
  try {
    response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, {
      method: "POST",
      signal: controller.signal,
    });
  } finally {
    // Only the final URL is needed, so the timer can stop once headers are in.
    clearTimeout(timer);
  }
  if (!response.ok) {
    throw new Error(`Erreur de sauvegarde Archive.org (${response.status})`);
  }
  if (response.url.includes("/save/")) {
    return null;
  }
  return response.url;
}
module.exports = { getArchiveUrl, saveToArchive };
module.exports = {
buildArchiveCaptureUrl,
listArchiveCaptures,
getArchiveUrl,
saveToArchive,
};

View File

@@ -92,6 +92,77 @@ async function resolveMarkdownTargets(inputs, { rootDir = process.cwd(), skipInd
return Array.from(targets);
}
/**
 * Collects every file whose extension belongs to a given list.
 *
 * Extensions are compared case-insensitively. Each entry may be written with
 * or without the leading dot ("md" and ".md" are equivalent); a single string
 * is accepted as a one-element list. A missing or non-iterable list, or one
 * with no usable entry, yields an empty result without touching the disk.
 * Directories named ".git" or "node_modules" are always skipped, in addition
 * to the names listed in `options.skipDirs`.
 * @param {string} rootDir Root directory to walk.
 * @param {string[]} extensions Expected extensions.
 * @param {{ skipDirs?: string[] }} options Traversal options.
 * @returns {Promise<string[]>} Matching files, sorted by path.
 */
async function collectFilesByExtensions(rootDir, extensions, options = {}) {
  let requested = [];
  if (typeof extensions === "string") {
    requested = [extensions];
  } else if (extensions != null && typeof extensions[Symbol.iterator] === "function") {
    requested = extensions;
  }

  const normalizedExtensions = new Set();
  for (const extension of requested) {
    if (typeof extension !== "string") {
      continue;
    }
    const candidate = extension.trim().toLowerCase();
    if (!candidate) {
      continue;
    }
    // path.extname() always reports the dot, so store extensions the same way.
    normalizedExtensions.add(candidate.startsWith(".") ? candidate : `.${candidate}`);
  }
  if (normalizedExtensions.size === 0) {
    return [];
  }

  const skipDirs = new Set([".git", "node_modules"]);
  if (Array.isArray(options.skipDirs)) {
    for (const directoryName of options.skipDirs) {
      if (typeof directoryName !== "string") {
        continue;
      }
      const candidate = directoryName.trim();
      if (candidate) {
        skipDirs.add(candidate);
      }
    }
  }

  const files = [];

  // Depth-first traversal; only regular files are collected.
  const walk = async (currentDir) => {
    const entries = await fs.readdir(currentDir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(currentDir, entry.name);
      if (entry.isDirectory()) {
        if (!skipDirs.has(entry.name)) {
          await walk(fullPath);
        }
        continue;
      }
      if (!entry.isFile()) {
        continue;
      }
      if (normalizedExtensions.has(path.extname(entry.name).toLowerCase())) {
        files.push(fullPath);
      }
    }
  };

  await walk(rootDir);
  files.sort((a, b) => a.localeCompare(b));
  return files;
}
async function collectBundles(rootDir) {
const bundles = [];
await walk(rootDir, rootDir, bundles);
@@ -140,5 +211,6 @@ module.exports = {
collectMarkdownFiles,
collectSectionIndexDirs,
resolveMarkdownTargets,
collectFilesByExtensions,
collectBundles,
};

View File

@@ -0,0 +1,198 @@
const fs = require("node:fs");
const path = require("node:path");
const yaml = require("js-yaml");
const { loadToolsConfig } = require("./config");
// Cache directory used when the tools configuration does not provide one.
const DEFAULT_CACHE_DIR = "tools/cache";
// Report file name used when the tools configuration does not provide one.
const DEFAULT_CACHE_FILE = "external_links.yaml";
/**
 * Resolves the path of the external-links report from the tools configuration.
 *
 * Reads `externalLinks.cacheDir` and `externalLinks.cacheFile` from
 * `<siteRoot>/tools/config/config.json`, falling back to the module defaults
 * when a setting is missing, not a string, or blank. An absolute `cacheFile`
 * is returned unchanged; a relative `cacheDir` is resolved against the site root.
 * @param {string} siteRoot Project root.
 * @returns {Promise<string>} Path of the YAML report.
 */
async function resolveExternalLinksReportPath(siteRoot) {
  const rootDir = path.resolve(siteRoot);
  const config = await loadToolsConfig(path.join(rootDir, "tools", "config", "config.json"));
  const settings = config.externalLinks;

  // Returns the trimmed setting when it is a non-blank string, else the fallback.
  const readSetting = (key, fallback) => {
    if (!settings) {
      return fallback;
    }
    const raw = settings[key];
    return typeof raw === "string" && raw.trim() ? raw.trim() : fallback;
  };

  const cacheFile = readSetting("cacheFile", DEFAULT_CACHE_FILE);
  if (path.isAbsolute(cacheFile)) {
    return cacheFile;
  }

  const cacheDir = readSetting("cacheDir", DEFAULT_CACHE_DIR);
  const baseDir = path.isAbsolute(cacheDir) ? cacheDir : path.join(rootDir, cacheDir);
  return path.join(baseDir, cacheFile);
}
/**
 * Normalizes the list of locations attached to a link.
 *
 * Entries that are not objects, or that lack a non-blank `file` string, are
 * dropped. `line` is kept only when it is a finite number and `page` only when
 * it is a non-blank string; both default to null.
 * @param {unknown[]} rawLocations Raw locations.
 * @returns {Array<{ file: string, line: number|null, page: string|null }>}
 */
function normalizeLocations(rawLocations) {
  if (!Array.isArray(rawLocations)) {
    return [];
  }

  // Trimmed text for non-blank strings, null for anything else.
  const toText = (value) => (typeof value === "string" && value.trim() ? value.trim() : null);

  return rawLocations.flatMap((entry) => {
    if (!entry || typeof entry !== "object") {
      return [];
    }
    const file = toText(entry.file);
    if (file === null) {
      return [];
    }
    const hasLine = typeof entry.line === "number" && Number.isFinite(entry.line);
    return [{ file, line: hasLine ? entry.line : null, page: toText(entry.page) }];
  });
}
/**
 * Normalizes one entry of the report.
 *
 * The entry must be an object with a non-blank `url` string, otherwise null is
 * returned. `status` is kept when it is a finite number, or parsed in base 10
 * when it is a string; any other value yields null.
 * @param {unknown} rawLink Raw entry.
 * @returns {{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }|null}
 */
function normalizeLink(rawLink) {
  const isUsable =
    Boolean(rawLink) &&
    typeof rawLink === "object" &&
    typeof rawLink.url === "string" &&
    rawLink.url.trim() !== "";
  if (!isUsable) {
    return null;
  }

  const rawStatus = rawLink.status;
  let status = null;
  switch (typeof rawStatus) {
    case "number":
      if (Number.isFinite(rawStatus)) {
        status = rawStatus;
      }
      break;
    case "string": {
      // Blank strings parse to NaN and therefore leave the status at null.
      const parsed = Number.parseInt(rawStatus, 10);
      if (!Number.isNaN(parsed)) {
        status = parsed;
      }
      break;
    }
    default:
      break;
  }

  return {
    url: rawLink.url.trim(),
    status,
    locations: normalizeLocations(rawLink.locations),
  };
}
/**
 * Rebuilds a list of links from the `entries` section of the cache.
 *
 * Each key is used as the link URL. `status` and `locations` are read from the
 * value when it is an object and are null otherwise. Entries rejected by
 * normalizeLink are left out.
 * @param {Record<string, unknown>} entries Raw entries keyed by URL.
 * @returns {Array<{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }>}
 */
function buildLinksFromEntries(entries) {
  return Object.keys(entries)
    .map((url) => {
      const entry = entries[url];
      const hasDetails = Boolean(entry) && typeof entry === "object";
      return normalizeLink({
        url,
        status: hasDetails ? entry.status : null,
        locations: hasDetails ? entry.locations : null,
      });
    })
    .filter(Boolean);
}
/**
 * Loads the external-links report.
 *
 * A missing file yields an empty report. Links come from the `links` array
 * when there is one, otherwise from the `entries` object; entries that fail
 * normalization are dropped.
 * @param {string} reportPath Absolute or relative path of the YAML report.
 * @returns {{ generatedAt: string|null, links: Array<{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }> }}
 */
function loadExternalLinksReport(reportPath) {
  const resolvedPath = path.resolve(reportPath);
  if (!fs.existsSync(resolvedPath)) {
    return { generatedAt: null, links: [] };
  }

  const source = fs.readFileSync(resolvedPath, "utf8");
  // NOTE(review): whether yaml.load is safe on untrusted input depends on the
  // installed js-yaml version — confirm. It may also turn an unquoted
  // timestamp into a Date, so generatedAt is not guaranteed to be a string.
  const raw = yaml.load(source) || {};

  let links;
  if (Array.isArray(raw.links)) {
    links = raw.links.map((item) => normalizeLink(item)).filter(Boolean);
  } else if (raw.entries && typeof raw.entries === "object") {
    links = buildLinksFromEntries(raw.entries);
  } else {
    links = [];
  }

  return { generatedAt: raw.generatedAt || null, links };
}
/**
 * Filters the links of a report by HTTP status code.
 *
 * The comparison is strict, so a status stored as a string never matches a
 * numeric code. A missing report or a report without a `links` array yields
 * an empty list.
 * @param {{ links?: Array<{ status: number|null }> }} report Loaded report.
 * @param {number} statusCode Status code to keep.
 * @returns {Array<{ url: string, status: number|null, locations: Array<{ file: string, line: number|null, page: string|null }> }>}
 */
function getLinksByStatus(report, statusCode) {
  const candidates = report && Array.isArray(report.links) ? report.links : [];
  return candidates.filter(
    (link) => Boolean(link) && typeof link === "object" && link.status === statusCode,
  );
}
// Public API of the external-links report module.
module.exports = {
resolveExternalLinksReportPath,
loadExternalLinksReport,
getLinksByStatus,
};

View File

@@ -0,0 +1,92 @@
const fs = require("node:fs/promises");
const path = require("node:path");
const { collectFilesByExtensions } = require("./content");
// File extensions scanned by default when searching for a URL; frozen so the
// shared list cannot be mutated.
const DEFAULT_URL_TEXT_EXTENSIONS = Object.freeze([
".json",
".markdown",
".md",
".yaml",
".yml",
]);
/**
 * Counts the exact, non-overlapping occurrences of a string in a text.
 * @param {string} text Text to scan.
 * @param {string} needle String to look for.
 * @returns {number} Number of occurrences; 0 when either argument is not a
 * string or when the needle is empty.
 */
function countOccurrences(text, needle) {
  const isSearchable = typeof text === "string" && typeof needle === "string" && needle.length > 0;
  if (!isSearchable) {
    return 0;
  }

  let total = 0;
  let cursor = text.indexOf(needle);
  while (cursor !== -1) {
    total += 1;
    // Resume after the match so occurrences never overlap.
    cursor = text.indexOf(needle, cursor + needle.length);
  }
  return total;
}
/**
 * Lists the text files that contain a given URL.
 *
 * Candidate files are collected under `rootDir` by extension, then read one
 * after the other as UTF-8; files without any occurrence are left out.
 * @param {string} rootDir Root directory to walk.
 * @param {string} targetUrl URL to look for.
 * @param {{ extensions?: string[] }} options Search options; `extensions`
 * defaults to DEFAULT_URL_TEXT_EXTENSIONS when it is not an array.
 * @returns {Promise<Array<{ filePath: string, occurrences: number }>>}
 */
async function findUrlOccurrences(rootDir, targetUrl, options = {}) {
  const extensions = Array.isArray(options.extensions)
    ? options.extensions
    : DEFAULT_URL_TEXT_EXTENSIONS;
  const candidates = await collectFilesByExtensions(rootDir, extensions);

  const matches = [];
  for (const filePath of candidates) {
    const text = await fs.readFile(filePath, "utf8");
    const occurrences = countOccurrences(text, targetUrl);
    if (occurrences <= 0) {
      continue;
    }
    matches.push({ filePath, occurrences });
  }
  return matches;
}
/**
 * Replaces every exact occurrence of a URL in a list of files.
 *
 * Occurrences are recounted on the content actually read, so the result stays
 * accurate even when the supplied `matches` are stale. Files in which nothing
 * is replaced are neither rewritten nor reported. An empty or non-string
 * `targetUrl` is a no-op: splitting on an empty string would otherwise insert
 * the replacement between every character of each file.
 * @param {string} rootDir Search root, used only when `options.matches` is absent.
 * @param {string} targetUrl URL to replace.
 * @param {string} replacementUrl Replacement URL.
 * @param {{ extensions?: string[], matches?: Array<{ filePath: string, occurrences: number }> }} options
 * Write options; `matches` skips the search and names the files to rewrite.
 * @returns {Promise<{ changedFiles: string[], totalOccurrences: number }>}
 * Absolute paths of the rewritten files and the number of replacements made.
 */
async function replaceUrlInFiles(rootDir, targetUrl, replacementUrl, options = {}) {
  const changedFiles = [];
  let totalOccurrences = 0;
  if (typeof targetUrl !== "string" || targetUrl === "") {
    return { changedFiles, totalOccurrences };
  }

  const matches = Array.isArray(options.matches)
    ? options.matches
    : await findUrlOccurrences(rootDir, targetUrl, options);

  for (const match of matches) {
    const filePath = path.resolve(match.filePath);
    const content = await fs.readFile(filePath, "utf8");
    const segments = content.split(targetUrl);
    const occurrences = segments.length - 1;
    if (occurrences === 0) {
      // Nothing to replace any more: leave the file untouched.
      continue;
    }
    await fs.writeFile(filePath, segments.join(replacementUrl), "utf8");
    changedFiles.push(filePath);
    totalOccurrences += occurrences;
  }
  return { changedFiles, totalOccurrences };
}
// Public API of the URL search-and-replace module.
module.exports = {
DEFAULT_URL_TEXT_EXTENSIONS,
countOccurrences,
findUrlOccurrences,
replaceUrlInFiles,
};