const { fetch } = require("undici"); const ARCHIVE_CDX_URL = "https://web.archive.org/cdx/search/cdx"; const ARCHIVE_SAVE_URL = "https://web.archive.org/save/"; const ARCHIVE_REQUEST_TIMEOUT_MS = 15000; /** * Construit l'URL publique d'une capture Wayback. * @param {string} originalUrl URL d'origine. * @param {string} timestamp Horodatage Wayback. * @returns {string} URL archive.org utilisable directement. */ function buildArchiveCaptureUrl(originalUrl, timestamp) { return `https://web.archive.org/web/${timestamp}/${originalUrl}`; } /** * Borne une valeur numerique a un entier strictement positif. * @param {unknown} value Valeur a verifier. * @param {number} fallback Valeur par defaut. * @returns {number} Entier positif. */ function normalizePositiveInteger(value, fallback) { const parsed = Number.parseInt(String(value), 10); if (Number.isNaN(parsed)) { return fallback; } if (parsed <= 0) { return fallback; } return parsed; } /** * Charge un document JSON Archive.org avec un delai maximal. * @param {string|URL} url URL a appeler. * @returns {Promise} Document JSON decode. */ async function fetchArchiveJson(url) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS); const response = await fetch(url, { signal: controller.signal }).finally(() => clearTimeout(timer)); if (!response.ok) { throw new Error(`Erreur de l'API Archive.org (${response.status})`); } return response.json(); } /** * Liste les captures Wayback recentes disponibles pour une URL. * @param {string} url URL d'origine a rechercher. * @param {{ limit?: number }} options Options de requete. * @returns {Promise>} */ async function listArchiveCaptures(url, options = {}) { const limit = normalizePositiveInteger(options.limit, 10); const requestUrl = new URL(ARCHIVE_CDX_URL); requestUrl.searchParams.set("url", url); requestUrl.searchParams.set("output", "json"); requestUrl.searchParams.set("fl", "timestamp,original,statuscode,mimetype,digest"); requestUrl.searchParams.set("filter", "statuscode:200"); requestUrl.searchParams.set("collapse", "digest"); requestUrl.searchParams.set("fastLatest", "true"); requestUrl.searchParams.set("limit", `-${limit}`); const rows = await fetchArchiveJson(requestUrl); if (!Array.isArray(rows)) { return []; } if (rows.length <= 1) { return []; } const header = rows[0]; if (!Array.isArray(header)) { return []; } const timestampIndex = header.indexOf("timestamp"); const originalIndex = header.indexOf("original"); const statusCodeIndex = header.indexOf("statuscode"); const mimetypeIndex = header.indexOf("mimetype"); const captures = []; for (const row of rows.slice(1)) { if (!Array.isArray(row)) { continue; } const timestamp = row[timestampIndex]; const originalUrl = row[originalIndex]; if (typeof timestamp !== "string") { continue; } if (typeof originalUrl !== "string") { continue; } let statusCode = null; if (statusCodeIndex > -1) { const parsedStatusCode = Number.parseInt(row[statusCodeIndex], 10); if (!Number.isNaN(parsedStatusCode)) { statusCode = parsedStatusCode; } } let mimetype = null; if (mimetypeIndex > -1) { const rawMimetype = row[mimetypeIndex]; if (typeof rawMimetype === "string" && rawMimetype.trim()) { mimetype = rawMimetype.trim(); } } captures.push({ timestamp, originalUrl, statusCode, mimetype, url: buildArchiveCaptureUrl(originalUrl, timestamp), }); } captures.sort((left, right) => right.timestamp.localeCompare(left.timestamp)); return captures.slice(0, limit); } /** * Retourne la capture la plus recente disponible pour une URL. * @param {string} url URL d'origine. * @returns {Promise} URL archive.org, ou null si aucune capture n'existe. */ async function getArchiveUrl(url) { const captures = await listArchiveCaptures(url, { limit: 1 }); if (captures.length === 0) { return null; } return captures[0].url; } /** * Demande a Archive.org d'archiver une URL. * @param {string} url URL a archiver. * @returns {Promise} URL finale de la capture si disponible. */ async function saveToArchive(url) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS); const response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, { method: "POST", signal: controller.signal, }).finally(() => clearTimeout(timer)); if (!response.ok) { throw new Error(`Erreur de sauvegarde Archive.org (${response.status})`); } if (response.url.includes("/save/")) { return null; } return response.url; } module.exports = { buildArchiveCaptureUrl, listArchiveCaptures, getArchiveUrl, saveToArchive, };