1
Files
2025/tools/lib/archive.js

168 lines
5.3 KiB
JavaScript

const { fetch } = require("undici");
// Wayback Machine CDX search endpoint, queried to list the captures of a URL.
const ARCHIVE_CDX_URL = "https://web.archive.org/cdx/search/cdx";
// Wayback Machine save endpoint; the URL to archive is appended to this prefix.
const ARCHIVE_SAVE_URL = "https://web.archive.org/save/";
// Delay, in milliseconds, after which a pending Archive.org request is aborted.
const ARCHIVE_REQUEST_TIMEOUT_MS = 15000;
/**
 * Builds the public URL of a Wayback capture.
 * @param {string} originalUrl Original (archived) URL.
 * @param {string} timestamp Wayback timestamp of the capture.
 * @returns {string} archive.org URL that can be used directly.
 */
function buildArchiveCaptureUrl(originalUrl, timestamp) {
  const captureRoot = "https://web.archive.org/web";
  // The original URL is appended verbatim after the timestamp segment.
  const capturePath = `${timestamp}/${originalUrl}`;
  return `${captureRoot}/${capturePath}`;
}
/**
 * Coerces a value to a strictly positive integer.
 * @param {unknown} value Value to check; it is stringified, then parsed in base 10.
 * @param {number} fallback Value returned when parsing fails or the result is not positive.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function normalizePositiveInteger(value, fallback) {
  const candidate = Number.parseInt(String(value), 10);
  // NaN fails the comparison as well, so one test rejects both unparsable
  // input and values that are zero or negative.
  return candidate > 0 ? candidate : fallback;
}
/**
 * Loads a JSON document from Archive.org, aborting after a maximum delay.
 *
 * The abort timer stays armed until the body has been fully read and parsed,
 * so a response whose headers arrive quickly but whose body stalls is still
 * interrupted after ARCHIVE_REQUEST_TIMEOUT_MS.
 *
 * @param {string|URL} url URL to call.
 * @returns {Promise<unknown>} Decoded JSON document.
 * @throws {Error} When the response status is not OK; the promise also
 *   rejects when the request is aborted or the body is not valid JSON.
 */
async function fetchArchiveJson(url) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);
  try {
    const response = await fetch(url, { signal: controller.signal });
    if (!response.ok) {
      throw new Error(`Erreur de l'API Archive.org (${response.status})`);
    }
    // Await here (instead of returning the bare promise) so that the timer is
    // only cleared once the body has been consumed.
    return await response.json();
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Lists the recent Wayback captures available for a URL.
 *
 * Queries the CDX endpoint for captures with status code 200, collapsed by
 * digest, and returns them newest first. Rows whose timestamp or original URL
 * is not a string are skipped.
 *
 * @param {string} url Original URL to look up.
 * @param {{ limit?: number }} options Query options; `limit` defaults to 10.
 * @returns {Promise<Array<{ timestamp: string, originalUrl: string, statusCode: number|null, mimetype: string|null, url: string }>>}
 *   At most `limit` captures, sorted by descending timestamp.
 */
async function listArchiveCaptures(url, options = {}) {
  const limit = normalizePositiveInteger(options.limit, 10);

  const requestUrl = new URL(ARCHIVE_CDX_URL);
  const query = [
    ["url", url],
    ["output", "json"],
    ["fl", "timestamp,original,statuscode,mimetype,digest"],
    ["filter", "statuscode:200"],
    ["collapse", "digest"],
    ["fastLatest", "true"],
    // A negative limit asks the CDX server for the last N rows.
    ["limit", `-${limit}`],
  ];
  for (const [key, value] of query) {
    requestUrl.searchParams.set(key, value);
  }

  const rows = await fetchArchiveJson(requestUrl);
  // The first row is the column header; anything else is treated as "no captures".
  if (!Array.isArray(rows) || rows.length <= 1 || !Array.isArray(rows[0])) {
    return [];
  }

  const header = rows[0];
  const columns = {
    timestamp: header.indexOf("timestamp"),
    original: header.indexOf("original"),
    statusCode: header.indexOf("statuscode"),
    mimetype: header.indexOf("mimetype"),
  };

  const captures = rows.slice(1).flatMap((row) => {
    if (!Array.isArray(row)) {
      return [];
    }
    const timestamp = row[columns.timestamp];
    const originalUrl = row[columns.original];
    if (typeof timestamp !== "string" || typeof originalUrl !== "string") {
      return [];
    }

    // Optional columns: fall back to null when absent or unusable.
    const parsedStatus =
      columns.statusCode > -1 ? Number.parseInt(row[columns.statusCode], 10) : Number.NaN;
    const rawMimetype = columns.mimetype > -1 ? row[columns.mimetype] : null;
    const cleanMimetype = typeof rawMimetype === "string" ? rawMimetype.trim() : "";

    return [
      {
        timestamp,
        originalUrl,
        statusCode: Number.isNaN(parsedStatus) ? null : parsedStatus,
        mimetype: cleanMimetype === "" ? null : cleanMimetype,
        url: buildArchiveCaptureUrl(originalUrl, timestamp),
      },
    ];
  });

  // Newest capture first.
  captures.sort((left, right) => right.timestamp.localeCompare(left.timestamp));
  return captures.slice(0, limit);
}
/**
 * Returns the most recent capture available for a URL.
 * @param {string} url Original URL.
 * @returns {Promise<string|null>} archive.org URL, or null when no capture exists.
 */
async function getArchiveUrl(url) {
  // Only the newest capture is needed, so ask for a single result.
  const found = await listArchiveCaptures(url, { limit: 1 });
  return found.length === 0 ? null : found[0].url;
}
/**
 * Asks Archive.org to archive a URL.
 *
 * Sends a POST to the save endpoint with the percent-encoded target URL
 * appended, aborting the request after ARCHIVE_REQUEST_TIMEOUT_MS.
 *
 * @param {string} url URL to archive.
 * @returns {Promise<string|null>} Final URL of the response when it no longer
 *   points at the save endpoint, otherwise null.
 * @throws {Error} When the response status is not OK; the promise also
 *   rejects when the request is aborted.
 */
async function saveToArchive(url) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);
  let response;
  try {
    response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, {
      method: "POST",
      signal: controller.signal,
    });
  } finally {
    clearTimeout(timer);
  }

  // Only the status and the final URL are used. An unread body keeps the
  // underlying connection busy, so release it explicitly. This is best-effort
  // cleanup: a failure to cancel must not change the outcome of the call.
  if (response.body) {
    await response.body.cancel().catch(() => {});
  }

  if (!response.ok) {
    throw new Error(`Erreur de sauvegarde Archive.org (${response.status})`);
  }
  // Still on the save endpoint: no capture URL is available.
  if (response.url.includes("/save/")) {
    return null;
  }
  return response.url;
}
// Exported functions: capture URL construction, capture listing, latest-capture
// lookup, and save requests.
module.exports = {
buildArchiveCaptureUrl,
listArchiveCaptures,
getArchiveUrl,
saveToArchive,
};