168 lines
5.3 KiB
JavaScript
168 lines
5.3 KiB
JavaScript
const { fetch } = require("undici");
|
|
|
|
// Wayback Machine CDX search endpoint; listArchiveCaptures adds its query parameters to this URL.
const ARCHIVE_CDX_URL = "https://web.archive.org/cdx/search/cdx";
|
|
// Save endpoint prefix; saveToArchive appends the URI-encoded target URL to it.
const ARCHIVE_SAVE_URL = "https://web.archive.org/save/";
|
|
// Delay, in milliseconds, after which an in-flight Archive.org request is aborted.
const ARCHIVE_REQUEST_TIMEOUT_MS = 15000;
|
|
|
|
/**
 * Build the public URL of a Wayback Machine capture.
 *
 * @param {string} originalUrl Original URL of the archived page (appended verbatim).
 * @param {string} timestamp Wayback timestamp identifying the capture.
 * @returns {string} archive.org URL that can be opened directly.
 */
function buildArchiveCaptureUrl(originalUrl, timestamp) {
  const captureRoot = "https://web.archive.org/web/";
  // String.prototype.concat stringifies its arguments exactly like a template literal.
  return captureRoot.concat(timestamp, "/", originalUrl);
}
|
|
|
|
/**
 * Coerce a value to a strictly positive integer, or return the fallback.
 *
 * Numbers are truncated arithmetically instead of being stringified first:
 * `parseInt(String(1e-7))` reads the leading "1" of "1e-7" and would yield 1,
 * and `parseInt(String(1e21))` would likewise yield 1. Any other value is
 * stringified and parsed as a base-10 integer prefix (so "12abc" gives 12).
 *
 * @param {unknown} value Value to normalize.
 * @param {number} fallback Value returned when `value` is not usable.
 * @returns {number} A positive safe integer, or `fallback`.
 */
function normalizePositiveInteger(value, fallback) {
  const parsed =
    typeof value === "number" ? Math.trunc(value) : Number.parseInt(String(value), 10);

  // isSafeInteger rejects NaN, +/-Infinity and magnitudes that no longer
  // represent an exact integer (those would print in exponent notation).
  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
    return fallback;
  }
  return parsed;
}
|
|
|
|
/**
 * Fetch and decode a JSON document from Archive.org within a maximum delay.
 *
 * The abort timer stays armed until the body has been read and parsed, so the
 * ARCHIVE_REQUEST_TIMEOUT_MS limit covers the whole exchange and not only the
 * arrival of the response headers.
 *
 * @param {string|URL} url URL to request.
 * @returns {Promise<unknown>} Decoded JSON document.
 * @throws {Error} When the response status is not OK; errors raised by the
 *   request itself (including an abort on timeout) or by JSON decoding
 *   propagate unchanged.
 */
async function fetchArchiveJson(url) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);

  try {
    const response = await fetch(url, { signal: controller.signal });
    if (!response.ok) {
      throw new Error(`Erreur de l'API Archive.org (${response.status})`);
    }
    // Await here (instead of returning the promise) so `finally` only clears
    // the timer once the body has actually been consumed.
    return await response.json();
  } finally {
    clearTimeout(timer);
  }
}
|
|
|
|
/**
 * Convert one CDX result row into a capture record.
 *
 * @param {unknown} row Raw row from the CDX JSON response.
 * @param {{ timestamp: number, original: number, statusCode: number, mimetype: number }} columns
 *   Index of each field inside a row, as found in the header row (-1 when absent).
 * @returns {{ timestamp: string, originalUrl: string, statusCode: number|null, mimetype: string|null, url: string }|null}
 *   The capture, or null when the row is not an array or lacks a string timestamp/original URL.
 */
function parseArchiveCaptureRow(row, columns) {
  if (!Array.isArray(row)) {
    return null;
  }

  const timestamp = row[columns.timestamp];
  const originalUrl = row[columns.original];
  if (typeof timestamp !== "string" || typeof originalUrl !== "string") {
    return null;
  }

  // A status code that does not parse as an integer is reported as null.
  let statusCode = null;
  if (columns.statusCode > -1) {
    const parsedStatusCode = Number.parseInt(row[columns.statusCode], 10);
    if (!Number.isNaN(parsedStatusCode)) {
      statusCode = parsedStatusCode;
    }
  }

  // Blank or non-string MIME types are reported as null.
  let mimetype = null;
  if (columns.mimetype > -1) {
    const rawMimetype = row[columns.mimetype];
    if (typeof rawMimetype === "string" && rawMimetype.trim()) {
      mimetype = rawMimetype.trim();
    }
  }

  return {
    timestamp,
    originalUrl,
    statusCode,
    mimetype,
    url: buildArchiveCaptureUrl(originalUrl, timestamp),
  };
}

/**
 * List the most recent Wayback captures available for a URL.
 *
 * Queries the CDX endpoint for captures with status 200, collapsed by digest,
 * asking for the last `limit` rows, then returns them newest first.
 *
 * @param {string} url Original URL to look up.
 * @param {{ limit?: number }} options Query options; `limit` defaults to 10.
 *   A null or omitted `options` is treated as `{}`.
 * @returns {Promise<Array<{ timestamp: string, originalUrl: string, statusCode: number|null, mimetype: string|null, url: string }>>}
 *   Captures sorted by descending timestamp, at most `limit` entries. Empty
 *   when the response is not a header row followed by at least one data row.
 */
async function listArchiveCaptures(url, options = {}) {
  // The default parameter only covers `undefined`; guard an explicit null too.
  const limit = normalizePositiveInteger((options || {}).limit, 10);

  const requestUrl = new URL(ARCHIVE_CDX_URL);
  requestUrl.searchParams.set("url", url);
  requestUrl.searchParams.set("output", "json");
  requestUrl.searchParams.set("fl", "timestamp,original,statuscode,mimetype,digest");
  requestUrl.searchParams.set("filter", "statuscode:200");
  requestUrl.searchParams.set("collapse", "digest");
  requestUrl.searchParams.set("fastLatest", "true");
  // A negative limit asks the CDX server for the last N rows.
  requestUrl.searchParams.set("limit", `-${limit}`);

  const rows = await fetchArchiveJson(requestUrl);
  if (!Array.isArray(rows) || rows.length <= 1) {
    return [];
  }

  // The first row names the columns; the remaining rows are data.
  const [header, ...dataRows] = rows;
  if (!Array.isArray(header)) {
    return [];
  }

  const columns = {
    timestamp: header.indexOf("timestamp"),
    original: header.indexOf("original"),
    statusCode: header.indexOf("statuscode"),
    mimetype: header.indexOf("mimetype"),
  };

  const captures = [];
  for (const row of dataRows) {
    const capture = parseArchiveCaptureRow(row, columns);
    if (capture !== null) {
      captures.push(capture);
    }
  }

  captures.sort((left, right) => right.timestamp.localeCompare(left.timestamp));
  return captures.slice(0, limit);
}
|
|
|
|
/**
 * Look up the most recent capture available for a URL.
 *
 * @param {string} url Original URL.
 * @returns {Promise<string|null>} archive.org URL of the newest capture, or
 *   null when no capture exists.
 */
async function getArchiveUrl(url) {
  const newestOnly = { limit: 1 };
  const found = await listArchiveCaptures(url, newestOnly);
  return found.length === 0 ? null : found[0].url;
}
|
|
|
|
/**
 * Ask Archive.org to archive a URL.
 *
 * Sends a POST to the save endpoint followed by the URI-encoded target URL,
 * aborting the request after ARCHIVE_REQUEST_TIMEOUT_MS.
 *
 * @param {string} url URL to archive.
 * @returns {Promise<string|null>} Final URL of the response when it left the
 *   save endpoint (taken to be the capture URL), otherwise null.
 * @throws {Error} When the response status is not OK; errors raised by the
 *   request itself (including an abort on timeout) propagate unchanged.
 */
async function saveToArchive(url) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ARCHIVE_REQUEST_TIMEOUT_MS);

  let response;
  try {
    response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, {
      method: "POST",
      signal: controller.signal,
    });
  } finally {
    clearTimeout(timer);
  }

  if (!response.ok) {
    throw new Error(`Erreur de sauvegarde Archive.org (${response.status})`);
  }

  // Still on the save endpoint: no capture URL to report. This is a prefix
  // test rather than a substring test because the archived page's own URL may
  // contain "/save/" (e.g. .../web/<ts>/https://site/save/x).
  if (response.url.startsWith(ARCHIVE_SAVE_URL)) {
    return null;
  }
  return response.url;
}
|
|
|
|
// Public API of this module.
module.exports = { buildArchiveCaptureUrl, listArchiveCaptures, getArchiveUrl, saveToArchive };
|