Initial commit

2025-03-28 12:57:37 +01:00
commit ed9ddcfdc8
1841 changed files with 42303 additions and 0 deletions
--- a/tools/lib/archive.js
+++ b/tools/lib/archive.js
@@ -0,0 +1,37 @@
+// Wayback availability endpoint; the URI-encoded page URL is appended as the `url` query value.
+const ARCHIVE_API_URL = "https://archive.org/wayback/available?url=";
+// Save endpoint prefix; the URI-encoded page URL is appended to it when requesting a capture.
+const ARCHIVE_SAVE_URL = "https://web.archive.org/save/";
+
+/**
+ * Check if a given URL exists in Archive.org.
+ * @param {string} url - The URL to check.
+ * @returns {Promise<string|null>} - The archive URL if found, otherwise null.
+ */
+async function getArchiveUrl(url) {
+    try {
+        const response = await fetch(`${ARCHIVE_API_URL}${encodeURIComponent(url)}`);
+        if (!response.ok) throw new Error(`HTTP error! Status: ${response.status}`);
+        const data = await response.json();
+        return data.archived_snapshots?.closest?.url || null;
+    } catch (error) {
+        console.error(`❌ Archive.org API error: ${error.message}`);
+        return null;
+    }
+}
+
+/**
+ * Request Archive.org to save the given URL.
+ * @param {string} url - The URL to archive.
+ * @returns {Promise<string|null>} - The permalink of the archived page if successful, otherwise null.
+ */
+async function saveToArchive(url) {
+    try {
+        const response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, { method: "POST" });
+        if (!response.ok) throw new Error(`HTTP error! Status: ${response.status}`);
+        return response.url.includes("/save/") ? null : response.url;
+    } catch (error) {
+        console.error(`❌ Failed to save URL to Archive.org: ${error.message}`);
+        return null;
+    }
+}
+
+module.exports = { getArchiveUrl, saveToArchive };
--- a/tools/lib/puppeteer.js
+++ b/tools/lib/puppeteer.js
@@ -0,0 +1,106 @@
+const puppeteer = require("puppeteer-extra");
+const StealthPlugin = require("puppeteer-extra-plugin-stealth");
+const UserAgent = require("user-agents");
+
+puppeteer.use(StealthPlugin());
+
+/**
+ * Scrape a webpage to extract metadata and take a screenshot.
+ * @param {string} url - The URL of the page to scrape.
+ * @param {string} screenshotPath - Path where the screenshot should be saved.
+ * @returns {Promise<object>} - Metadata including title, description, keywords, language, and HTTP status.
+ */
+async function scrapePage(url, screenshotPath, { screenshot = true }) {
+    console.log(`🔍 Scraping: ${url}`);
+
+    const browser = await puppeteer.launch({
+        headless: true,
+        ignoreHTTPSErrors: true, // ✅ Ignore invalid SSL certificates
+        args: [
+            "--no-sandbox",
+            "--disable-setuid-sandbox",
+            "--disable-blink-features=AutomationControlled",
+            "--disable-web-security",
+            "--disable-features=site-per-process",
+            "--ignore-certificate-errors", // ✅ Disable strict SSL checking
+            "--ssl-version-min=tls1", // ✅ Allow older SSL/TLS versions
+            "--disable-features=IsolateOrigins,site-per-process", // ✅ Avoid site isolation (fixes blocked resources)
+            "--disable-site-isolation-trials",
+            "--disable-backgrounding-occluded-windows",
+            "--disable-renderer-backgrounding",
+            "--disable-background-timer-throttling",
+            "--disable-client-side-phishing-detection",
+        ],
+    });
+
+    const page = await browser.newPage();
+
+    // Generate a fresh, realistic user-agent
+    const userAgent = new UserAgent();
+    await page.setUserAgent(userAgent.toString());
+
+    // Add headers to simulate a real browser
+    await page.setExtraHTTPHeaders({
+        "Referer": "https://www.google.com/",
+        "Accept-Language": "fr-FR;fr;en-US,en;q=0.9",
+        "Upgrade-Insecure-Requests": "1",
+    });
+
+    // Prevent detection of Puppeteer
+    await page.evaluateOnNewDocument(() => {
+        Object.defineProperty(navigator, "webdriver", { get: () => undefined });
+    });
+
+    await page.setViewport({ width: 1920, height: 1080 });
+
+    let metadata = {
+        title: "",
+        description: "",
+        keywords: [],
+        lang: "unknown",
+        httpStatus: null,
+    };
+
+    try {
+        await page.emulateMediaFeatures([
+            { name: "prefers-color-scheme", value: "dark" },
+        ]);
+        const response = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
+        metadata.httpStatus = response.status();
+
+        // Extract metadata
+        metadata.title = await page.title();
+
+        metadata.description = await page.$eval('meta[name="description"]', el => el.content).catch(() => "");
+        metadata.keywords = await page.$eval('meta[name="keywords"]', el => el.content)
+            .then(content => content.split(",").map(k => k.trim()))
+            .catch(() => []);
+
+        // 🌍 Detect page language
+        metadata.lang = await page.evaluate(() => {
+            // 1️⃣ Try to get language from <html lang="xx">
+            let lang = document.documentElement.lang;
+            if (lang) return lang.toLowerCase();
+
+            // 2️⃣ Try meta tags (og:locale)
+            const metaLang = document.querySelector('meta[property="og:locale"]');
+            if (metaLang) return metaLang.content.split("_")[0].toLowerCase(); // Convert "fr_FR" to "fr"
+
+            return "unknown";
+        });
+
+
+        if (screenshot && screenshotPath) {
+            await page.screenshot({ path: screenshotPath, fullPage: true });
+            console.log(`✔ Screenshot saved: ${screenshotPath}`);
+        }
+
+    } catch (error) {
+        console.error(`❌ Error scraping page: ${error.message}`);
+    }
+
+    await browser.close();
+    return metadata;
+}
+
+module.exports = { scrapePage };