1

Initial commit

This commit is contained in:
2025-03-28 12:57:37 +01:00
commit ed9ddcfdc8
1841 changed files with 42303 additions and 0 deletions

37
tools/lib/archive.js Normal file
View File

@@ -0,0 +1,37 @@
// Availability lookup endpoint: the URL-encoded target URL is appended as the `url` query value.
const ARCHIVE_API_URL = "https://archive.org/wayback/available?url=";
// Save endpoint: the URL-encoded target URL is appended directly to this path.
const ARCHIVE_SAVE_URL = "https://web.archive.org/save/";
/**
 * Look up the closest archived snapshot of a URL on Archive.org.
 *
 * This is a best-effort lookup: any failure (network error, non-2xx status,
 * unparsable body) is logged to the console and reported as `null`.
 *
 * @param {string} url - The URL to look up.
 * @returns {Promise<string|null>} - The snapshot URL when one is reported, otherwise null.
 */
async function getArchiveUrl(url) {
  try {
    const lookupUrl = `${ARCHIVE_API_URL}${encodeURIComponent(url)}`;
    const res = await fetch(lookupUrl);
    if (!res.ok) {
      throw new Error(`HTTP error! Status: ${res.status}`);
    }

    const payload = await res.json();
    // The snapshot is nested under archived_snapshots.closest; either level may be absent.
    const snapshotUrl = payload.archived_snapshots?.closest?.url;
    return snapshotUrl ? snapshotUrl : null;
  } catch (err) {
    console.error(`❌ Archive.org API error: ${err.message}`);
    return null;
  }
}
/**
 * Request Archive.org to save the given URL.
 *
 * Sends a POST to the save endpoint and uses the final response URL as the
 * permalink. If the final URL is still on the save endpoint, or the response
 * carries no URL, the capture is treated as unsuccessful.
 *
 * This is a best-effort call: any failure (network error, non-2xx status,
 * unparsable final URL) is logged to the console and reported as `null`.
 *
 * @param {string} url - The URL to archive.
 * @returns {Promise<string|null>} - The permalink of the archived page if successful, otherwise null.
 */
async function saveToArchive(url) {
  try {
    const response = await fetch(`${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`, { method: "POST" });
    if (!response.ok) throw new Error(`HTTP error! Status: ${response.status}`);
    if (!response.url) return null;

    // Inspect only the start of the path: the archived target URL is embedded
    // later in the permalink and may itself legitimately contain "/save/".
    const stillOnSaveEndpoint = new URL(response.url).pathname.startsWith("/save/");
    return stillOnSaveEndpoint ? null : response.url;
  } catch (error) {
    console.error(`❌ Failed to save URL to Archive.org: ${error.message}`);
    return null;
  }
}
// Public API of this module: snapshot lookup and save request.
module.exports = { getArchiveUrl, saveToArchive };

106
tools/lib/puppeteer.js Normal file
View File

@@ -0,0 +1,106 @@
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const UserAgent = require("user-agents");
// Register the stealth plugin once at module load so every browser launched
// through this puppeteer-extra instance has it applied.
puppeteer.use(StealthPlugin());
/**
 * Scrape a webpage to extract metadata and optionally take a screenshot.
 *
 * A new browser is launched for every call and is always closed before the
 * function settles, including when page setup fails. Errors raised while
 * navigating or extracting data are logged and the metadata collected so far
 * is returned; errors raised while launching the browser or preparing the
 * page are propagated to the caller.
 *
 * @param {string} url - The URL of the page to scrape.
 * @param {string} [screenshotPath] - Path where the screenshot should be saved.
 * @param {object} [options] - Optional settings.
 * @param {boolean} [options.screenshot=true] - Whether to take a full-page screenshot
 *   (only done when `screenshotPath` is also provided).
 * @returns {Promise<object>} - Metadata including title, description, keywords, language, and HTTP status.
 */
async function scrapePage(url, screenshotPath, { screenshot = true } = {}) {
  console.log(`🔍 Scraping: ${url}`);

  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true, // Ignore invalid SSL certificates
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-blink-features=AutomationControlled",
      "--disable-web-security",
      "--disable-features=site-per-process",
      "--ignore-certificate-errors", // Disable strict SSL checking
      "--ssl-version-min=tls1", // Allow older SSL/TLS versions
      "--disable-features=IsolateOrigins,site-per-process", // Avoid site isolation (fixes blocked resources)
      "--disable-site-isolation-trials",
      "--disable-backgrounding-occluded-windows",
      "--disable-renderer-backgrounding",
      "--disable-background-timer-throttling",
      "--disable-client-side-phishing-detection",
    ],
  });

  // Defaults returned as-is for any field that could not be extracted.
  const metadata = {
    title: "",
    description: "",
    keywords: [],
    lang: "unknown",
    httpStatus: null,
  };

  // Everything after launch runs under try/finally so the browser process is
  // never left running, even if page setup throws.
  try {
    const page = await browser.newPage();

    // Generate a fresh, realistic user-agent
    const userAgent = new UserAgent();
    await page.setUserAgent(userAgent.toString());

    // Add headers to simulate a real browser
    await page.setExtraHTTPHeaders({
      "Referer": "https://www.google.com/",
      // Comma-separated language ranges with descending q-values (French preferred).
      "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
      "Upgrade-Insecure-Requests": "1",
    });

    // Prevent detection of Puppeteer
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => undefined });
    });
    await page.setViewport({ width: 1920, height: 1080 });

    try {
      await page.emulateMediaFeatures([
        { name: "prefers-color-scheme", value: "dark" },
      ]);

      const response = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
      // goto may resolve without a response object; keep the null default
      // instead of throwing and skipping the rest of the extraction.
      metadata.httpStatus = response ? response.status() : null;

      // Extract metadata; a missing meta tag makes $eval reject, hence the fallbacks.
      metadata.title = await page.title();
      metadata.description = await page
        .$eval('meta[name="description"]', (el) => el.content)
        .catch(() => "");
      metadata.keywords = await page
        .$eval('meta[name="keywords"]', (el) => el.content)
        // Drop empty entries produced by an empty attribute or stray commas.
        .then((content) => content.split(",").map((k) => k.trim()).filter(Boolean))
        .catch(() => []);

      // Detect page language
      metadata.lang = await page.evaluate(() => {
        // 1. Try to get language from <html lang="xx">
        const htmlLang = document.documentElement.lang;
        if (htmlLang) return htmlLang.toLowerCase();

        // 2. Try meta tags (og:locale); convert "fr_FR" to "fr"
        const metaLang = document.querySelector('meta[property="og:locale"]');
        const locale = metaLang ? metaLang.content.split("_")[0].toLowerCase() : "";
        return locale || "unknown";
      });

      if (screenshot && screenshotPath) {
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(`✔ Screenshot saved: ${screenshotPath}`);
      }
    } catch (error) {
      // Best effort: report the failure and return whatever was collected.
      console.error(`❌ Error scraping page: ${error.message}`);
    }
  } finally {
    await browser.close();
  }

  return metadata;
}
// Public API of this module: the page scraper.
module.exports = { scrapePage };