// File: 2025/tools/lib/puppeteer.js
// Last modified: 2025-03-28 12:57:37 +01:00 (107 lines, 3.8 KiB, JavaScript)
//
// Note from the repository viewer: this file contains invisible Unicode
// characters that are indistinguishable to humans but may be processed
// differently by a computer. If this is intentional, the warning can be
// safely ignored; the viewer's Escape button reveals the characters.
// Browser automation entry point; plugins are attached to it through `.use()` below.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
// Instantiated in scrapePage; its string form is passed to page.setUserAgent().
const UserAgent = require("user-agents");
// Register the stealth plugin once, at module load, before any browser is launched.
puppeteer.use(StealthPlugin());
/**
 * Scrape a webpage to extract metadata and, optionally, take a full-page screenshot.
 *
 * A new headless browser is launched for every call and is always closed
 * before the returned promise settles. Errors raised while navigating,
 * extracting metadata or taking the screenshot are logged and swallowed: the
 * metadata gathered up to that point is returned. Errors raised while
 * launching the browser or preparing the page are propagated to the caller.
 *
 * @param {string} url - The URL of the page to scrape.
 * @param {string} [screenshotPath] - Path where the screenshot should be saved.
 *   No screenshot is taken when this is falsy.
 * @param {object} [options] - Optional settings.
 * @param {boolean} [options.screenshot=true] - Set to false to skip the screenshot.
 * @returns {Promise<{title: string, description: string, keywords: string[], lang: string, httpStatus: (number|null)}>}
 *   Metadata including title, description, keywords, language, and HTTP status.
 */
async function scrapePage(url, screenshotPath, { screenshot = true } = {}) {
  console.log(`🔍 Scraping: ${url}`);

  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true, // Ignore invalid SSL certificates
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-blink-features=AutomationControlled",
      "--disable-web-security",
      "--ignore-certificate-errors", // Disable strict SSL checking
      "--ssl-version-min=tls1", // Allow older SSL/TLS versions
      // Avoid site isolation (fixes blocked resources). Passed once: a second
      // standalone `--disable-features=site-per-process` entry was redundant.
      "--disable-features=IsolateOrigins,site-per-process",
      "--disable-site-isolation-trials",
      "--disable-backgrounding-occluded-windows",
      "--disable-renderer-backgrounding",
      "--disable-background-timer-throttling",
      "--disable-client-side-phishing-detection",
    ],
  });

  const metadata = {
    title: "",
    description: "",
    keywords: [],
    lang: "unknown",
    httpStatus: null,
  };

  // try/finally guarantees the browser process is released even when page
  // preparation throws.
  try {
    const page = await browser.newPage();

    // Generate a fresh, realistic user-agent
    const userAgent = new UserAgent();
    await page.setUserAgent(userAgent.toString());

    // Add headers to simulate a real browser
    await page.setExtraHTTPHeaders({
      "Referer": "https://www.google.com/",
      // Comma-separated language ranges, each with an optional q-weight.
      "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
      "Upgrade-Insecure-Requests": "1",
    });

    // Prevent detection of Puppeteer
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => undefined });
    });
    await page.setViewport({ width: 1920, height: 1080 });

    try {
      await page.emulateMediaFeatures([
        { name: "prefers-color-scheme", value: "dark" },
      ]);

      const response = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
      // page.goto() may resolve to null; keep extracting metadata in that case.
      metadata.httpStatus = response ? response.status() : null;

      // Extract metadata; a missing <meta> tag makes $eval reject, hence the fallbacks.
      metadata.title = await page.title();
      metadata.description = await page
        .$eval('meta[name="description"]', (el) => el.content)
        .catch(() => "");
      const rawKeywords = await page
        .$eval('meta[name="keywords"]', (el) => el.content)
        .catch(() => "");
      metadata.keywords = String(rawKeywords || "")
        .split(",")
        .map((keyword) => keyword.trim())
        .filter(Boolean); // drop empty entries from blank content or stray commas

      // Detect page language (this callback runs inside the page, not in Node).
      metadata.lang = await page.evaluate(() => {
        // 1. Prefer the language declared on <html lang="xx">.
        const htmlLang = (document.documentElement.lang || "").trim();
        if (htmlLang) return htmlLang.toLowerCase();
        // 2. Fall back to the Open Graph locale, converting "fr_FR" to "fr".
        const ogLocale = document.querySelector('meta[property="og:locale"]');
        const ogLang = ogLocale ? (ogLocale.content || "").split("_")[0].trim() : "";
        return ogLang ? ogLang.toLowerCase() : "unknown";
      });

      if (screenshot && screenshotPath) {
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(`✔ Screenshot saved: ${screenshotPath}`);
      }
    } catch (error) {
      // Best effort: report the failure and return whatever was collected.
      console.error(`❌ Error scraping page: ${error.message}`);
    }
  } finally {
    await browser.close();
  }

  return metadata;
}
// Public API of this module (CommonJS): only scrapePage is exported.
module.exports = { scrapePage };