const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const UserAgent = require("user-agents");

puppeteer.use(StealthPlugin());

/**
 * Scrape a webpage to extract metadata and optionally take a full-page screenshot.
 *
 * Errors raised while navigating or extracting data are logged and swallowed:
 * the metadata collected up to that point is returned. Errors raised while
 * launching the browser or preparing the page reject the returned promise.
 * The browser is closed in every case once it has been launched.
 *
 * @param {string} url - The URL of the page to scrape.
 * @param {string} screenshotPath - Path where the screenshot should be saved.
 * @param {Object} [options] - Optional settings.
 * @param {boolean} [options.screenshot=true] - Whether to capture a screenshot
 *   (only done when `screenshotPath` is also truthy).
 * @returns {Promise<{title: string, description: string, keywords: string[], lang: string, httpStatus: (number|null)}>}
 *   Metadata including title, description, keywords, language, and HTTP status.
 */
async function scrapePage(url, screenshotPath, { screenshot = true } = {}) {
  console.log(`🔍 Scraping: ${url}`);

  // Defaults returned as-is for any field that could not be extracted.
  const metadata = {
    title: "",
    description: "",
    keywords: [],
    lang: "unknown",
    httpStatus: null,
  };

  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true, // Ignore invalid SSL certificates
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-blink-features=AutomationControlled",
      "--disable-web-security",
      "--ignore-certificate-errors", // Disable strict SSL checking
      "--ssl-version-min=tls1", // Allow older SSL/TLS versions
      // Avoid site isolation (fixes blocked resources). Listed once: this
      // entry already includes the "site-per-process" feature.
      "--disable-features=IsolateOrigins,site-per-process",
      "--disable-site-isolation-trials",
      "--disable-backgrounding-occluded-windows",
      "--disable-renderer-backgrounding",
      "--disable-background-timer-throttling",
      "--disable-client-side-phishing-detection",
    ],
  });

  // The outer try/finally guarantees the browser is closed even when page
  // setup fails; setup errors still propagate to the caller.
  try {
    const page = await browser.newPage();

    // Generate a fresh, realistic user-agent
    const userAgent = new UserAgent();
    await page.setUserAgent(userAgent.toString());

    // Add headers to simulate a real browser
    await page.setExtraHTTPHeaders({
      "Referer": "https://www.google.com/",
      // Comma-separated language ranges with descending quality values.
      "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
      "Upgrade-Insecure-Requests": "1",
    });

    // Prevent detection of Puppeteer
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => undefined });
    });

    await page.setViewport({ width: 1920, height: 1080 });

    try {
      await page.emulateMediaFeatures([
        { name: "prefers-color-scheme", value: "dark" },
      ]);

      const response = await page.goto(url, {
        waitUntil: "domcontentloaded",
        timeout: 30000,
      });
      // page.goto may resolve to null (no main-resource response); keep the
      // status as null in that case instead of aborting the extraction.
      metadata.httpStatus = response ? response.status() : null;

      // Extract metadata; a missing <meta> tag falls back to an empty value.
      metadata.title = await page.title();
      metadata.description = await page
        .$eval('meta[name="description"]', (el) => el.content)
        .catch(() => "");

      const keywordsContent = await page
        .$eval('meta[name="keywords"]', (el) => el.content)
        .catch(() => "");
      // Drop empty entries produced by empty content or stray commas.
      metadata.keywords = keywordsContent
        .split(",")
        .map((keyword) => keyword.trim())
        .filter((keyword) => keyword.length > 0);

      // Detect page language
      metadata.lang = await page.evaluate(() => {
        // 1. Try the lang attribute of the root <html> element
        const htmlLang = document.documentElement.lang;
        if (htmlLang) return htmlLang.toLowerCase();

        // 2. Try meta tags (og:locale), converting e.g. "fr_FR" to "fr"
        const ogLocale = document.querySelector('meta[property="og:locale"]');
        const primaryLang = ogLocale
          ? ogLocale.content.split("_")[0].trim().toLowerCase()
          : "";

        return primaryLang || "unknown";
      });

      if (screenshot && screenshotPath) {
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(`✔ Screenshot saved: ${screenshotPath}`);
      }
    } catch (error) {
      // Best-effort scrape: report the failure and return what was collected.
      console.error(`❌ Error scraping page: ${error.message}`);
    }
  } finally {
    await browser.close();
  }

  return metadata;
}

module.exports = { scrapePage };