107 lines
3.8 KiB
JavaScript
107 lines
3.8 KiB
JavaScript
const puppeteer = require("puppeteer-extra");
|
||
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
|
||
const UserAgent = require("user-agents");
|
||
|
||
// Register the stealth plugin on the shared puppeteer-extra instance at module
// load, so it is already installed before scrapePage() launches any browser.
puppeteer.use(StealthPlugin());
|
||
|
||
/**
 * Scrape a webpage to extract metadata and, optionally, take a full-page screenshot.
 *
 * A fresh headless browser is launched for every call and is always closed
 * before the function settles, whether it resolves or rejects.
 *
 * Failures during navigation, extraction or screenshotting are logged and
 * swallowed: the promise still resolves with whatever metadata was collected
 * up to that point (fields not reached keep their defaults). Failures while
 * launching the browser or preparing the page reject the returned promise.
 *
 * @param {string} url - The URL of the page to scrape.
 * @param {string} [screenshotPath] - Path where the screenshot should be saved.
 *   No screenshot is taken when this is falsy.
 * @param {object} [options] - Optional settings.
 * @param {boolean} [options.screenshot=true] - Set to false to skip the
 *   screenshot even when a path is given.
 * @returns {Promise<{title: string, description: string, keywords: string[], lang: string, httpStatus: (number|null)}>}
 *   Metadata including title, description, keywords, language, and HTTP status.
 *   `httpStatus` is null when navigation failed or produced no response.
 */
async function scrapePage(url, screenshotPath, { screenshot = true } = {}) {
  console.log(`🔍 Scraping: ${url}`);

  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true, // Ignore invalid SSL certificates
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-blink-features=AutomationControlled",
      "--disable-web-security",
      "--ignore-certificate-errors", // Disable strict SSL checking
      "--ssl-version-min=tls1", // Ask for older SSL/TLS versions to be allowed
      // Single --disable-features switch: the previous separate
      // "--disable-features=site-per-process" entry was a subset of this one.
      "--disable-features=IsolateOrigins,site-per-process", // Avoid site isolation (blocked resources)
      "--disable-site-isolation-trials",
      "--disable-backgrounding-occluded-windows",
      "--disable-renderer-backgrounding",
      "--disable-background-timer-throttling",
      "--disable-client-side-phishing-detection",
    ],
  });

  const metadata = {
    title: "",
    description: "",
    keywords: [],
    lang: "unknown",
    httpStatus: null,
  };

  // Everything after a successful launch runs under try/finally so the browser
  // process is never leaked, even if page setup itself throws.
  try {
    const page = await browser.newPage();

    // Generate a fresh, realistic user-agent
    await page.setUserAgent(new UserAgent().toString());

    // Add headers to simulate a real browser
    await page.setExtraHTTPHeaders({
      "Referer": "https://www.google.com/",
      // Languages are comma-separated; ";q=" only attaches a weight to one entry.
      "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
      "Upgrade-Insecure-Requests": "1",
    });

    // Prevent detection of Puppeteer
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => undefined });
    });

    await page.setViewport({ width: 1920, height: 1080 });

    try {
      await page.emulateMediaFeatures([
        { name: "prefers-color-scheme", value: "dark" },
      ]);

      const response = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
      // goto() can resolve with null (no main-resource response); keep going so
      // the rest of the metadata is still collected.
      metadata.httpStatus = response ? response.status() : null;

      metadata.title = await page.title();
      metadata.description = await readMetaContent(page, "description");
      metadata.keywords = parseKeywords(await readMetaContent(page, "keywords"));

      // Detect page language (this callback runs inside the page, not in Node)
      metadata.lang = await page.evaluate(() => {
        // 1. Try <html lang="xx">
        const htmlLang = document.documentElement.lang;
        if (htmlLang) return htmlLang.toLowerCase();

        // 2. Fall back to og:locale, converting e.g. "fr_FR" to "fr"
        const ogLocale = document.querySelector('meta[property="og:locale"]');
        const primary = ogLocale ? ogLocale.content.split("_")[0].toLowerCase() : "";

        return primary || "unknown";
      });

      if (screenshot && screenshotPath) {
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(`✔ Screenshot saved: ${screenshotPath}`);
      }
    } catch (error) {
      // Best effort: report the failure and return the partial metadata.
      console.error(`❌ Error scraping page: ${error.message}`);
    }
  } finally {
    await browser.close();
  }

  return metadata;
}

/**
 * Read the `content` attribute of `<meta name="...">` on the current page.
 *
 * @param {object} page - Puppeteer page to query.
 * @param {string} name - Value of the meta tag's `name` attribute.
 * @returns {Promise<string>} The content, or "" when the lookup fails
 *   (for example because the tag is absent).
 */
function readMetaContent(page, name) {
  return page.$eval(`meta[name="${name}"]`, (el) => el.content).catch(() => "");
}

/**
 * Split a comma-separated keywords string into trimmed, non-empty entries.
 *
 * @param {string} content - Raw content of the keywords meta tag.
 * @returns {string[]} Keywords in document order; [] for empty input.
 */
function parseKeywords(content) {
  return (content || "")
    .split(",")
    .map((keyword) => keyword.trim())
    .filter(Boolean);
}
|
||
|
||
// Public API of this module: scrapePage is the only export.
module.exports = { scrapePage };
|