Initial commit
This commit is contained in:
186
tools/add_link.js
Normal file
186
tools/add_link.js
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
const { execSync } = require("child_process");
|
||||
const fs = require("fs");
|
||||
const crypto = require("crypto");
|
||||
const path = require("path");
|
||||
const os = require("os");
|
||||
const YAML = require("yaml");
|
||||
const { getArchiveUrl, saveToArchive } = require("./lib/archive");
|
||||
const { scrapePage } = require("./lib/puppeteer");
|
||||
|
||||
// YAML registry mapping an 8-char URL hash to the bundle path it was filed
// under; used to detect already-imported links. Path is relative to hugoRoot.
const KNOWN_LINKS_FILE = "data/known_links.yaml"; // YAML file with { hash: path }

if (process.argv.length < 3) {
  console.error("Usage: add_link.js <URL> [optional: YYYY-MM-DD]");
  process.exit(1);
}

const url = process.argv[2];
// Optional backdating of the entry; null means "use the current date".
const customDate = process.argv[3] || null;

// Generate an MD5 hash of the URL, truncated to 8 hex chars, used as a
// stable filesystem-safe identifier for the bundle directory.
const urlHash = crypto.createHash("md5").update(url).digest("hex").slice(0, 8);

// Resolve paths from the current working directory, which is assumed to be
// the Hugo site root — TODO confirm the script is always run from there.
const hugoRoot = path.resolve(process.cwd());
const knownLinksPath = path.join(hugoRoot, KNOWN_LINKS_FILE);

// Load known links from YAML. An unparsable registry aborts the run so we
// never risk creating a duplicate bundle; an empty file yields {}.
let knownLinks = {};
if (fs.existsSync(knownLinksPath)) {
  try {
    knownLinks = YAML.parse(fs.readFileSync(knownLinksPath, "utf8")) || {};
  } catch (err) {
    console.error(`❌ Unable to parse ${KNOWN_LINKS_FILE}: ${err.message}`);
    process.exit(1);
  }
}
// Exit with success when the URL was already imported (idempotent re-runs).
if (knownLinks[urlHash]) {
  console.log(`⚠ Link already exists: ${url}`);
  process.exit(0);
}
|
||||
|
||||
// Main pipeline: resolve/create an Archive.org snapshot, scrape the page,
// create a dated Hugo bundle, inject the scraped metadata into its
// frontmatter, and register the URL hash in known_links.yaml.
(async () => {
  let archiveUrl = await getArchiveUrl(url);

  // If the URL is not archived, attempt to save it; archiving is best-effort
  // and the pipeline continues without an archive link on failure.
  if (!archiveUrl) {
    console.log(`📂 No archive found. Attempting to save ${url}...`);
    archiveUrl = await saveToArchive(url);
    if (!archiveUrl) {
      console.log(`⚠ Warning: Unable to archive ${url}. Continuing without archive.`);
    }
  }

  console.log(`📂 Archive URL ${archiveUrl}...`);

  // Determine the entry date (custom date from argv or "now"); reject
  // anything Date cannot parse.
  let entryDate = customDate ? new Date(customDate) : new Date();
  if (isNaN(entryDate.getTime())) {
    console.error("❌ Invalid date format. Use YYYY-MM-DD.");
    process.exit(1);
  }

  const now = new Date(); // Current date for status
  const formattedEntryDate = entryDate.toISOString().split("T")[0]; // YYYY-MM-DD
  const formattedStatusDate = now.toISOString(); // ISO format
  // Human-readable French date used in the screenshot metadata file.
  const formattedDateFrench = entryDate.toLocaleDateString("fr-FR", {
    year: "numeric",
    month: "long",
    day: "numeric",
    hour: "2-digit",
    minute: "2-digit",
  });

  // Date components for the bundle's year/month/day directory layout.
  const year = entryDate.getFullYear();
  const month = String(entryDate.getMonth() + 1).padStart(2, "0");
  const day = String(entryDate.getDate()).padStart(2, "0");

  // Define paths inside the bundle that "hugo new" will create below.
  const bundlePath = path.join(hugoRoot, `content/interets/liens-interessants/${year}/${month}/${day}/${urlHash}/`);
  const imagesPath = path.join(bundlePath, "images");
  const dataPath = path.join(bundlePath, "data");
  const finalScreenshotPath = path.join(imagesPath, "screenshot.png");
  const metadataPath = path.join(dataPath, "screenshot.yaml");

  // Store screenshot in a temporary location first, so nothing is written
  // into the content tree unless scraping succeeds.
  const tempScreenshotPath = path.join(os.tmpdir(), `screenshot_${urlHash}.png`);

  // Scrape the page and capture a screenshot.
  // NOTE(review): scrapePage destructures its third argument; confirm it has
  // a default value, or pass { screenshot: true } here explicitly.
  console.log(`🔍 Scraping page and capturing screenshot...`);
  const metadata = await scrapePage(url, tempScreenshotPath);

  // If Puppeteer failed (no metadata or no screenshot file), do not proceed.
  if (!metadata || !fs.existsSync(tempScreenshotPath)) {
    console.error(`❌ Scraping failed. No bundle will be created.`);
    process.exit(1);
  }

  // Create Hugo bundle only if scraping succeeded; relies on the
  // "liens-interessants" archetype existing in the site.
  console.log(`📦 Creating Hugo bundle for ${url}...`);
  execSync(`hugo new --kind liens-interessants interets/liens-interessants/${year}/${month}/${day}/${urlHash}/`, { stdio: "inherit" });

  if (!fs.existsSync(bundlePath)) {
    console.error("❌ Failed to create the bundle.");
    process.exit(1);
  }

  // Move the screenshot to the final destination inside the bundle.
  if (!fs.existsSync(imagesPath)) fs.mkdirSync(imagesPath, { recursive: true });
  fs.renameSync(tempScreenshotPath, finalScreenshotPath);

  // Modify the frontmatter generated by the archetype, replacing its
  // placeholder fields with the scraped metadata.
  const indexPath = path.join(bundlePath, "index.md");
  let content = fs.readFileSync(indexPath, "utf8");

  // Inject date (replaces the archetype's "date:" line).
  content = content.replace(/^date: .*/m, `date: ${formattedEntryDate}`);

  // Inject status: one JSON-ish record with check time and HTTP code.
  const statusEntry = `{"date": "${formattedStatusDate}", "http_code": ${metadata.httpStatus || "null"}}`;
  content = content.replace("status: []", `status: [${statusEntry}]`);

  // Inject title and description, escaping double quotes for YAML.
  if (metadata.title) {
    content = content.replace(/title: ".*?"/, `title: "${metadata.title.replace(/"/g, '\\"')}"`);
  }
  if (metadata.description) {
    content = content.replace("> [description]", `> ${metadata.description.replace(/"/g, '\\"')}`);
  } else {
    content = content.replace("> [description]\n\n", ""); // Remove placeholder if no description
  }

  // Inject keywords as an inline YAML list.
  if (metadata.keywords.length > 0) {
    content = content.replace("keywords: []", `keywords: ["${metadata.keywords.join('", "')}"]`);
  }

  // Inject cover image path (relative to the bundle).
  content = content.replace('cover: ""', `cover: "images/screenshot.png"`);

  // Inject links (and remove any pre-existing urls:/links: placeholder keys).
  const links = [];

  links.push({
    name: "Page d'origine",
    url: url,
    lang: metadata.lang || "unknown",
  });

  if (archiveUrl) {
    links.push({
      name: "Archive",
      url: archiveUrl,
      archive: true,
    });
  }

  // Serialize the links list and splice it right after the opening "---".
  const linksYaml = YAML.stringify({ links }).trim();
  content = content.replace(/^urls: \[\]\n?/m, "");
  content = content.replace(/^links: \[\]\n?/m, "");
  content = content.replace(/^---/, `---\n${linksYaml}`);

  fs.writeFileSync(indexPath, content);

  // Create metadata folder if necessary.
  if (!fs.existsSync(dataPath)) fs.mkdirSync(dataPath, { recursive: true });

  // Write metadata for the screenshot (attribution sidecar YAML).
  console.log("📝 Writing metadata...");
  const metadataContent = `title: "Capture d'écran de ${url}"
description: "Capture effectuée le ${formattedDateFrench}"
attribution: "Richard Dern"
file: "images/screenshot.png"
`;
  fs.writeFileSync(metadataPath, metadataContent);
  console.log(`✔ Metadata saved: ${metadataPath}`);

  // Append the hash to known_links.yaml so future runs detect the duplicate.
  knownLinks[urlHash] = path.relative(hugoRoot, bundlePath);
  fs.writeFileSync(knownLinksPath, YAML.stringify(knownLinks));

  console.log(`🎉 Link successfully added! Bundle path: ${bundlePath}`);
  // Bare path on the last line so callers can capture it from stdout.
  console.log(bundlePath);
})();
|
||||
66
tools/add_link_to_article.js
Normal file
66
tools/add_link_to_article.js
Normal file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const yaml = require("js-yaml");
|
||||
const { scrapePage } = require("./lib/puppeteer");
|
||||
|
||||
if (process.argv.length < 4) {
  console.error("Usage: add_link_to_article.js <URL> <path_to_article>");
  process.exit(1);
}

const url = process.argv[2];
// Path to the article's bundle directory (the one containing index.md).
const articlePath = process.argv[3];
const indexPath = path.join(articlePath, "index.md");

// Check that the article's index.md exists before doing anything else.
if (!fs.existsSync(indexPath)) {
  console.error(`❌ The article at ${indexPath} does not exist.`);
  process.exit(1);
}

// Load the YAML frontmatter from index.md: everything between the first
// pair of "---" delimiters.
const fileContent = fs.readFileSync(indexPath, "utf8");
const frontmatterMatch = fileContent.match(/^---\n([\s\S]+?)\n---/);

if (!frontmatterMatch) {
  console.error("❌ No valid frontmatter found in the article.");
  process.exit(1);
}

// Parse the frontmatter and make sure a links array exists.
let frontmatter = yaml.load(frontmatterMatch[1]);
if (!frontmatter.links) frontmatter.links = [];

// Exit successfully if the link is already present (idempotent re-runs).
if (frontmatter.links.some(link => link.url === url)) {
  console.log(`⚠ The link "${url}" is already in the article.`);
  process.exit(0);
}
|
||||
|
||||
// Scrape the page to retrieve its title and language, then append the link
// to the article's frontmatter and rewrite index.md.
(async () => {
  console.log(`🔍 Retrieving metadata for ${url}...`);
  // Pass the options object explicitly: scrapePage destructures its third
  // argument, so calling it with only two arguments throws before the page
  // is loaded. { screenshot: false } also skips the capture entirely instead
  // of writing it to /dev/null.
  const metadata = await scrapePage(url, null, { screenshot: false });

  if (!metadata.title) {
    console.error("❌ Failed to retrieve page title. Skipping.");
    process.exit(1);
  }

  frontmatter.links.push({
    name: metadata.title,
    lang: metadata.lang || "unknown",
    url: url,
    official: false
  });

  // Rebuild the YAML frontmatter and re-assemble the document: new
  // frontmatter followed by the original body (old frontmatter stripped).
  const newFrontmatter = yaml.dump(frontmatter);
  const newContent = `---\n${newFrontmatter}---\n${fileContent.replace(frontmatterMatch[0], "").trim()}`;

  // Write the file back.
  fs.writeFileSync(indexPath, newContent, "utf8");

  console.log(`✔ Link successfully added to ${indexPath}`);
})();
|
||||
126
tools/check_external_links.js
Normal file
126
tools/check_external_links.js
Normal file
@@ -0,0 +1,126 @@
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const yaml = require("js-yaml");
|
||||
const { scrapePage } = require("./lib/puppeteer");
|
||||
const readline = require("readline");
|
||||
|
||||
// tools/ lives one level below the site root.
const CONTENT_DIR = path.join(__dirname, "..", "content");
const DATA_DIR = path.join(__dirname, "..", "data");
const SITE_ROOT = path.resolve(__dirname, "..");
// YAML cache of previously checked URLs: { url: { status, checked } }.
const CACHE_PATH = path.join(DATA_DIR, "external_links.yaml");
// Cached results younger than this many days are trusted without re-checking.
const CACHE_TTL_DAYS = 7;

// Load the cache if present; an empty or null document falls back to {}.
let cache = {};
if (fs.existsSync(CACHE_PATH)) {
  cache = yaml.load(fs.readFileSync(CACHE_PATH, "utf8")) || {};
}

// Single timestamp for the whole run, used by the cache-freshness check.
const now = new Date();
// Accumulates { bundle, url, line, status } records for the final report.
const BAD_LINKS = [];
|
||||
|
||||
/**
 * Decide whether a value is an external (scheme-qualified) link.
 * @param {*} link - Candidate value; non-strings are always rejected.
 * @returns {boolean} True when the string contains a "://" scheme separator.
 */
function isExternalLink(link) {
  if (typeof link !== "string") return false;
  return link.includes("://");
}
|
||||
|
||||
/**
 * Check whether a cache entry is still fresh.
 * @param {{checked?: string}|undefined} entry - Cached record with an ISO "checked" timestamp.
 * @returns {boolean} True when the entry was checked less than CACHE_TTL_DAYS days ago.
 */
function isCacheValid(entry) {
  const checkedAt = entry?.checked;
  if (!checkedAt) return false;
  const ageMs = now - new Date(checkedAt);
  const msPerDay = 1000 * 60 * 60 * 24;
  return ageMs / msPerDay < CACHE_TTL_DAYS;
}
|
||||
|
||||
/**
 * Pull every http(s) URL out of a chunk of text.
 * @param {string} text - Raw text to scan.
 * @returns {string[]} All matched URLs, or an empty array when none are found.
 */
function extractLinksFromText(text) {
  const matches = text.match(/\bhttps?:\/\/[^\s)"'>]+/g);
  return matches ?? [];
}
|
||||
|
||||
/**
 * Check a single external URL, using the YAML cache to skip links that were
 * verified within the TTL window.
 * Side effects: writes cache[url], appends failures to BAD_LINKS, and prints
 * one progress character per fresh check.
 * @param {string} file - Path of the file the link was found in.
 * @param {number|string} line - Line number of the link ("?" for YAML files).
 * @param {string} url - The external URL to check.
 */
async function checkLink(file, line, url) {
  // Skip URLs whose cached result is still fresh.
  if (isCacheValid(cache[url])) return;

  // Scrape without a screenshot just to obtain the HTTP status.
  const meta = await scrapePage(url, null, { screenshot: false });
  cache[url] = {
    status: meta.httpStatus || null,
    checked: new Date().toISOString(),
  };

  const bundle = path.relative(SITE_ROOT, file);

  // A missing status (navigation failure) or any 4xx/5xx counts as broken.
  if (!meta.httpStatus || meta.httpStatus >= 400) {
    BAD_LINKS.push({ bundle, url, line, status: meta.httpStatus });
    process.stdout.write("❌");
  } else {
    process.stdout.write("✔");
  }
}
|
||||
|
||||
/**
 * Scan a Markdown file line by line and check every external URL found.
 * @param {string} filePath - Absolute path of the Markdown file.
 */
async function processMarkdown(filePath) {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber++;
    const links = extractLinksFromText(line);
    for (const link of links) {
      // Links are checked one at a time (each check is awaited).
      await checkLink(filePath, lineNumber, link);
    }
  }
}
|
||||
|
||||
/**
 * Walk a parsed YAML document and collect every external link found in
 * strings, arrays, and nested objects.
 * @param {*} obj - Any YAML node (string, array, object, or other scalar).
 * @param {string[]} [links] - Accumulator threaded through recursive calls.
 * @returns {string[]} The accumulator containing all discovered links.
 */
function processYamlRecursively(obj, links = []) {
  if (typeof obj === "string" && isExternalLink(obj)) {
    links.push(obj);
    return links;
  }
  if (Array.isArray(obj)) {
    obj.forEach((item) => processYamlRecursively(item, links));
    return links;
  }
  if (obj !== null && typeof obj === "object") {
    Object.values(obj).forEach((value) => processYamlRecursively(value, links));
    return links;
  }
  return links;
}
|
||||
|
||||
/**
 * Load a YAML file, collect every external link anywhere in its structure,
 * and check each one. Line numbers are unknown for YAML, hence "?".
 * @param {string} filePath - Absolute path of the YAML file.
 */
async function processYaml(filePath) {
  try {
    const doc = yaml.load(fs.readFileSync(filePath, "utf8"));
    const links = processYamlRecursively(doc);
    for (const link of links) {
      await checkLink(filePath, "?", link);
    }
  } catch (e) {
    // Parse failures are reported but do not abort the overall scan.
    console.error(`Failed to parse YAML file: ${filePath}`);
  }
}
|
||||
|
||||
/**
 * Recursively collect files under `dir` whose extension is in `exts`.
 * @param {string} dir - Directory to scan.
 * @param {string[]} exts - Allowed extensions, with leading dot (e.g. ".md").
 * @returns {string[]} Absolute paths of all matching files.
 */
function walk(dir, exts) {
  const results = [];
  for (const entry of fs.readdirSync(dir)) {
    const fullPath = path.resolve(dir, entry);
    if (fs.statSync(fullPath).isDirectory()) {
      results.push(...walk(fullPath, exts));
    } else if (exts.includes(path.extname(fullPath))) {
      results.push(fullPath);
    }
  }
  return results;
}
|
||||
|
||||
// Entry point: scan all Markdown content and YAML data files, check every
// external link, persist the cache, then print a report of broken links.
(async () => {
  const mdFiles = walk(CONTENT_DIR, [".md"]);
  const yamlFiles = walk(DATA_DIR, [".yaml", ".yml"]);
  console.log(`Scanning ${mdFiles.length} Markdown and ${yamlFiles.length} YAML files...`);

  for (const file of mdFiles) {
    await processMarkdown(file);
  }
  for (const file of yamlFiles) {
    await processYaml(file);
  }

  // Persist results (including failures) so fresh entries are skipped next run.
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));

  console.log("\n\n=== Broken External Links Report ===");
  if (BAD_LINKS.length === 0) {
    console.log("✅ No broken external links found.");
  } else {
    console.table(BAD_LINKS);
  }
})();
|
||||
102
tools/check_internal_links.js
Normal file
102
tools/check_internal_links.js
Normal file
@@ -0,0 +1,102 @@
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const http = require("http");
|
||||
const readline = require("readline");
|
||||
|
||||
// Local Hugo dev server that serves the rendered site for link checking.
const BASE_URL = "http://127.0.0.1:1313";
// tools/ lives one level below the site root.
const CONTENT_DIR = path.join(__dirname, "..", "content");
const SITE_ROOT = path.resolve(__dirname, "..");
// Accumulates [file, link, line] triples for the final report.
const BAD_LINKS = [];
|
||||
|
||||
/**
 * Decide whether a Markdown link target points inside this site.
 * @param {string} link - Raw link target from a Markdown file.
 * @returns {boolean} True unless the target is scheme-qualified or a mailto:/tel: URI.
 */
function isInternalLink(link) {
  if (link.includes("://")) return false;
  if (link.startsWith("mailto:") || link.startsWith("tel:")) return false;
  return true;
}
|
||||
|
||||
/**
 * Extract every Markdown link target `](target)` from a single line.
 * @param {string} line - One line of Markdown text.
 * @returns {string[]} The link targets, in order of appearance.
 */
function extractLinksFromLine(line) {
  const targets = [];
  for (const match of line.matchAll(/\]\(([^)]+)\)/g)) {
    targets.push(match[1]);
  }
  return targets;
}
|
||||
|
||||
/**
 * Translate a Markdown link target into a site-absolute URL path.
 * Absolute targets (starting with "/") pass through unchanged; relative
 * targets are resolved against the Markdown file's bundle directory and
 * re-expressed relative to the content root.
 * @param {string} mdPath - Path of the Markdown file containing the link.
 * @param {string} link - The raw link target.
 * @returns {string} URL path to request from the local Hugo server.
 */
function getBundleRelativeUrl(mdPath, link) {
  if (link.startsWith("/")) {
    return link;
  }
  const bundleRoot = path.dirname(mdPath);
  const fullPath = path.resolve(bundleRoot, link);
  const relative = path.relative(CONTENT_DIR, fullPath);
  // Normalize Windows separators so the result is a valid URL path.
  return "/" + relative.replace(/\\/g, "/");
}
|
||||
|
||||
/**
 * Request an internal URL from the local Hugo dev server and record it in
 * BAD_LINKS when the response is anything other than HTTP 200.
 * @param {string} file - Markdown file the link came from.
 * @param {number} lineNumber - Line the link appears on.
 * @param {string} link - Raw link target from the Markdown source.
 * @returns {Promise<void>} Resolves once the check completes; never rejects.
 */
async function checkLink(file, lineNumber, link) {
  const relativeUrl = getBundleRelativeUrl(file, link);
  const fullUrl = `${BASE_URL}${relativeUrl}`;
  return new Promise((resolve) => {
    http.get(fullUrl, (res) => {
      if (res.statusCode !== 200) {
        BAD_LINKS.push([path.relative(SITE_ROOT, file), link, lineNumber]);
      }
      // Drain the response body so the socket is released.
      res.resume();
      resolve();
    }).on("error", () => {
      // Connection failures (e.g. dev server not running) count as broken too.
      BAD_LINKS.push([path.relative(SITE_ROOT, file), link, lineNumber]);
      resolve();
    });
  });
}
|
||||
|
||||
/**
 * Read a Markdown file line by line and check every internal link found.
 * @param {string} filePath - Absolute path of the Markdown file.
 */
async function processFile(filePath) {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
  let lineNumber = 0;

  for await (const line of rl) {
    lineNumber++;
    const links = extractLinksFromLine(line);
    for (const link of links) {
      if (isInternalLink(link)) {
        // One dot per checked link as a progress indicator.
        process.stdout.write(".");
        await checkLink(filePath, lineNumber, link);
      }
    }
  }
}
|
||||
|
||||
/**
 * Recursively gather all Markdown files under a directory.
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Absolute paths of every .md file found.
 */
function walk(dir) {
  let found = [];
  for (const entry of fs.readdirSync(dir)) {
    const fullPath = path.resolve(dir, entry);
    const stat = fs.statSync(fullPath);
    if (stat && stat.isDirectory()) {
      found = found.concat(walk(fullPath));
    } else if (fullPath.endsWith(".md")) {
      found.push(fullPath);
    }
  }
  return found;
}
|
||||
|
||||
// Entry point: walk the content tree, check every internal link against the
// local Hugo server (expected at BASE_URL), then print a summary report.
(async () => {
  const files = walk(CONTENT_DIR);
  console.log(`Analyzing ${files.length} Markdown files...`);
  for (const file of files) {
    await processFile(file);
  }

  console.log("\n\n=== Broken Internal Links Report ===");
  if (BAD_LINKS.length === 0) {
    console.log("✅ No broken internal links found.");
  } else {
    console.table(BAD_LINKS.map(([f, u, l]) => ({ File: f, URL: u, Line: l })));
  }
})();
|
||||
37
tools/lib/archive.js
Normal file
37
tools/lib/archive.js
Normal file
@@ -0,0 +1,37 @@
|
||||
// Archive.org endpoints: availability lookup and "Save Page Now".
const ARCHIVE_API_URL = "https://archive.org/wayback/available?url=";
const ARCHIVE_SAVE_URL = "https://web.archive.org/save/";

/**
 * Check if a given URL exists in Archive.org.
 * @param {string} url - The URL to check.
 * @returns {Promise<string|null>} - The archive URL if found, otherwise null.
 */
async function getArchiveUrl(url) {
  try {
    const endpoint = `${ARCHIVE_API_URL}${encodeURIComponent(url)}`;
    const response = await fetch(endpoint);
    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    const data = await response.json();
    // The availability API returns the closest snapshot, if any.
    return data.archived_snapshots?.closest?.url || null;
  } catch (error) {
    // Network or API failures degrade to "no archive available".
    console.error(`❌ Archive.org API error: ${error.message}`);
    return null;
  }
}

/**
 * Request Archive.org to save the given URL.
 * @param {string} url - The URL to archive.
 * @returns {Promise<string|null>} - The permalink of the archived page if successful, otherwise null.
 */
async function saveToArchive(url) {
  try {
    const endpoint = `${ARCHIVE_SAVE_URL}${encodeURIComponent(url)}`;
    const response = await fetch(endpoint, { method: "POST" });
    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    // When the final URL is still on /save/, no permalink was produced.
    if (response.url.includes("/save/")) {
      return null;
    }
    return response.url;
  } catch (error) {
    console.error(`❌ Failed to save URL to Archive.org: ${error.message}`);
    return null;
  }
}
|
||||
|
||||
module.exports = { getArchiveUrl, saveToArchive };
|
||||
106
tools/lib/puppeteer.js
Normal file
106
tools/lib/puppeteer.js
Normal file
@@ -0,0 +1,106 @@
|
||||
const puppeteer = require("puppeteer-extra");
|
||||
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
|
||||
const UserAgent = require("user-agents");
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
/**
 * Scrape a webpage to extract metadata and take a screenshot.
 * @param {string} url - The URL of the page to scrape.
 * @param {string|null} screenshotPath - Path where the screenshot should be saved.
 * @param {{screenshot?: boolean}} [options] - Set screenshot to false to skip the capture.
 * @returns {Promise<object>} - Metadata including title, description, keywords, language, and HTTP status.
 */
async function scrapePage(url, screenshotPath, { screenshot = true } = {}) {
  // The "= {}" default is required: several callers invoke scrapePage with
  // only two arguments, and destructuring undefined throws a TypeError.
  console.log(`🔍 Scraping: ${url}`);

  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true, // ✅ Ignore invalid SSL certificates
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-blink-features=AutomationControlled",
      "--disable-web-security",
      "--disable-features=site-per-process",
      "--ignore-certificate-errors", // ✅ Disable strict SSL checking
      "--ssl-version-min=tls1", // ✅ Allow older SSL/TLS versions
      "--disable-features=IsolateOrigins,site-per-process", // ✅ Avoid site isolation (fixes blocked resources)
      "--disable-site-isolation-trials",
      "--disable-backgrounding-occluded-windows",
      "--disable-renderer-backgrounding",
      "--disable-background-timer-throttling",
      "--disable-client-side-phishing-detection",
    ],
  });

  const page = await browser.newPage();

  // Generate a fresh, realistic user-agent
  const userAgent = new UserAgent();
  await page.setUserAgent(userAgent.toString());

  // Add headers to simulate a real browser
  await page.setExtraHTTPHeaders({
    "Referer": "https://www.google.com/",
    "Accept-Language": "fr-FR;fr;en-US,en;q=0.9",
    "Upgrade-Insecure-Requests": "1",
  });

  // Prevent detection of Puppeteer
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, "webdriver", { get: () => undefined });
  });

  await page.setViewport({ width: 1920, height: 1080 });

  // Defaults returned when scraping fails partway through.
  let metadata = {
    title: "",
    description: "",
    keywords: [],
    lang: "unknown",
    httpStatus: null,
  };

  try {
    await page.emulateMediaFeatures([
      { name: "prefers-color-scheme", value: "dark" },
    ]);
    const response = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    // page.goto() can resolve to null (e.g. same-document navigation);
    // guard instead of crashing on .status().
    metadata.httpStatus = response ? response.status() : null;

    // Extract metadata
    metadata.title = await page.title();

    metadata.description = await page.$eval('meta[name="description"]', el => el.content).catch(() => "");
    metadata.keywords = await page.$eval('meta[name="keywords"]', el => el.content)
      .then(content => content.split(",").map(k => k.trim()))
      .catch(() => []);

    // 🌍 Detect page language
    metadata.lang = await page.evaluate(() => {
      // 1️⃣ Try to get language from <html lang="xx">
      let lang = document.documentElement.lang;
      if (lang) return lang.toLowerCase();

      // 2️⃣ Try meta tags (og:locale)
      const metaLang = document.querySelector('meta[property="og:locale"]');
      if (metaLang) return metaLang.content.split("_")[0].toLowerCase(); // Convert "fr_FR" to "fr"

      return "unknown";
    });

    if (screenshot && screenshotPath) {
      await page.screenshot({ path: screenshotPath, fullPage: true });
      console.log(`✔ Screenshot saved: ${screenshotPath}`);
    }

  } catch (error) {
    // Scraping errors are non-fatal: callers receive whatever metadata was
    // collected before the failure.
    console.error(`❌ Error scraping page: ${error.message}`);
  }

  await browser.close();
  return metadata;
}
|
||||
|
||||
module.exports = { scrapePage };
|
||||
21
tools/test_archive.js
Normal file
21
tools/test_archive.js
Normal file
@@ -0,0 +1,21 @@
|
||||
// Manual smoke test for the archive helpers: look up an existing snapshot
// for a known URL and, failing that, request a fresh capture.
const { getArchiveUrl, saveToArchive } = require("./lib/archive");

(async () => {
  const testUrl = "https://richard.dern.ovh";

  console.log(`🔍 Checking Archive.org for: ${testUrl}`);
  let archiveUrl = await getArchiveUrl(testUrl);

  if (archiveUrl) {
    console.log(`✔ Archive found: ${archiveUrl}`);
    return;
  }

  console.log(`❌ No archive found, requesting a new one...`);
  archiveUrl = await saveToArchive(testUrl);

  if (archiveUrl) {
    console.log(`✔ URL successfully archived: ${archiveUrl}`);
  } else {
    console.log(`❌ Failed to archive the URL.`);
  }
})();
|
||||
13
tools/test_puppeteer.js
Normal file
13
tools/test_puppeteer.js
Normal file
@@ -0,0 +1,13 @@
|
||||
// Manual smoke test for the Puppeteer scraping module: scrape a known page,
// save a screenshot next to this script, and dump the extracted metadata.
const { scrapePage } = require("./lib/puppeteer");
const path = require("path");

(async () => {
  const testUrl = "https://richard.dern.ovh";
  const screenshotPath = path.join(__dirname, "test_screenshot.png");

  console.log(`🔍 Testing Puppeteer module on: ${testUrl}`);
  // Pass the options object explicitly: scrapePage destructures its third
  // argument, so omitting it crashes before the page is ever loaded.
  const metadata = await scrapePage(testUrl, screenshotPath, { screenshot: true });

  console.log("📄 Page metadata:");
  console.log(metadata);
})();
|
||||
Reference in New Issue
Block a user