/**
 * External link checker.
 *
 * Walks the Markdown files under CONTENT_DIR and the YAML files under
 * DATA_DIR, collects the external URLs they contain, probes each one through
 * `scrapePage`, and prints a table of the links that look broken. Results are
 * kept in a YAML cache (CACHE_PATH) so a URL is not re-probed more than once
 * every CACHE_TTL_DAYS days.
 */
const fs = require("fs");
const path = require("path");
const yaml = require("js-yaml");
const { scrapePage } = require("./lib/puppeteer");
const readline = require("readline");

const CONTENT_DIR = path.join(__dirname, "..", "content");
const DATA_DIR = path.join(__dirname, "..", "data");
const SITE_ROOT = path.resolve(__dirname, "..");
const CACHE_PATH = path.join(DATA_DIR, "external_links.yaml");
const CACHE_TTL_DAYS = 7;
const MS_PER_DAY = 1000 * 60 * 60 * 24;

/**
 * Reads the link cache from CACHE_PATH.
 *
 * The cache is only an optimisation, so a missing, empty, unparsable or
 * wrongly-shaped file yields an empty cache instead of aborting the run.
 *
 * @returns {Object} Mapping of URL -> { status, checked }.
 */
function loadCache() {
  if (!fs.existsSync(CACHE_PATH)) return {};
  try {
    const loaded = yaml.load(fs.readFileSync(CACHE_PATH, "utf8"));
    const isMapping =
      typeof loaded === "object" && loaded !== null && !Array.isArray(loaded);
    if (isMapping) return loaded;
    // An empty file parses to null/undefined; anything else is unexpected.
    if (loaded != null) {
      console.error(`Ignoring link cache with unexpected content: ${CACHE_PATH}`);
    }
    return {};
  } catch (err) {
    console.error(`Ignoring unreadable link cache ${CACHE_PATH}: ${err.message}`);
    return {};
  }
}

const cache = loadCache();

// Reference time for cache expiry, fixed once at start-up. Entries written
// during this run are newer than `now` and therefore always count as valid.
const now = new Date();

// Accumulates one record per broken link occurrence for the final report.
const BAD_LINKS = [];

/**
 * Tells whether a value looks like an external link, i.e. is a string that
 * contains a "://" scheme separator.
 *
 * @param {*} link - Value to test.
 * @returns {boolean}
 */
function isExternalLink(link) {
  return typeof link === "string" && link.includes("://");
}

/**
 * Tells whether a cache entry is recent enough to be trusted.
 *
 * @param {{checked?: string}|undefined} entry - Cache entry for a URL.
 * @returns {boolean} True when `entry.checked` is less than CACHE_TTL_DAYS old.
 */
function isCacheValid(entry) {
  if (!entry?.checked) return false;
  const checkedAt = new Date(entry.checked);
  // An unparsable date gives NaN, which compares false and forces a re-check.
  return (now - checkedAt) / MS_PER_DAY < CACHE_TTL_DAYS;
}

/**
 * Extracts the http(s) URLs found in a piece of text.
 *
 * A URL stops at whitespace, `)`, `"`, `'` or `>`. Trailing sentence
 * punctuation is removed so that "see https://example.com." does not yield a
 * URL ending in ".".
 *
 * @param {string} text - Text to scan (typically one line of Markdown).
 * @returns {string[]} URLs in order of appearance (possibly empty).
 */
function extractLinksFromText(text) {
  const regex = /\bhttps?:\/\/[^\s)"'>]+/g;
  return (text.match(regex) || [])
    .map((url) => url.replace(/[.,;:!?]+$/, ""))
    // Drop matches that consisted of nothing but a scheme and punctuation.
    .filter((url) => !url.endsWith("://"));
}

/**
 * Checks one URL and records it in BAD_LINKS when it is broken.
 *
 * A fresh cache entry is reused instead of probing the URL again, but its
 * status is still evaluated so that a link found broken earlier keeps being
 * reported for every file that uses it. A link is considered broken when no
 * HTTP status is available or the status is >= 400.
 *
 * @param {string} file - Absolute path of the file containing the link.
 * @param {number|string} line - Line number of the link, or "?" if unknown.
 * @param {string} url - URL to check.
 * @returns {Promise<void>}
 */
async function checkLink(file, line, url) {
  let status;
  if (isCacheValid(cache[url])) {
    status = cache[url].status ?? null;
  } else {
    try {
      const meta = await scrapePage(url, null, { screenshot: false });
      status = meta?.httpStatus || null;
    } catch (err) {
      // A failed probe is treated like a missing status (broken link) so one
      // unreachable site cannot abort the whole scan.
      // NOTE(review): whether scrapePage rejects or resolves without
      // httpStatus on failure is not visible here - confirm in ./lib/puppeteer.
      status = null;
    }
    cache[url] = { status, checked: new Date().toISOString() };
  }

  if (!status || status >= 400) {
    BAD_LINKS.push({
      bundle: path.relative(SITE_ROOT, file),
      url,
      line,
      status,
    });
    process.stdout.write("❌");
  } else {
    process.stdout.write("✔");
  }
}

/**
 * Checks every http(s) URL found in a Markdown file, line by line.
 *
 * @param {string} filePath - Absolute path of the Markdown file.
 * @returns {Promise<void>}
 */
async function processMarkdown(filePath) {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber++;
    for (const link of extractLinksFromText(line)) {
      await checkLink(filePath, lineNumber, link);
    }
  }
}

/**
 * Collects every string value that looks like an external link from a parsed
 * YAML document, descending into arrays and objects. Only values are
 * inspected, never keys.
 *
 * @param {*} obj - Parsed YAML node.
 * @param {string[]} [links=[]] - Accumulator; mutated and returned.
 * @returns {string[]} The accumulator.
 */
function processYamlRecursively(obj, links = []) {
  if (typeof obj === "string") {
    if (isExternalLink(obj)) links.push(obj);
  } else if (Array.isArray(obj)) {
    for (const item of obj) processYamlRecursively(item, links);
  } else if (typeof obj === "object" && obj !== null) {
    for (const value of Object.values(obj)) processYamlRecursively(value, links);
  }
  return links;
}

/**
 * Checks every external link stored as a value in a YAML file.
 *
 * A file that cannot be read or parsed is logged and skipped. Line numbers
 * are not tracked for YAML, so links are reported with line "?".
 *
 * @param {string} filePath - Absolute path of the YAML file.
 * @returns {Promise<void>}
 */
async function processYaml(filePath) {
  let doc;
  try {
    doc = yaml.load(fs.readFileSync(filePath, "utf8"));
  } catch (e) {
    console.error(`Failed to parse YAML file: ${filePath} (${e.message})`);
    return;
  }
  // Checked outside the try block so that a link-checking failure is never
  // misreported as a YAML parse error.
  for (const link of processYamlRecursively(doc)) {
    await checkLink(filePath, "?", link);
  }
}

/**
 * Recursively lists the files under a directory that have one of the given
 * extensions.
 *
 * @param {string} dir - Directory to walk.
 * @param {string[]} exts - Extensions to keep, including the dot (e.g. ".md").
 * @returns {string[]} Absolute paths of the matching files.
 */
function walk(dir, exts) {
  const results = [];
  for (const entry of fs.readdirSync(dir)) {
    const fullPath = path.resolve(dir, entry);
    if (fs.statSync(fullPath).isDirectory()) {
      results.push(...walk(fullPath, exts));
    } else if (exts.includes(path.extname(fullPath))) {
      results.push(fullPath);
    }
  }
  return results;
}

/**
 * Entry point: scans all content and data files, saves the cache and prints
 * the broken-link report.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const mdFiles = walk(CONTENT_DIR, [".md"]);
  const yamlFiles = walk(DATA_DIR, [".yaml", ".yml"]);
  console.log(`Scanning ${mdFiles.length} Markdown and ${yamlFiles.length} YAML files...`);

  try {
    for (const file of mdFiles) {
      await processMarkdown(file);
    }
    for (const file of yamlFiles) {
      await processYaml(file);
    }
  } finally {
    // Persist whatever was checked, even if the scan stopped part-way, so the
    // work already done is not repeated on the next run.
    fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
  }

  console.log("\n\n=== Broken External Links Report ===");
  if (BAD_LINKS.length === 0) {
    console.log("✅ No broken external links found.");
  } else {
    console.table(BAD_LINKS);
  }
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});