127 lines
3.5 KiB
JavaScript
127 lines
3.5 KiB
JavaScript
const fs = require("fs");
|
|
const path = require("path");
|
|
const yaml = require("js-yaml");
|
|
const { scrapePage } = require("./lib/puppeteer");
|
|
const readline = require("readline");
|
|
|
|
// Directories are resolved relative to the parent of this script's directory.
// CONTENT_DIR is scanned for Markdown files, DATA_DIR for YAML files.
const CONTENT_DIR = path.join(__dirname, "..", "content");
const DATA_DIR = path.join(__dirname, "..", "data");
// Paths in the final report are printed relative to this root.
const SITE_ROOT = path.resolve(__dirname, "..");
// YAML file that persists link-check results between runs.
const CACHE_PATH = path.join(DATA_DIR, "external_links.yaml");
// A cached result older than this many days is checked again.
const CACHE_TTL_DAYS = 7;

// Maps a URL to `{ status, checked }`. It starts empty and is replaced by the
// persisted cache only when that file parses to a plain mapping: a scalar or
// list would make later `cache[url] = ...` writes misbehave, and a corrupt
// cache is not a good reason to abort the whole scan.
let cache = {};
if (fs.existsSync(CACHE_PATH)) {
  try {
    const loaded = yaml.load(fs.readFileSync(CACHE_PATH, "utf8"));
    if (loaded !== null && typeof loaded === "object" && !Array.isArray(loaded)) {
      cache = loaded;
    }
  } catch (err) {
    console.warn(`Ignoring unreadable link cache ${CACHE_PATH}: ${err.message}`);
  }
}

// Reference time used to compute the age of cached entries for the whole run.
const now = new Date();
// Collected `{ bundle, url, line, status }` records for links that failed.
const BAD_LINKS = [];
|
|
|
|
/**
 * Tells whether a value looks like an external link.
 *
 * @param {*} link - Candidate value of any type.
 * @returns {boolean} `true` only for strings that contain `://`.
 */
function isExternalLink(link) {
  if (typeof link !== "string") {
    return false;
  }
  return link.indexOf("://") !== -1;
}
|
|
|
|
/**
 * Decides whether a cached link-check result is still fresh.
 *
 * @param {{checked?: string}|null|undefined} entry - Cache record for a URL.
 * @returns {boolean} `true` when `entry.checked` is set and lies less than
 *   `CACHE_TTL_DAYS` days before the module-level `now`.
 */
function isCacheValid(entry) {
  const checkedAt = entry?.checked;
  if (!checkedAt) {
    return false;
  }

  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  // An unparseable timestamp yields NaN, which compares as not fresh.
  const ageInDays = (now - new Date(checkedAt)) / MS_PER_DAY;
  return ageInDays < CACHE_TTL_DAYS;
}
|
|
|
|
/**
 * Extracts every http(s) URL found in a piece of text.
 *
 * A URL runs until whitespace or one of `)`, `"`, `'`, `>`. Sentence
 * punctuation that directly follows a URL (`.`, `,`, `;`, `:`, `!`, `?`) is
 * stripped, so prose such as "see https://example.com." does not produce a
 * URL ending in a dot that would then be reported as broken.
 *
 * @param {string} text - Text to scan, e.g. one line of a Markdown file.
 * @returns {string[]} URLs in order of appearance; empty when none are found.
 */
function extractLinksFromText(text) {
  const urlPattern = /\bhttps?:\/\/[^\s)"'>]+/g;
  const trailingPunctuation = /[.,;:!?]+$/;
  const schemeOnly = /^https?:\/\/$/;

  return (text.match(urlPattern) || [])
    .map((url) => url.replace(trailingPunctuation, ""))
    // Drop matches that consisted of nothing but punctuation after the scheme.
    .filter((url) => !schemeOnly.test(url));
}
|
|
|
|
/**
 * Checks one external URL and records the outcome.
 *
 * A fresh cache entry is reused instead of fetching the page again, but its
 * status is still evaluated, so a link cached as broken keeps showing up in
 * the report on later runs and for every file that references it. Otherwise
 * the page is fetched with `scrapePage` and the result is stored in `cache`.
 *
 * A link counts as broken when no HTTP status is available or the status is
 * 400 or above. Broken links are appended to `BAD_LINKS`, and a progress
 * marker is written to stdout for every link.
 *
 * @param {string} file - Path of the file that contains the link.
 * @param {number|string} line - Line number of the link, or a placeholder.
 * @param {string} url - URL to check.
 * @returns {Promise<void>}
 */
async function checkLink(file, line, url) {
  let status;

  if (isCacheValid(cache[url])) {
    status = cache[url].status ?? null;
  } else {
    try {
      const meta = await scrapePage(url, null, { screenshot: false });
      status = meta?.httpStatus || null;
    } catch (err) {
      // NOTE(review): assumes a rejected scrapePage means the page could not
      // be loaded, so it is recorded as "no status" rather than aborting the
      // whole scan. Confirm against scrapePage's error contract.
      status = null;
    }
    cache[url] = {
      status,
      checked: new Date().toISOString(),
    };
  }

  if (!status || status >= 400) {
    const bundle = path.relative(SITE_ROOT, file);
    BAD_LINKS.push({ bundle, url, line, status });
    process.stdout.write("❌");
  } else {
    process.stdout.write("✔");
  }
}
|
|
|
|
/**
 * Streams a Markdown file line by line and checks every URL found in it.
 *
 * Links are checked one at a time, in order of appearance, and each check is
 * given the 1-based number of the line the link was found on.
 *
 * @param {string} filePath - Path of the Markdown file to scan.
 * @returns {Promise<void>}
 */
async function processMarkdown(filePath) {
  const lineReader = readline.createInterface({
    input: fs.createReadStream(filePath),
    crlfDelay: Infinity,
  });

  let lineNumber = 1;
  for await (const text of lineReader) {
    for (const url of extractLinksFromText(text)) {
      await checkLink(filePath, lineNumber, url);
    }
    lineNumber += 1;
  }
}
|
|
|
|
/**
 * Collects every external-link string found anywhere inside a parsed value.
 *
 * Arrays and objects are traversed depth-first in iteration order; only
 * values are inspected, never object keys.
 *
 * @param {*} obj - Parsed value (scalar, array or object) to search.
 * @param {string[]} [links=[]] - Accumulator that found links are appended to.
 * @returns {string[]} The same `links` array that was passed in (or created).
 */
function processYamlRecursively(obj, links = []) {
  if (typeof obj === "string") {
    if (isExternalLink(obj)) {
      links.push(obj);
    }
    return links;
  }

  if (Array.isArray(obj)) {
    obj.forEach((item) => {
      processYamlRecursively(item, links);
    });
    return links;
  }

  if (obj !== null && typeof obj === "object") {
    for (const key in obj) {
      processYamlRecursively(obj[key], links);
    }
  }
  return links;
}
|
|
|
|
/**
 * Loads a YAML file and checks every external link found in its values.
 *
 * Failing to read or parse the file is logged and ends processing of that
 * file. A failure while checking one link is logged separately, with the
 * underlying error message, and the remaining links are still checked, so a
 * link problem is never reported as a YAML parse error.
 *
 * Positions inside the YAML document are not tracked, so every link is
 * reported with the placeholder line `"?"`.
 *
 * @param {string} filePath - Path of the YAML file to scan.
 * @returns {Promise<void>}
 */
async function processYaml(filePath) {
  let links;
  try {
    // NOTE(review): with js-yaml < 4, `load` can construct arbitrary JS
    // types. Confirm the installed version if these files are not trusted.
    const doc = yaml.load(fs.readFileSync(filePath, "utf8"));
    links = processYamlRecursively(doc);
  } catch (e) {
    console.error(`Failed to parse YAML file: ${filePath} (${e.message})`);
    return;
  }

  for (const link of links) {
    try {
      await checkLink(filePath, "?", link);
    } catch (e) {
      console.error(`Failed to check link ${link} in ${filePath}: ${e.message}`);
    }
  }
}
|
|
|
|
/**
 * Recursively lists the files under a directory that have one of the given
 * extensions.
 *
 * @param {string} dir - Directory to search.
 * @param {string[]} exts - Extensions to keep, including the leading dot;
 *   compared exactly against `path.extname` of each entry.
 * @returns {string[]} Resolved paths of the matching entries.
 */
function walk(dir, exts) {
  const found = [];

  for (const entry of fs.readdirSync(dir)) {
    const entryPath = path.resolve(dir, entry);

    if (fs.statSync(entryPath).isDirectory()) {
      for (const nested of walk(entryPath, exts)) {
        found.push(nested);
      }
      continue;
    }

    if (exts.includes(path.extname(entryPath))) {
      found.push(entryPath);
    }
  }

  return found;
}
|
|
|
|
// Entry point: scan all Markdown files under CONTENT_DIR and all YAML files
// under DATA_DIR, persist the link cache, then print the broken-link report.
(async () => {
  const mdFiles = walk(CONTENT_DIR, [".md"]);
  const yamlFiles = walk(DATA_DIR, [".yaml", ".yml"]);
  console.log(`Scanning ${mdFiles.length} Markdown and ${yamlFiles.length} YAML files...`);

  try {
    for (const file of mdFiles) {
      await processMarkdown(file);
    }
    for (const file of yamlFiles) {
      await processYaml(file);
    }
  } finally {
    // Persist whatever has been checked so far, even when a scan step threw,
    // so completed checks are not repeated on the next run.
    fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
  }

  console.log("\n\n=== Broken External Links Report ===");
  if (BAD_LINKS.length === 0) {
    console.log("✅ No broken external links found.");
  } else {
    console.table(BAD_LINKS);
  }
})().catch((err) => {
  // Do not leave the promise floating: report the failure and keep a
  // non-zero exit code for callers such as CI.
  console.error(err);
  process.exitCode = 1;
});
|