/**
 * External link checker.
 *
 * Collects links from Markdown (.md, .markdown) and YAML (.yaml, .yml) files
 * under `<site root>/content`, probes every unique URL with curl (HEAD first,
 * GET as a fallback), caches the outcome in a YAML file and writes a report
 * (Markdown or CSV) listing the links that look broken.
 *
 * Settings are read from the `externalLinks` key of `config.json` next to this
 * script; every setting has a default (see `externalConfig`).
 */
const fs = require("fs");
const path = require("path");
const yaml = require("js-yaml");
const util = require("util");
const { execFile } = require("child_process");
const UserAgent = require("user-agents");
const {
  collectMarkdownLinksFromFile,
  extractLinksFromText,
} = require("./lib/markdown_links");

const SITE_ROOT = path.resolve(__dirname, "..");
const CONFIG_PATH = path.join(__dirname, "config.json");
const MS_PER_DAY = 1000 * 60 * 60 * 24;

let config = {};
if (fs.existsSync(CONFIG_PATH)) {
  try {
    config = JSON.parse(fs.readFileSync(CONFIG_PATH, "utf8"));
  } catch (error) {
    console.warn(
      `Failed to parse ${path.relative(SITE_ROOT, CONFIG_PATH)}. Using defaults. (${error.message})`
    );
  }
}

// Defaults first, then user overrides. `config?.` guards against a config
// file whose JSON content is literally `null`.
const externalConfig = {
  cacheDir: path.join(__dirname, "cache"),
  cacheFile: "external_links.yaml",
  hostDelayMs: 2000,
  retryDelayMs: 5000,
  requestTimeoutSeconds: 5,
  cacheTtlSuccessDays: 7,
  cacheTtlClientErrorDays: 0,
  outputFormat: "markdown",
  outputFile: path.join(__dirname, "cache", "external_links_report.md"),
  userAgent: null,
  enableCookies: true,
  cookieJar: path.join(__dirname, "cache", "curl_cookies.txt"),
  ...(config?.externalLinks || {}),
};

const CONTENT_DIR = path.join(SITE_ROOT, "content");

// Relative paths from the configuration are resolved against the site root.
const CACHE_DIR = path.isAbsolute(externalConfig.cacheDir)
  ? externalConfig.cacheDir
  : path.resolve(SITE_ROOT, externalConfig.cacheDir);
const CACHE_PATH = path.isAbsolute(externalConfig.cacheFile)
  ? externalConfig.cacheFile
  : path.join(CACHE_DIR, externalConfig.cacheFile);
const OUTPUT_FILE = path.isAbsolute(externalConfig.outputFile)
  ? externalConfig.outputFile
  : path.resolve(SITE_ROOT, externalConfig.outputFile);
const COOKIE_JAR = externalConfig.cookieJar
  ? path.isAbsolute(externalConfig.cookieJar)
    ? externalConfig.cookieJar
    : path.resolve(SITE_ROOT, externalConfig.cookieJar)
  : path.join(CACHE_DIR, "curl_cookies.txt");

// Non-numeric settings degrade to 0 (no TTL / no delay / no timeout).
const CACHE_TTL_SUCCESS_DAYS = Number(externalConfig.cacheTtlSuccessDays) || 0;
const CACHE_TTL_CLIENT_ERROR_DAYS = Number(externalConfig.cacheTtlClientErrorDays) || 0;
const HOST_DELAY_MS = Number(externalConfig.hostDelayMs) || 0;
const RETRY_DELAY_MS = Number(externalConfig.retryDelayMs) || 0;
const REQUEST_TIMEOUT_SECONDS = Number(externalConfig.requestTimeoutSeconds) || 0;

const maxConcurrentConfig = Number(externalConfig.maxConcurrentHosts);
const MAX_CONCURRENT_HOSTS =
  Number.isFinite(maxConcurrentConfig) && maxConcurrentConfig > 0
    ? maxConcurrentConfig
    : 4;

// A configured user agent wins; otherwise a generated one is used for the run.
const DEFAULT_USER_AGENT =
  typeof externalConfig.userAgent === "string" && externalConfig.userAgent.trim()
    ? externalConfig.userAgent.trim()
    : new UserAgent().toString();
const ENABLE_COOKIES = externalConfig.enableCookies !== false;
const PROGRESS_FILE = path.join(__dirname, "cache", "external_links_progress.csv");

const execFileAsync = util.promisify(execFile);

fs.mkdirSync(CACHE_DIR, { recursive: true });
if (ENABLE_COOKIES) {
  fs.mkdirSync(path.dirname(COOKIE_JAR), { recursive: true });
  if (!fs.existsSync(COOKIE_JAR)) {
    // Create an empty jar so curl's --cookie option has a file to read.
    fs.closeSync(fs.openSync(COOKIE_JAR, "a"));
  }
}

// Start every run with a fresh progress file (best effort).
try {
  if (fs.existsSync(PROGRESS_FILE)) {
    fs.unlinkSync(PROGRESS_FILE);
  }
} catch (error) {
  console.warn(`Unable to remove existing progress file: ${error.message}`);
}

/**
 * Loads the URL -> result cache from disk.
 *
 * An unreadable, unparsable or non-mapping cache file is reported with a
 * warning and replaced by an empty cache instead of aborting the run.
 *
 * @param {string} cachePath - Absolute path of the YAML cache file.
 * @returns {Object} Cached results keyed by URL.
 */
function loadCache(cachePath) {
  if (!fs.existsSync(cachePath)) return {};
  const label = path.relative(SITE_ROOT, cachePath);
  try {
    const loaded = yaml.load(fs.readFileSync(cachePath, "utf8"));
    if (loaded == null) return {};
    if (typeof loaded === "object" && !Array.isArray(loaded)) return loaded;
    console.warn(`Ignoring ${label}: expected a mapping of URLs. Starting with an empty cache.`);
  } catch (error) {
    console.warn(`Failed to read ${label}. Starting with an empty cache. (${error.message})`);
  }
  return {};
}

const cache = loadCache(CACHE_PATH);
let cacheDirty = false;
const now = new Date();
const BAD_LINKS = [];
const lastHostChecks = new Map(); // host -> timestamp (ms) of the last request
const runResults = new Map(); // url -> result obtained during this run

/**
 * Rewrites the single-line progress indicator on stdout.
 *
 * @param {number} processed - Number of URLs handled so far.
 * @param {number} total - Total number of URLs to handle.
 */
function updateProgress(processed, total) {
  process.stdout.write(`\rURL ${processed}/${total}`);
}

/**
 * Normalises a value to an HTTP status code.
 *
 * curl's `%{http_code}` is `000` when no HTTP response was received (DNS
 * failure, refused connection, ...). Such values must not be mistaken for a
 * successful (< 400) response, so anything below 100 is treated as "no status".
 *
 * @param {*} value - Candidate status.
 * @returns {number|null} Integer status >= 100, or null.
 */
function toHttpStatus(value) {
  return Number.isInteger(value) && value >= 100 ? value : null;
}

/**
 * Tells whether a cached result is still fresh enough to be reused.
 *
 * Responses below 400 use the "success" TTL, 4xx responses the "client error"
 * TTL; everything else (5xx, no status) is never reused.
 *
 * @param {Object|undefined} entry - Cached result (`status`, `checked`, ...).
 * @returns {boolean} True when the entry can be reused.
 */
function isCacheValid(entry) {
  if (!entry?.checked) return false;

  const status = toHttpStatus(entry.status);
  let ttlDays = 0;
  if (status !== null && status < 400) {
    ttlDays = CACHE_TTL_SUCCESS_DAYS;
  } else if (status !== null && status < 500) {
    ttlDays = CACHE_TTL_CLIENT_ERROR_DAYS;
  }
  if (ttlDays <= 0) return false;

  // An unparsable date yields NaN, which fails the comparison.
  const ageDays = (now - new Date(entry.checked)) / MS_PER_DAY;
  return ageDays < ttlDays;
}

/**
 * Records every link found in a Markdown file.
 *
 * @param {string} filePath - Markdown file to scan.
 * @param {Map<string, Object>} occurrencesMap - Accumulator keyed by URL.
 * @returns {Promise<void>}
 */
async function collectMarkdownLinks(filePath, occurrencesMap) {
  const entries = await collectMarkdownLinksFromFile(filePath);
  for (const { url, line } of entries) {
    recordOccurrence(occurrencesMap, filePath, line, url);
  }
}

/**
 * Adds a (file, line) occurrence for a URL, ignoring exact duplicates.
 *
 * @param {Map<string, Object>} occurrencesMap - Accumulator keyed by URL.
 * @param {string} filePath - File containing the link.
 * @param {number|string} lineNumber - 1-based line, or "?" when unknown.
 * @param {string} url - The link itself.
 */
function recordOccurrence(occurrencesMap, filePath, lineNumber, url) {
  if (!occurrencesMap.has(url)) {
    occurrencesMap.set(url, { url, occurrences: [] });
  }
  const entry = occurrencesMap.get(url);
  const alreadyRecorded = entry.occurrences.some(
    (item) => item.file === filePath && item.line === lineNumber
  );
  if (!alreadyRecorded) {
    entry.occurrences.push({ file: filePath, line: lineNumber });
  }
}

/**
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>} Resolves after the delay.
 */
function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Waits until at least HOST_DELAY_MS has elapsed since the last request
 * recorded for the host.
 *
 * @param {string|null} host - Host name; falsy values are ignored.
 * @returns {Promise<void>}
 */
async function applyHostDelay(host) {
  if (!host) return;
  const last = lastHostChecks.get(host);
  if (!last) return;
  const waitTime = HOST_DELAY_MS - (Date.now() - last);
  if (waitTime > 0) {
    await delay(waitTime);
  }
}

/**
 * Remembers that a request to the host has just completed.
 *
 * @param {string|null} host - Host name; falsy values are ignored.
 */
function recordHostCheck(host) {
  if (host) {
    lastHostChecks.set(host, Date.now());
  }
}

/**
 * @param {string} url - URL to inspect.
 * @returns {string|null} Host name, or null when the URL cannot be parsed.
 */
function extractHost(url) {
  try {
    return new URL(url).hostname;
  } catch (_) {
    return null;
  }
}

/** Writes the cache to disk when it has unsaved changes. */
function persistCache() {
  if (!cacheDirty) return;
  ensureDirectoryExists(CACHE_PATH);
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
  cacheDirty = false;
}

/**
 * Formats occurrences as `relative/path:line` items joined with "; ".
 *
 * @param {Array<{file: string, line: number|string}>} occurrences
 * @returns {string}
 */
function formatLocations(occurrences) {
  return occurrences
    .map(({ file, line }) => `${path.relative(SITE_ROOT, file)}:${line}`)
    .join("; ");
}

/**
 * Quotes a CSV field when it contains a quote, comma or line break.
 *
 * @param {*} value - Field value; converted with String().
 * @returns {string}
 */
function escapeCsvField(value) {
  const stringValue = String(value);
  if (/[",\r\n]/.test(stringValue)) {
    return `"${stringValue.replace(/"/g, '""')}"`;
  }
  return stringValue;
}

/**
 * Appends one row to the progress CSV. The status column is left empty for
 * numeric statuses below 400.
 *
 * @param {string} url - Checked URL.
 * @param {Array<{file: string, line: number|string}>} occurrences
 * @param {number|string|null} status - HTTP status or a label such as "timeout".
 */
function appendProgress(url, occurrences, status) {
  const isOk = typeof status === "number" && status < 400;
  const statusText = isOk ? "" : status ?? "";
  const line = [url, formatLocations(occurrences), statusText]
    .map(escapeCsvField)
    .join(",");
  fs.appendFileSync(PROGRESS_FILE, `${line}\n`);
}

/**
 * Groups entries by host so that each host is processed sequentially.
 * Entries whose URL has no parsable host get a group of their own.
 *
 * @param {Array<{url: string}>} entries
 * @returns {Array<{host: string|null, entries: Array}>}
 */
function groupEntriesByHost(entries) {
  const groups = new Map();
  for (const entry of entries) {
    const host = extractHost(entry.url);
    const key = host || `__invalid__:${entry.url}`;
    if (!groups.has(key)) {
      groups.set(key, { host, entries: [] });
    }
    groups.get(key).entries.push(entry);
  }
  return Array.from(groups.values());
}

/**
 * Runs `worker` over `items` with at most `concurrency` calls in flight.
 *
 * @param {Array} items - Work items.
 * @param {function(*): Promise<*>} worker - Called once per item.
 * @param {number} concurrency - Maximum number of simultaneous workers.
 * @returns {Promise<Array>} Results in item order; rejects if a worker rejects.
 */
async function runWithConcurrency(items, worker, concurrency) {
  const executing = new Set();
  const promises = [];
  for (const item of items) {
    const promise = Promise.resolve().then(() => worker(item));
    promises.push(promise);
    executing.add(promise);
    const clean = () => executing.delete(promise);
    // Two-argument form: the derived promise never rejects.
    promise.then(clean, clean);
    if (executing.size >= concurrency) {
      await Promise.race(executing);
    }
  }
  return Promise.all(promises);
}

/**
 * Builds the curl command line for one probe.
 *
 * @param {string} url - URL to probe.
 * @param {string} method - Upper-case HTTP method ("HEAD" or "GET").
 * @returns {string[]} Arguments for the curl executable.
 */
function buildCurlArgs(url, method) {
  const args = [
    "--silent",
    "--location",
    "--fail",
    // Links may contain [] or {}; they are literal, not curl glob patterns.
    "--globoff",
    "--max-time",
    `${REQUEST_TIMEOUT_SECONDS}`,
    "--output",
    "/dev/null",
    "--write-out",
    "%{http_code}",
    "--user-agent",
    DEFAULT_USER_AGENT,
  ];
  if (method === "HEAD") {
    // `--request HEAD` only renames the method and leaves curl waiting for a
    // body that never comes; `--head` performs a real HEAD request.
    args.push("--head");
  } else {
    args.push("--request", method);
  }
  if (ENABLE_COOKIES) {
    args.push("--cookie", COOKIE_JAR, "--cookie-jar", COOKIE_JAR);
  }
  // `--url` keeps a link starting with "-" from being parsed as an option.
  args.push("--url", url);
  return args;
}

/**
 * Parses curl's `%{http_code}` output.
 *
 * @param {*} raw - Raw stdout (string, Buffer or undefined).
 * @returns {number|null} HTTP status, or null when none was received.
 */
function parseCurlStatus(raw) {
  const text = raw == null ? "" : raw.toString().trim();
  return toHttpStatus(Number.parseInt(text, 10));
}

/**
 * Probes a URL with curl.
 *
 * Never rejects: a failing curl process is turned into a result object. With
 * `--fail`, HTTP errors make curl exit non-zero while still printing the
 * status code, which is read back from the error's stdout.
 *
 * @param {string} url - URL to probe.
 * @param {string} method - HTTP method ("HEAD" or "GET").
 * @returns {Promise<{status: number|null, errorType: string|null, method: string}>}
 */
async function curlRequest(url, method) {
  const verb = method.toUpperCase();
  try {
    const { stdout } = await execFileAsync("curl", buildCurlArgs(url, verb));
    return { status: parseCurlStatus(stdout), errorType: null, method: verb };
  } catch (error) {
    // curl exit code 28 means the operation timed out.
    const errorType = Number(error?.code) === 28 ? "timeout" : null;
    return { status: parseCurlStatus(error?.stdout), errorType, method: verb };
  }
}

/**
 * @param {{status: number|null, errorType: string|null}} result - HEAD result.
 * @returns {boolean} True when the HEAD probe failed and GET should be tried.
 */
function shouldRetryWithGet(result) {
  if (result.errorType) return true;
  if (result.status === null) return true;
  return result.status >= 400;
}

/**
 * Returns the check result for a URL, from this run, from the cache, or from
 * a fresh probe (HEAD, then GET after RETRY_DELAY_MS when HEAD fails). Fresh
 * results are stored in the cache and persisted immediately.
 *
 * @param {string} url - URL to check.
 * @returns {Promise<{status: number|null, errorType: string|null, method: string, checked: string}>}
 */
async function checkLink(url) {
  const known = runResults.get(url);
  if (known) return known;

  let info;
  const cachedInfo = cache[url];
  if (isCacheValid(cachedInfo)) {
    info = cachedInfo;
  } else {
    const host = extractHost(url);
    await applyHostDelay(host);
    let result = await curlRequest(url, "HEAD");
    recordHostCheck(host);

    if (shouldRetryWithGet(result)) {
      await delay(RETRY_DELAY_MS);
      await applyHostDelay(host);
      result = await curlRequest(url, "GET");
      recordHostCheck(host);
    }

    info = {
      status: result.status ?? null,
      errorType: result.errorType || null,
      method: result.method,
      checked: new Date().toISOString(),
    };
    cache[url] = info;
    cacheDirty = true;
    persistCache();
  }

  runResults.set(url, info);
  return info;
}

/**
 * Collects the links found in every string value of a parsed YAML document.
 * Mapping keys are not scanned.
 *
 * @param {*} obj - Parsed YAML value.
 * @param {Set<string>} [links] - Accumulator.
 * @returns {Set<string>} The accumulator.
 */
function processYamlRecursively(obj, links = new Set()) {
  if (typeof obj === "string") {
    for (const link of extractLinksFromText(obj)) {
      links.add(link);
    }
  } else if (Array.isArray(obj)) {
    for (const item of obj) processYamlRecursively(item, links);
  } else if (typeof obj === "object" && obj !== null) {
    for (const value of Object.values(obj)) processYamlRecursively(value, links);
  }
  return links;
}

/**
 * Removes a trailing `# comment` from a single YAML line.
 *
 * A `#` only starts a comment at the beginning of the line or after white
 * space, and never inside a quoted string, so URL fragments such as
 * `https://example.com/#top` are preserved.
 *
 * @param {string} line - Raw YAML line.
 * @returns {string} The line without its inline comment.
 */
function stripYamlInlineComment(line) {
  let inSingle = false;
  let inDouble = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (inDouble) {
      if (ch === "\\") {
        i++; // skip the escaped character (including \" and \\)
      } else if (ch === '"') {
        inDouble = false;
      }
    } else if (inSingle) {
      if (ch === "'") {
        if (line[i + 1] === "'") {
          i++; // '' is an escaped quote inside a single-quoted scalar
        } else {
          inSingle = false;
        }
      }
    } else if (ch === '"') {
      inDouble = true;
    } else if (ch === "'") {
      inSingle = true;
    } else if (ch === "#" && (i === 0 || /\s/.test(line[i - 1]))) {
      return line.slice(0, i);
    }
  }
  return line;
}

/**
 * @param {string} line - Raw YAML line.
 * @returns {boolean} True when the line's first non-blank character is `#`.
 */
function isYamlCommentLine(line) {
  return line.trim().startsWith("#");
}

/**
 * @param {string} line - Raw YAML line.
 * @returns {boolean} True when the line ends with a `key: |` / `key: >`
 *   style block scalar header.
 */
function isBlockScalarIndicator(line) {
  const cleaned = stripYamlInlineComment(line).trim();
  return /:\s*[>|][0-9+-]*\s*$/.test(cleaned);
}

/**
 * Records every link found in a YAML file.
 *
 * The set of links comes from the parsed document; the raw text is then
 * scanned line by line only to attach line numbers. Links that cannot be
 * located in the raw text are recorded with the line "?".
 *
 * @param {string} filePath - YAML file to scan.
 * @param {Map<string, Object>} occurrencesMap - Accumulator keyed by URL.
 * @returns {Promise<void>}
 */
async function collectYamlLinks(filePath, occurrencesMap) {
  let source;
  let linkSet;
  try {
    source = fs.readFileSync(filePath, "utf8");
    linkSet = processYamlRecursively(yaml.load(source));
  } catch (error) {
    console.error(`Failed to parse YAML file: ${filePath} (${error.message})`);
    return;
  }
  if (linkSet.size === 0) return;

  const located = new Set();
  const recordLinksIn = (text, lineNumber) => {
    for (const link of extractLinksFromText(text)) {
      if (!linkSet.has(link)) continue;
      located.add(link);
      // recordOccurrence already ignores duplicate (file, line) pairs.
      recordOccurrence(occurrencesMap, filePath, lineNumber, link);
    }
  };

  const rawLines = source.split(/\r?\n/);
  let inBlockScalar = false;
  let blockIndent = 0;

  for (let index = 0; index < rawLines.length; index++) {
    const lineNumber = index + 1;
    const line = rawLines[index];
    const indent = line.match(/^\s*/)[0].length;

    if (inBlockScalar) {
      // Blank lines belong to the scalar whatever their indentation, and
      // every sufficiently indented line is literal content (a leading `#`
      // is not a comment there).
      if (line.trim() === "" || indent >= blockIndent) {
        recordLinksIn(line, lineNumber);
        continue;
      }
      inBlockScalar = false;
    }

    if (isBlockScalarIndicator(line)) {
      inBlockScalar = true;
      blockIndent = indent + 1;
    }
    if (isYamlCommentLine(line)) continue;

    const withoutComment = stripYamlInlineComment(line);
    if (!withoutComment.trim()) continue;
    recordLinksIn(withoutComment, lineNumber);
  }

  for (const link of linkSet) {
    if (!located.has(link)) {
      recordOccurrence(occurrencesMap, filePath, "?", link);
    }
  }
}

/**
 * Recursively lists the files below `dir` whose extension is in `exts`.
 *
 * @param {string} dir - Directory to walk.
 * @param {string[]} exts - Extensions including the dot, e.g. [".md"].
 * @returns {string[]} Absolute file paths.
 */
function walk(dir, exts) {
  const results = [];
  for (const name of fs.readdirSync(dir)) {
    const fullPath = path.resolve(dir, name);
    if (fs.statSync(fullPath).isDirectory()) {
      results.push(...walk(fullPath, exts));
    } else if (exts.includes(path.extname(fullPath))) {
      results.push(fullPath);
    }
  }
  return results;
}

/**
 * Creates the parent directory of `targetFile` when it is missing.
 *
 * @param {string} targetFile - Path of a file about to be written.
 */
function ensureDirectoryExists(targetFile) {
  fs.mkdirSync(path.dirname(targetFile), { recursive: true });
}

/**
 * Makes a value safe for a Markdown table cell: escapes `|` and replaces
 * line breaks with spaces.
 *
 * @param {*} value
 * @returns {string}
 */
function escapeMarkdownCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

/**
 * Renders the broken links as a Markdown document.
 *
 * @param {Array<{url: string, location: string, status: number|string}>} entries
 * @returns {string}
 */
function generateMarkdownReport(entries) {
  const header = [
    "# Broken External Links",
    "",
    `Generated: ${new Date().toISOString()}`,
    "",
  ];
  if (entries.length === 0) {
    return header.concat(["No broken external links found."]).join("\n");
  }
  const rows = entries.map((entry) => {
    const url = escapeMarkdownCell(entry.url);
    const location = escapeMarkdownCell(entry.location);
    const status = escapeMarkdownCell(entry.status);
    return `| ${url} | ${location} | ${status} |`;
  });
  return header
    .concat(["| URL | Location | Status |", "| --- | --- | --- |", ...rows])
    .join("\n");
}

/**
 * Renders the broken links as CSV with every field quoted.
 *
 * @param {Array<{url: string, location: string, status: number|string}>} entries
 * @returns {string}
 */
function generateCsvReport(entries) {
  const lines = [`"url","location","status"`];
  for (const entry of entries) {
    const line = [entry.url, entry.location, entry.status]
      .map((field) => `"${String(field).replace(/"/g, '""')}"`)
      .join(",");
    lines.push(line);
  }
  return lines.join("\n");
}

/**
 * Writes the report to OUTPUT_FILE in the configured format ("csv", or
 * Markdown for any other value).
 *
 * @param {Array<{url: string, location: string, status: number|string}>} entries
 */
function writeReport(entries) {
  const format = String(externalConfig.outputFormat || "markdown").toLowerCase();
  const content =
    format === "csv" ? generateCsvReport(entries) : generateMarkdownReport(entries);
  ensureDirectoryExists(OUTPUT_FILE);
  fs.writeFileSync(OUTPUT_FILE, content, "utf8");
}

/**
 * Ordering used in the report: 404 first, then other failures, timeouts last;
 * ties are broken by numeric status, then by URL.
 *
 * @param {{url: string, status: number|string}} a
 * @param {{url: string, status: number|string}} b
 * @returns {number}
 */
function compareBadLinks(a, b) {
  const rank = (entry) => {
    if (entry.status === "timeout") return 2;
    return entry.status === 404 ? 0 : 1;
  };
  const diff = rank(a) - rank(b);
  if (diff !== 0) return diff;
  if (typeof a.status === "number" && typeof b.status === "number") {
    return a.status - b.status;
  }
  return a.url.localeCompare(b.url);
}

/**
 * Entry point: collect links, prune the cache, check every URL and write the
 * progress file, the cache and the report.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const occurrencesByUrl = new Map();
  const mdFiles = walk(CONTENT_DIR, [".md", ".markdown"]);
  const yamlFiles = walk(CONTENT_DIR, [".yaml", ".yml"]);

  for (const file of mdFiles) {
    await collectMarkdownLinks(file, occurrencesByUrl);
  }
  for (const file of yamlFiles) {
    await collectYamlLinks(file, occurrencesByUrl);
  }

  const uniqueEntries = Array.from(occurrencesByUrl.values());
  const activeUrls = new Set(uniqueEntries.map((entry) => entry.url));

  // Forget cached results for links that no longer appear in the content.
  for (const url of Object.keys(cache)) {
    if (!activeUrls.has(url)) {
      delete cache[url];
      cacheDirty = true;
    }
  }

  ensureDirectoryExists(PROGRESS_FILE);
  fs.writeFileSync(PROGRESS_FILE, `"url","locations","status"\n`, "utf8");

  const total = uniqueEntries.length;
  if (total === 0) {
    process.stdout.write("No external links found.\n");
    ensureDirectoryExists(CACHE_PATH);
    fs.writeFileSync(CACHE_PATH, yaml.dump(cache));
    writeReport([]);
    return;
  }

  const hostGroups = groupEntriesByHost(uniqueEntries);
  const concurrency = Math.max(1, Math.min(MAX_CONCURRENT_HOSTS, hostGroups.length || 1));
  let processed = 0;

  await runWithConcurrency(
    hostGroups,
    async ({ entries }) => {
      // URLs of one host are checked one after another to honour the host delay.
      for (const entry of entries) {
        const info = await checkLink(entry.url);
        const status = toHttpStatus(info?.status);
        const isTimeout = info?.errorType === "timeout";
        const isBroken = isTimeout || status === null || status >= 400;
        const statusLabel = isTimeout ? "timeout" : status ?? "error";

        if (isBroken) {
          BAD_LINKS.push({
            location: formatLocations(entry.occurrences),
            url: entry.url,
            status: statusLabel,
          });
        }
        appendProgress(entry.url, entry.occurrences, isBroken ? statusLabel : status);
        processed += 1;
        updateProgress(processed, total);
      }
    },
    concurrency
  );

  process.stdout.write("\n");
  ensureDirectoryExists(CACHE_PATH);
  fs.writeFileSync(CACHE_PATH, yaml.dump(cache));

  if (BAD_LINKS.length === 0) {
    writeReport([]);
    console.log(
      `No broken external links detected. Report saved to ${path.relative(
        SITE_ROOT,
        OUTPUT_FILE
      )}.`
    );
    return;
  }

  const sorted = BAD_LINKS.sort(compareBadLinks);
  writeReport(sorted);
  console.log(
    `Found ${sorted.length} broken external link(s). Report saved to ${path.relative(
      SITE_ROOT,
      OUTPUT_FILE
    )}.`
  );
}

main().catch((error) => {
  // Surface unexpected failures explicitly and signal them to the caller.
  console.error(error);
  process.exitCode = 1;
});