const fs = require("fs"); const readline = require("readline"); function trimUnbalancedTrailing(value, openChar, closeChar) { let result = value; while (result.endsWith(closeChar)) { const openCount = (result.match(new RegExp(`\\${openChar}`, "g")) || []).length; const closeCount = (result.match(new RegExp(`\\${closeChar}`, "g")) || []).length; if (closeCount > openCount) { result = result.slice(0, -1); } else { break; } } return result; } function sanitizeUrlCandidate(raw, options = {}) { if (typeof raw !== "string") return null; let candidate = raw.trim(); if (!candidate) return null; if (candidate.startsWith("<") && candidate.endsWith(">")) { candidate = candidate.slice(1, -1).trim(); } while (/[.,;:!?'"\u2018\u2019\u201C\u201D]+$/.test(candidate)) { candidate = candidate.slice(0, -1); } if (!options.keepTrailingParens) { candidate = trimUnbalancedTrailing(candidate, "(", ")"); } else if (candidate.endsWith(")")) { const openCount = (candidate.match(/\(/g) || []).length; const closeCount = (candidate.match(/\)/g) || []).length; if (closeCount > openCount) { candidate = trimUnbalancedTrailing(candidate, "(", ")"); } } candidate = trimUnbalancedTrailing(candidate, "[", "]"); candidate = trimUnbalancedTrailing(candidate, "{", "}"); candidate = candidate.replace(/[*_]+$/, ""); candidate = candidate.replace(/\[\^[^\]]*\]$/, ""); if (!options.keepTrailingParens) { candidate = trimUnbalancedTrailing(candidate, "(", ")"); } if ((candidate.match(/\(/g) || []).length > (candidate.match(/\)/g) || []).length) { return null; } if ((candidate.match(/\[/g) || []).length > (candidate.match(/]/g) || []).length) { return null; } if ((candidate.match(/{/g) || []).length > (candidate.match(/}/g) || []).length) { return null; } return candidate || null; } function findMatchingPair(text, startIndex, openChar, closeChar) { let depth = 0; for (let i = startIndex; i < text.length; i++) { const ch = text[i]; if (ch === "\\") { i++; continue; } if (ch === openChar) { depth++; } else if (ch === closeChar) { depth--; if (depth === 0) { return i; } } } return -1; } function parseLinkDestination(raw) { if (typeof raw !== "string") return null; let candidate = raw.trim(); if (!candidate) return null; if (candidate.startsWith("<")) { const closeIndex = candidate.indexOf(">"); if (closeIndex > 0) { return candidate.slice(1, closeIndex).trim(); } } let result = ""; let escaping = false; let parenDepth = 0; for (let i = 0; i < candidate.length; i++) { const ch = candidate[i]; if (escaping) { result += ch; escaping = false; continue; } if (ch === "\\") { escaping = true; continue; } if (ch === "(") { parenDepth++; } else if (ch === ")" && parenDepth > 0) { parenDepth--; } else if (/\s/.test(ch) && parenDepth === 0) { break; } result += ch; } return result; } function extractMarkdownDestinations(text) { const urls = []; for (let i = 0; i < text.length; i++) { if (text[i] === "!") { if (text[i + 1] !== "[") continue; i += 1; } if (text[i] !== "[") continue; const closeBracket = findMatchingPair(text, i, "[", "]"); if (closeBracket === -1) continue; let pointer = closeBracket + 1; while (pointer < text.length && /\s/.test(text[pointer])) pointer++; if (pointer >= text.length || text[pointer] !== "(") { i = closeBracket; continue; } const openParen = pointer; const closeParen = findMatchingPair(text, openParen, "(", ")"); if (closeParen === -1) { break; } const rawDestination = text.slice(openParen + 1, closeParen); const candidate = parseLinkDestination(rawDestination); if (candidate) { urls.push(candidate); } i = closeParen; } return urls; } function isExternalLink(link) { return typeof link === "string" && link.includes("://"); } function extractLinksFromText(text) { if (typeof text !== "string" || !text.includes("http")) { return []; } const results = []; const seen = new Set(); function addCandidate(candidate, options = {}) { const sanitized = sanitizeUrlCandidate(candidate, options); if (!sanitized) return; if (!isExternalLink(sanitized)) return; if (seen.has(sanitized)) return; seen.add(sanitized); results.push(sanitized); } for (const url of extractMarkdownDestinations(text)) { addCandidate(url, { keepTrailingParens: true }); } const angleRegex = /<\s*(https?:\/\/[^>\s]+)\s*>/gi; let match; while ((match = angleRegex.exec(text)) !== null) { addCandidate(match[1]); } const autoRegex = /https?:\/\/[^\s<>"`]+/gi; while ((match = autoRegex.exec(text)) !== null) { addCandidate(match[0]); } return results; } async function collectMarkdownLinksFromStream(stream) { const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); const results = []; let lineNumber = 0; let inFrontMatter = false; try { for await (const line of rl) { lineNumber++; const trimmed = line.trim(); if (lineNumber === 1 && trimmed === "---") { inFrontMatter = true; continue; } if (inFrontMatter) { if (trimmed === "---") { inFrontMatter = false; continue; } if (trimmed.startsWith("#")) { continue; } } for (const url of extractLinksFromText(line)) { results.push({ url, line: lineNumber }); } } } finally { rl.close(); if (typeof stream.close === "function") { stream.close(); } } return results; } async function collectMarkdownLinksFromFile(filePath) { const stream = fs.createReadStream(filePath, { encoding: "utf8" }); try { return await collectMarkdownLinksFromStream(stream); } catch (error) { stream.destroy(); throw error; } } module.exports = { collectMarkdownLinksFromFile, collectMarkdownLinksFromStream, extractLinksFromText, sanitizeUrlCandidate, };