const fs = require("fs"); const readline = require("readline"); function trimUnbalancedTrailing(value, openChar, closeChar) { let result = value; while (result.endsWith(closeChar)) { const openCount = (result.match(new RegExp(`\\${openChar}`, "g")) || []).length; const closeCount = (result.match(new RegExp(`\\${closeChar}`, "g")) || []).length; if (closeCount > openCount) { result = result.slice(0, -1); } else { break; } } return result; } function stripTrailingPunctuation(value) { let result = value; while (/[.,;:!?'"\u2018\u2019\u201C\u201D]+$/.test(result)) { result = result.slice(0, -1); } return result; } function sanitizeUrlCandidate(raw, options = {}) { if (typeof raw !== "string") return null; let candidate = raw.trim(); if (!candidate) return null; if (candidate.startsWith("<") && candidate.endsWith(">")) { candidate = candidate.slice(1, -1).trim(); } candidate = stripTrailingPunctuation(candidate); if (!options.keepTrailingParens) { candidate = trimUnbalancedTrailing(candidate, "(", ")"); } else if (candidate.endsWith(")")) { const openCount = (candidate.match(/\(/g) || []).length; const closeCount = (candidate.match(/\)/g) || []).length; if (closeCount > openCount) { candidate = trimUnbalancedTrailing(candidate, "(", ")"); } } candidate = trimUnbalancedTrailing(candidate, "[", "]"); candidate = trimUnbalancedTrailing(candidate, "{", "}"); candidate = stripTrailingPunctuation(candidate); candidate = candidate.replace(/[)]+$/g, (suffix) => { const toTrim = !options.keepTrailingParens ? suffix.length : Math.max(0, suffix.length - 1); return ")".repeat(suffix.length - toTrim); }); candidate = candidate.replace(/[*_]+$/, ""); candidate = candidate.replace(/\[\^[^\]]*\]$/, ""); candidate = stripTrailingPunctuation(candidate); if (!options.keepTrailingParens) { candidate = trimUnbalancedTrailing(candidate, "(", ")"); } if ((candidate.match(/\(/g) || []).length > (candidate.match(/\)/g) || []).length) { return null; } if ((candidate.match(/\[/g) || []).length > (candidate.match(/]/g) || []).length) { return null; } if ((candidate.match(/{/g) || []).length > (candidate.match(/}/g) || []).length) { return null; } return candidate || null; } function findMatchingPair(text, startIndex, openChar, closeChar) { let depth = 0; for (let i = startIndex; i < text.length; i++) { const ch = text[i]; if (ch === "\\") { i++; continue; } if (ch === openChar) { depth++; } else if (ch === closeChar) { depth--; if (depth === 0) { return i; } } } return -1; } function parseLinkDestination(raw) { if (typeof raw !== "string") return null; let candidate = raw.trim(); if (!candidate) return null; if (candidate.startsWith("<")) { const closeIndex = candidate.indexOf(">"); if (closeIndex > 0) { return candidate.slice(1, closeIndex).trim(); } } let result = ""; let escaping = false; let parenDepth = 0; for (let i = 0; i < candidate.length; i++) { const ch = candidate[i]; if (escaping) { result += ch; escaping = false; continue; } if (ch === "\\") { escaping = true; continue; } if (ch === "(") { parenDepth++; } else if (ch === ")" && parenDepth > 0) { parenDepth--; } else if (/\s/.test(ch) && parenDepth === 0) { break; } result += ch; } return result; } function extractMarkdownLinkTokens(text) { const tokens = []; for (let i = 0; i < text.length; i++) { if (text[i] === "!") { if (text[i + 1] !== "[") continue; i += 1; } if (text[i] !== "[") continue; const closeBracket = findMatchingPair(text, i, "[", "]"); if (closeBracket === -1) continue; let pointer = closeBracket + 1; while (pointer < text.length && /\s/.test(text[pointer])) pointer++; if (pointer >= text.length || text[pointer] !== "(") { i = closeBracket; continue; } const openParen = pointer; const closeParen = findMatchingPair(text, openParen, "(", ")"); if (closeParen === -1) { break; } const rawDestination = text.slice(openParen + 1, closeParen); const candidate = parseLinkDestination(rawDestination); if (candidate) { const startOffset = rawDestination.indexOf(candidate); if (startOffset > -1) { tokens.push({ url: candidate, start: openParen + 1 + startOffset, end: openParen + 1 + startOffset + candidate.length, }); } else { tokens.push({ url: candidate, start: openParen + 1, end: closeParen, }); } } i = closeParen; } return tokens; } function extractMarkdownDestinations(text) { return extractMarkdownLinkTokens(text).map((token) => token.url); } function isExternalLink(link) { return typeof link === "string" && link.includes("://"); } function stripMarkdownInlineCode(text) { if (typeof text !== "string" || !text.includes("`")) { return text; } let result = ""; let index = 0; while (index < text.length) { if (text[index] !== "`") { result += text[index]; index += 1; continue; } let fenceLength = 1; while (index + fenceLength < text.length && text[index + fenceLength] === "`") { fenceLength += 1; } const fence = "`".repeat(fenceLength); const closingIndex = text.indexOf(fence, index + fenceLength); if (closingIndex === -1) { result += text.slice(index, index + fenceLength); index += fenceLength; continue; } const spanLength = closingIndex + fenceLength - index; result += " ".repeat(spanLength); index = closingIndex + fenceLength; } return result; } function parseMarkdownFence(line) { if (typeof line !== "string") { return null; } const match = line.match(/^[ ]{0,3}([`~]{3,})/); if (!match) { return null; } return { marker: match[1][0], length: match[1].length, }; } function isFenceClosingLine(line, activeFence) { if (!activeFence || typeof line !== "string") { return false; } const match = line.match(/^[ ]{0,3}([`~]{3,})[ \t]*$/); if (!match) { return false; } if (match[1][0] !== activeFence.marker) { return false; } return match[1].length >= activeFence.length; } function isIndentedCodeLine(line) { if (typeof line !== "string" || !line) { return false; } return line.startsWith(" ") || line.startsWith("\t"); } function extractLinksFromText(text) { if (typeof text !== "string" || !text.includes("http")) { return []; } const strippedText = stripMarkdownInlineCode(text); if (typeof strippedText !== "string" || !strippedText.includes("http")) { return []; } const results = []; const seen = new Set(); const markdownLinkTokens = extractMarkdownLinkTokens(strippedText); function addCandidate(candidate, options = {}) { const sanitized = sanitizeUrlCandidate(candidate, options); if (!sanitized) return; if (!isExternalLink(sanitized)) return; if (seen.has(sanitized)) return; seen.add(sanitized); results.push(sanitized); } for (const token of markdownLinkTokens) { addCandidate(token.url, { keepTrailingParens: true }); } const angleRegex = /<\s*(https?:\/\/[^>\s]+)\s*>/gi; let match; while ((match = angleRegex.exec(strippedText)) !== null) { addCandidate(match[1]); } const autoRegex = /https?:\/\/[^\s<>"`]+/gi; while ((match = autoRegex.exec(strippedText)) !== null) { let overlapsMarkdownDestination = false; for (const token of markdownLinkTokens) { if (match.index >= token.start && match.index < token.end) { overlapsMarkdownDestination = true; break; } } if (overlapsMarkdownDestination) { continue; } addCandidate(match[0]); } return results; } async function collectMarkdownLinksFromStream(stream) { const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); const results = []; let lineNumber = 0; let inFrontMatter = false; let activeFence = null; let inIndentedCodeBlock = false; let previousLineBlank = true; try { for await (const line of rl) { lineNumber++; const trimmed = line.trim(); // Skip YAML front matter entirely; only scan Markdown content if (lineNumber === 1 && trimmed === "---") { inFrontMatter = true; continue; } if (inFrontMatter) { if (trimmed === "---") { inFrontMatter = false; } continue; } if (activeFence) { if (isFenceClosingLine(line, activeFence)) { activeFence = null; } previousLineBlank = trimmed === ""; continue; } const openingFence = parseMarkdownFence(line); if (openingFence) { activeFence = openingFence; previousLineBlank = trimmed === ""; continue; } if (inIndentedCodeBlock) { if (trimmed === "") { previousLineBlank = true; continue; } if (isIndentedCodeLine(line)) { previousLineBlank = false; continue; } inIndentedCodeBlock = false; } if (previousLineBlank && isIndentedCodeLine(line)) { inIndentedCodeBlock = true; previousLineBlank = false; continue; } for (const url of extractLinksFromText(line)) { results.push({ url, line: lineNumber }); } previousLineBlank = trimmed === ""; } } finally { rl.close(); if (typeof stream.close === "function") { stream.close(); } } return results; } async function collectMarkdownLinksFromFile(filePath) { const stream = fs.createReadStream(filePath, { encoding: "utf8" }); try { return await collectMarkdownLinksFromStream(stream); } catch (error) { stream.destroy(); throw error; } } module.exports = { collectMarkdownLinksFromFile, collectMarkdownLinksFromStream, extractLinksFromText, sanitizeUrlCandidate, };