401 lines
10 KiB
JavaScript
401 lines
10 KiB
JavaScript
const fs = require("fs");
|
|
const readline = require("readline");
|
|
|
|
/**
 * Drop surplus closing delimiters from the end of a string.
 *
 * While `value` ends with `closeChar` and contains more `closeChar` than
 * `openChar` characters, the last character is removed. Delimiters are
 * compared literally (no regular expressions), so any single character can be
 * used, and each delimiter is counted only once instead of once per removed
 * character.
 *
 * @param {string} value - Text to trim.
 * @param {string} openChar - Single-character opening delimiter, e.g. "(".
 * @param {string} closeChar - Single-character closing delimiter, e.g. ")".
 * @returns {string} `value` without its unbalanced trailing closers.
 */
function trimUnbalancedTrailing(value, openChar, closeChar) {
  let openCount = 0;
  let closeCount = 0;
  for (let i = 0; i < value.length; i++) {
    if (value[i] === openChar) {
      openCount++;
    } else if (value[i] === closeChar) {
      closeCount++;
    }
  }

  // Walk backwards over trailing closers, keeping the running count in sync
  // so the balance never has to be recomputed from scratch.
  let end = value.length;
  while (end > 0 && closeCount > openCount && value[end - 1] === closeChar) {
    end--;
    closeCount--;
  }
  return value.slice(0, end);
}
|
|
|
|
/**
 * Remove the run of sentence punctuation and quote characters (ASCII and
 * curly quotes) that ends a string.
 *
 * @param {string} value - Text to trim.
 * @returns {string} `value` without its trailing punctuation run; returned
 *   unchanged when it does not end with any of those characters.
 */
function stripTrailingPunctuation(value) {
  const trailingRun = /[.,;:!?'"\u2018\u2019\u201C\u201D]+$/.exec(value);
  if (trailingRun === null) {
    return value;
  }
  // The match starts at the first character of the maximal trailing run.
  return value.slice(0, trailingRun.index);
}
|
|
|
|
/**
 * Clean a raw URL-like string picked out of Markdown or plain text.
 *
 * The candidate is trimmed, unwrapped from `<...>`, and stripped of trailing
 * punctuation, unbalanced closing brackets, emphasis markers (`*`, `_`) and a
 * trailing footnote reference (`[^...]`). A candidate that still has more
 * opening than closing `(`, `[` or `{` afterwards is rejected.
 *
 * @param {*} raw - Candidate text; anything that is not a string yields null.
 * @param {object} [options]
 * @param {boolean} [options.keepTrailingParens] - When truthy, one trailing
 *   ")" is kept after the balance trim; otherwise every trailing ")" is
 *   removed.
 * @returns {string|null} The cleaned candidate, or null when nothing usable
 *   remains.
 */
function sanitizeUrlCandidate(raw, options = {}) {
  if (typeof raw !== "string") return null;

  let url = raw.trim();
  if (url === "") return null;

  const keepParens = Boolean(options.keepTrailingParens);

  const wrappedInAngles = url.startsWith("<") && url.endsWith(">");
  if (wrappedInAngles) {
    url = url.slice(1, -1).trim();
  }

  url = stripTrailingPunctuation(url);

  // trimUnbalancedTrailing only removes ")" while the string ends with one and
  // closers outnumber openers, so the same call serves both option values.
  url = trimUnbalancedTrailing(url, "(", ")");
  url = trimUnbalancedTrailing(url, "[", "]");
  url = trimUnbalancedTrailing(url, "{", "}");
  url = stripTrailingPunctuation(url);

  // Collapse the remaining run of trailing ")" to one (keepParens) or none.
  // NOTE(review): when keepTrailingParens is unset, a URL ending in a
  // *balanced* ")" loses it here and is then rejected below as unbalanced;
  // confirm that this is intended.
  url = url.replace(/[)]+$/g, keepParens ? ")" : "");
  url = url.replace(/[*_]+$/, "");
  url = url.replace(/\[\^[^\]]*\]$/, "");
  url = stripTrailingPunctuation(url);
  if (!keepParens) {
    url = trimUnbalancedTrailing(url, "(", ")");
  }

  const occurrences = (pattern) => (url.match(pattern) || []).length;
  const delimiterPairs = [
    [/\(/g, /\)/g],
    [/\[/g, /]/g],
    [/{/g, /}/g],
  ];
  for (const [opener, closer] of delimiterPairs) {
    // An opener without its closer means the candidate was cut short.
    if (occurrences(opener) > occurrences(closer)) {
      return null;
    }
  }

  return url || null;
}
|
|
|
|
/**
 * Locate the closing delimiter that brings the nesting depth back to zero.
 *
 * Scanning starts at `startIndex`. A backslash causes the character after it
 * to be skipped, so escaped delimiters are not counted.
 *
 * @param {string} text - Text to scan.
 * @param {number} startIndex - Index at which scanning begins.
 * @param {string} openChar - Opening delimiter.
 * @param {string} closeChar - Closing delimiter.
 * @returns {number} Index of the matching `closeChar`, or -1 if the depth
 *   never returns to zero on a closing delimiter.
 */
function findMatchingPair(text, startIndex, openChar, closeChar) {
  let nesting = 0;
  let pos = startIndex;

  while (pos < text.length) {
    const current = text[pos];

    if (current === "\\") {
      // Skip the backslash and the character it escapes.
      pos += 2;
      continue;
    }

    if (current === openChar) {
      nesting += 1;
    } else if (current === closeChar) {
      nesting -= 1;
      if (nesting === 0) {
        return pos;
      }
    }

    pos += 1;
  }

  return -1;
}
|
|
|
|
/**
 * Pull the destination out of the text found between the parentheses of a
 * Markdown link, leaving out any title that follows it.
 *
 * - `<...>` destinations return the trimmed text between the angle brackets.
 * - Otherwise characters are copied until the first whitespace that is not
 *   inside parentheses. A backslash is dropped and the character after it is
 *   copied verbatim.
 *
 * @param {*} raw - Text between "(" and ")" of a link.
 * @returns {string|null} The destination (possibly an empty string), or null
 *   when `raw` is not a string or is blank.
 */
function parseLinkDestination(raw) {
  if (typeof raw !== "string") return null;

  const source = raw.trim();
  if (source.length === 0) return null;

  if (source.startsWith("<")) {
    const closing = source.indexOf(">");
    if (closing > 0) {
      return source.slice(1, closing).trim();
    }
  }

  const kept = [];
  let openParens = 0;
  let index = 0;

  while (index < source.length) {
    const ch = source[index];
    index += 1;

    if (ch === "\\") {
      // Escape: keep the following character as-is (if there is one).
      if (index < source.length) {
        kept.push(source[index]);
        index += 1;
      }
      continue;
    }

    if (ch === "(") {
      openParens += 1;
    } else if (ch === ")" && openParens > 0) {
      openParens -= 1;
    } else if (openParens === 0 && /\s/.test(ch)) {
      // Unparenthesised whitespace ends the destination (a title may follow).
      break;
    }

    kept.push(ch);
  }

  return kept.join("");
}
|
|
|
|
/**
 * Find inline Markdown links and images (`[label](destination)` and
 * `![alt](destination)`) and report where each destination sits in `text`.
 *
 * Every "[" is tried as the start of a link. A bracket that has no matching
 * "]", is not followed by "(", or whose "(" is never closed is simply not a
 * link: scanning resumes at the next character, so later links on the same
 * line are still found and links or images nested inside a label (for
 * example `[![badge](img)](href)`) get their own token. Text inside a
 * recognised destination is not scanned again.
 *
 * @param {string} text - Markdown text to scan.
 * @returns {Array<{url: string, start: number, end: number}>} One token per
 *   link with a non-empty destination, in order of the link's opening
 *   bracket. `start`/`end` are offsets into `text` covering the destination
 *   URL, or the whole parenthesised span when the parsed URL does not occur
 *   verbatim in it (e.g. because escapes were removed).
 */
function extractMarkdownLinkTokens(text) {
  const tokens = [];
  // Index of each recognised destination's "(" -> index of its ")".
  const destinationEndByStart = new Map();

  for (let i = 0; i < text.length; i++) {
    const destinationEnd = destinationEndByStart.get(i);
    if (destinationEnd !== undefined) {
      // Hop over destination text so brackets inside a URL are not parsed.
      i = destinationEnd;
      continue;
    }

    if (text[i] === "!") {
      if (text[i + 1] !== "[") continue;
      i += 1;
    }
    if (text[i] !== "[") continue;

    const closeBracket = findMatchingPair(text, i, "[", "]");
    if (closeBracket === -1) continue;

    let openParen = closeBracket + 1;
    while (openParen < text.length && /\s/.test(text[openParen])) openParen++;
    if (text[openParen] !== "(") continue;

    const closeParen = findMatchingPair(text, openParen, "(", ")");
    // An unclosed "(" only disqualifies this bracket, not the rest of the text.
    if (closeParen === -1) continue;

    destinationEndByStart.set(openParen, closeParen);

    const rawDestination = text.slice(openParen + 1, closeParen);
    const url = parseLinkDestination(rawDestination);
    if (!url) continue;

    const destinationStart = openParen + 1;
    const offset = rawDestination.indexOf(url);
    if (offset > -1) {
      tokens.push({
        url,
        start: destinationStart + offset,
        end: destinationStart + offset + url.length,
      });
    } else {
      tokens.push({
        url,
        start: destinationStart,
        end: closeParen,
      });
    }
  }

  return tokens;
}
|
|
|
|
/**
 * List the destination URLs of the inline Markdown links found in `text`.
 *
 * @param {string} text - Markdown text to scan.
 * @returns {string[]} The `url` of every token returned by
 *   extractMarkdownLinkTokens, in the same order.
 */
function extractMarkdownDestinations(text) {
  const destinations = [];
  for (const { url } of extractMarkdownLinkTokens(text)) {
    destinations.push(url);
  }
  return destinations;
}
|
|
|
|
/**
 * Tell whether a link carries a scheme separator ("://").
 *
 * @param {*} link - Value to test.
 * @returns {boolean} True only for strings that contain "://".
 */
function isExternalLink(link) {
  if (typeof link !== "string") {
    return false;
  }
  return link.indexOf("://") !== -1;
}
|
|
|
|
/**
 * Blank out inline code spans so that URLs inside them are ignored.
 *
 * A code span starts at a run of backticks and ends at the next run of
 * backticks of exactly the same length (a longer or shorter run does not
 * close it). The whole span, delimiters included, is replaced by the same
 * number of spaces, so character offsets in the result line up with `text`.
 * A backtick run without a matching closer is copied through unchanged.
 *
 * @param {*} text - Line of Markdown.
 * @returns {*} The text with code spans replaced by spaces; values that are
 *   not strings, or contain no backtick, are returned as-is.
 */
function stripMarkdownInlineCode(text) {
  if (typeof text !== "string" || !text.includes("`")) {
    return text;
  }

  // Length of the backtick run that starts at `start`.
  const runLengthAt = (start) => {
    let end = start;
    while (end < text.length && text[end] === "`") end++;
    return end - start;
  };

  let result = "";
  let index = 0;

  while (index < text.length) {
    if (text[index] !== "`") {
      result += text[index];
      index += 1;
      continue;
    }

    const fenceLength = runLengthAt(index);

    // Look for a later run whose length equals the opening run's length.
    let closingIndex = -1;
    let probe = text.indexOf("`", index + fenceLength);
    while (probe !== -1) {
      const probeLength = runLengthAt(probe);
      if (probeLength === fenceLength) {
        closingIndex = probe;
        break;
      }
      probe = text.indexOf("`", probe + probeLength);
    }

    if (closingIndex === -1) {
      // No closer: the backticks are literal text.
      result += text.slice(index, index + fenceLength);
      index += fenceLength;
      continue;
    }

    const spanEnd = closingIndex + fenceLength;
    result += " ".repeat(spanEnd - index);
    index = spanEnd;
  }

  return result;
}
|
|
|
|
/**
 * Recognise the opening line of a fenced code block.
 *
 * A fence is up to three spaces of indentation followed by at least three
 * backticks or at least three tildes; the run must consist of one character
 * only. For backtick fences the remainder of the line must not contain a
 * backtick, otherwise the line is an inline code span such as
 * "```code``` text" rather than a fence.
 *
 * @param {*} line - A single line of Markdown.
 * @returns {{marker: string, length: number}|null} The fence character and
 *   the length of the fence run, or null when the line does not open a fence.
 */
function parseMarkdownFence(line) {
  if (typeof line !== "string") {
    return null;
  }

  const match = /^ {0,3}(`{3,}|~{3,})/.exec(line);
  if (!match) {
    return null;
  }

  const run = match[1];
  const marker = run[0];
  if (marker === "`" && line.includes("`", match[0].length)) {
    return null;
  }

  return {
    marker,
    length: run.length,
  };
}
|
|
|
|
/**
 * Decide whether a line closes the currently open fenced code block.
 *
 * The line must consist of up to three spaces of indentation, a run of at
 * least three identical fence characters, and optional trailing spaces or
 * tabs. The run must use the same character as the opening fence and be at
 * least as long.
 *
 * @param {*} line - A single line of Markdown.
 * @param {{marker: string, length: number}|null} activeFence - The fence
 *   that is currently open, as returned by parseMarkdownFence.
 * @returns {boolean} True when the line closes `activeFence`.
 */
function isFenceClosingLine(line, activeFence) {
  if (!activeFence || typeof line !== "string") {
    return false;
  }

  const match = /^ {0,3}(`{3,}|~{3,})[ \t]*$/.exec(line);
  if (!match) {
    return false;
  }

  const run = match[1];
  return run[0] === activeFence.marker && run.length >= activeFence.length;
}
|
|
|
|
/**
 * Tell whether a line is indented far enough to belong to an indented code
 * block: four leading spaces, or a tab preceded by at most three spaces.
 *
 * Lines indented by one to three spaces are ordinary content, matching the
 * zero-to-three-space tolerance used when detecting fenced code blocks.
 *
 * @param {*} line - A single line of Markdown.
 * @returns {boolean} True when the line has code-block indentation; false
 *   for empty or non-string input.
 */
function isIndentedCodeLine(line) {
  if (typeof line !== "string" || !line) {
    return false;
  }
  return /^(?: {4}| {0,3}\t)/.test(line);
}
|
|
|
|
/**
 * Collect the distinct external URLs that appear in a piece of Markdown text.
 *
 * Inline code spans are blanked first. URLs are then gathered from three
 * sources, in this order: inline link/image destinations, `<http...>`
 * autolinks, and bare `http(s)://` URLs that do not start inside a link
 * destination already handled. Each candidate is passed through
 * sanitizeUrlCandidate and kept only if it is an external link that has not
 * been seen yet.
 *
 * The quick "does this text mention http at all" pre-check ignores case, in
 * line with the case-insensitive URL patterns used below.
 *
 * @param {*} text - Markdown text (typically one line).
 * @returns {string[]} Unique sanitized URLs in discovery order; empty when
 *   `text` is not a string or contains no "http".
 */
function extractLinksFromText(text) {
  const mentionsHttp = (value) => typeof value === "string" && /http/i.test(value);

  if (!mentionsHttp(text)) {
    return [];
  }

  const strippedText = stripMarkdownInlineCode(text);
  if (!mentionsHttp(strippedText)) {
    return [];
  }

  const results = [];
  const seen = new Set();
  const markdownLinkTokens = extractMarkdownLinkTokens(strippedText);

  const addCandidate = (candidate, options = {}) => {
    const sanitized = sanitizeUrlCandidate(candidate, options);
    if (!sanitized) return;
    if (!isExternalLink(sanitized)) return;
    if (seen.has(sanitized)) return;
    seen.add(sanitized);
    results.push(sanitized);
  };

  // Markdown destinations are delimited by the link syntax itself, so a
  // closing parenthesis at the end may belong to the URL.
  for (const token of markdownLinkTokens) {
    addCandidate(token.url, { keepTrailingParens: true });
  }

  const angleRegex = /<\s*(https?:\/\/[^>\s]+)\s*>/gi;
  let match;
  while ((match = angleRegex.exec(strippedText)) !== null) {
    addCandidate(match[1]);
  }

  const autoRegex = /https?:\/\/[^\s<>"`]+/gi;
  while ((match = autoRegex.exec(strippedText)) !== null) {
    const matchStart = match.index;
    const startsInsideDestination = markdownLinkTokens.some(
      (token) => matchStart >= token.start && matchStart < token.end
    );
    if (startsInsideDestination) {
      continue;
    }
    addCandidate(match[0]);
  }

  return results;
}
|
|
|
|
/**
 * Read Markdown line by line from a stream and collect the external links
 * found outside front matter and code blocks.
 *
 * Skipped regions: a front-matter section opened by "---" on the first line
 * and closed by the next "---" line, fenced code blocks, and indented code
 * blocks (an indented line that follows a blank line, together with the blank
 * or indented lines after it).
 *
 * The readline interface is always closed when reading ends, and
 * `stream.close()` is called if the stream has such a method.
 *
 * @param {object} stream - Readable stream passed to readline as `input`.
 * @returns {Promise<Array<{url: string, line: number}>>} One entry per URL
 *   returned by extractLinksFromText, with its 1-based line number.
 */
async function collectMarkdownLinksFromStream(stream) {
  const reader = readline.createInterface({ input: stream, crlfDelay: Infinity });
  const found = [];

  let lineNo = 0;
  let insideFrontMatter = false;
  let openFence = null;
  let insideIndentedCode = false;
  let prevBlank = true;

  try {
    for await (const line of reader) {
      lineNo += 1;
      const content = line.trim();
      const isBlank = content === "";
      const isDashRule = content === "---";

      // Skip YAML front matter entirely; only scan Markdown content.
      if (lineNo === 1 && isDashRule) {
        insideFrontMatter = true;
        continue;
      }
      if (insideFrontMatter) {
        // The closing "---" line is skipped as well.
        insideFrontMatter = !isDashRule;
        continue;
      }

      // Inside a fenced block every line is ignored, including the closer.
      if (openFence) {
        if (isFenceClosingLine(line, openFence)) {
          openFence = null;
        }
        prevBlank = isBlank;
        continue;
      }

      const fence = parseMarkdownFence(line);
      if (fence) {
        openFence = fence;
        prevBlank = isBlank;
        continue;
      }

      // An indented block continues across blank and indented lines.
      if (insideIndentedCode) {
        if (isBlank || isIndentedCodeLine(line)) {
          prevBlank = isBlank;
          continue;
        }
        insideIndentedCode = false;
      }

      // An indented block can only begin right after a blank line.
      if (prevBlank && isIndentedCodeLine(line)) {
        insideIndentedCode = true;
        prevBlank = false;
        continue;
      }

      for (const url of extractLinksFromText(line)) {
        found.push({ url, line: lineNo });
      }

      prevBlank = isBlank;
    }
  } finally {
    reader.close();
    if (typeof stream.close === "function") {
      stream.close();
    }
  }

  return found;
}
|
|
|
|
/**
 * Collect the external links of a Markdown file.
 *
 * Opens `filePath` as a UTF-8 read stream and hands it to
 * collectMarkdownLinksFromStream. If collection fails, the stream is
 * destroyed and the original error is rethrown.
 *
 * @param {string} filePath - Path passed to fs.createReadStream.
 * @returns {Promise<Array<{url: string, line: number}>>} The links reported
 *   by collectMarkdownLinksFromStream.
 */
async function collectMarkdownLinksFromFile(filePath) {
  const input = fs.createReadStream(filePath, { encoding: "utf8" });
  try {
    const links = await collectMarkdownLinksFromStream(input);
    return links;
  } catch (failure) {
    input.destroy();
    throw failure;
  }
}
|
|
|
|
module.exports = {
|
|
collectMarkdownLinksFromFile,
|
|
collectMarkdownLinksFromStream,
|
|
extractLinksFromText,
|
|
sanitizeUrlCandidate,
|
|
};
|