// 2025/tools/lib/markdown_links.js (247 lines, 6.1 KiB, JavaScript)

const fs = require("fs");
const readline = require("readline");
/**
 * Strips surplus closing delimiters from the end of a string.
 *
 * A trailing `closeChar` is removed only while the string holds more
 * `closeChar` than `openChar` occurrences, so balanced pairs (for example the
 * parentheses in `wiki/Foo_(bar)`) are left untouched.
 *
 * @param {string} value - Text to trim.
 * @param {string} openChar - Single-character opening delimiter, e.g. "(".
 * @param {string} closeChar - Single-character closing delimiter, e.g. ")".
 * @returns {string} `value` without its unbalanced trailing closers.
 */
function trimUnbalancedTrailing(value, openChar, closeChar) {
  // Count both delimiters once, comparing characters literally. Building a
  // regex from the delimiter would misread word characters (e.g. "b" -> \b)
  // and would rescan the whole string on every removed character.
  let openCount = 0;
  let closeCount = 0;
  for (let i = 0; i < value.length; i++) {
    const ch = value[i];
    if (ch === openChar) {
      openCount++;
    } else if (ch === closeChar) {
      closeCount++;
    }
  }

  // Walk back from the end, dropping closers only while they are in excess.
  let end = value.length;
  while (end > 0 && closeCount > openCount && value[end - 1] === closeChar) {
    end--;
    closeCount--;
  }
  return value.slice(0, end);
}
/**
 * Cleans a raw URL candidate pulled out of running text.
 *
 * Removes surrounding angle brackets, trailing sentence punctuation and
 * quotes, surplus closing brackets, trailing emphasis markers (`*`, `_`) and a
 * trailing footnote reference (`[^id]`). The clean-up repeats until the value
 * stops changing, so characters uncovered by a later step (for example the
 * "." left behind after the ")" of `http://x.com.)` is dropped) are removed
 * as well.
 *
 * @param {*} raw - Candidate text; anything that is not a string yields null.
 * @param {{keepTrailingParens?: boolean}} [options] - When
 *   `keepTrailingParens` is true, surplus ")" are trimmed only once, before
 *   emphasis/footnote markers are stripped, and not again afterwards.
 * @returns {string|null} The cleaned candidate, or null when nothing is left
 *   or when it still contains more opening than closing brackets.
 */
function sanitizeUrlCandidate(raw, options = {}) {
  if (typeof raw !== "string") return null;
  let candidate = raw.trim();
  if (!candidate) return null;
  if (candidate.startsWith("<") && candidate.endsWith(">")) {
    candidate = candidate.slice(1, -1).trim();
  }

  // `options` may be passed as null explicitly; treat that like "no options".
  const keepTrailingParens = Boolean(options && options.keepTrailingParens);

  // The first pass always drops surplus ")" up front; later passes honour
  // keepTrailingParens.
  let trimParens = true;
  let previous;
  do {
    previous = candidate;
    candidate = candidate.replace(/[.,;:!?'"\u2018\u2019\u201C\u201D]+$/, "");
    if (trimParens) {
      candidate = trimUnbalancedTrailing(candidate, "(", ")");
    }
    candidate = trimUnbalancedTrailing(candidate, "[", "]");
    candidate = trimUnbalancedTrailing(candidate, "{", "}");
    candidate = candidate.replace(/[*_]+$/, "");
    candidate = candidate.replace(/\[\^[^\]]*\]$/, "");
    trimParens = !keepTrailingParens;
    if (trimParens) {
      candidate = trimUnbalancedTrailing(candidate, "(", ")");
    }
    // Every step only shortens the string, so this loop always terminates.
  } while (candidate !== previous);

  // A candidate with unclosed openers was cut off mid-URL; reject it.
  const countOf = (ch) => candidate.split(ch).length - 1;
  if (countOf("(") > countOf(")")) return null;
  if (countOf("[") > countOf("]")) return null;
  if (countOf("{") > countOf("}")) return null;

  return candidate || null;
}
/**
 * Locates the closing delimiter that balances the nesting begun at
 * `startIndex`.
 *
 * Scanning starts at `startIndex`; a backslash hides the character that
 * follows it from the nesting count.
 *
 * @param {string} text - Text to scan.
 * @param {number} startIndex - Index where scanning begins (normally the
 *   position of an `openChar`).
 * @param {string} openChar - Opening delimiter.
 * @param {string} closeChar - Closing delimiter.
 * @returns {number} Index of the closer that brings the nesting level back to
 *   zero, or -1 when there is none.
 */
function findMatchingPair(text, startIndex, openChar, closeChar) {
  let nesting = 0;
  let pos = startIndex;
  while (pos < text.length) {
    const current = text[pos];
    if (current === "\\") {
      // Jump over the backslash and the character it escapes.
      pos += 2;
      continue;
    }
    if (current === openChar) {
      nesting += 1;
    } else if (current === closeChar) {
      nesting -= 1;
      if (nesting === 0) {
        return pos;
      }
    }
    pos += 1;
  }
  return -1;
}
/**
 * Pulls the destination out of the text found between the parentheses of a
 * markdown link, leaving any trailing title behind.
 *
 * `<...>` destinations are returned verbatim (trimmed). Otherwise the
 * destination runs up to the first whitespace that is not inside parentheses;
 * a backslash is dropped and the character after it is kept literally.
 *
 * @param {*} raw - Text between the link's parentheses.
 * @returns {string|null} The destination (possibly an empty string), or null
 *   when `raw` is not a string or is blank.
 */
function parseLinkDestination(raw) {
  if (typeof raw !== "string") return null;
  const body = raw.trim();
  if (body === "") return null;

  if (body[0] === "<") {
    const end = body.indexOf(">");
    if (end > 0) {
      return body.slice(1, end).trim();
    }
  }

  const pieces = [];
  let depth = 0;
  let idx = 0;
  while (idx < body.length) {
    const ch = body[idx];
    idx += 1;
    if (ch === "\\") {
      // Keep the escaped character as-is; a dangling backslash is discarded.
      if (idx < body.length) {
        pieces.push(body[idx]);
        idx += 1;
      }
      continue;
    }
    if (ch === "(") {
      depth += 1;
    } else if (ch === ")") {
      if (depth > 0) {
        depth -= 1;
      }
    } else if (depth === 0 && /\s/.test(ch)) {
      // Unnested whitespace ends the destination (a title may follow).
      break;
    }
    pieces.push(ch);
  }
  return pieces.join("");
}
/**
 * Collects the destinations of inline markdown links and images
 * (`[text](dest)` / `![alt](dest)`) found in `text`.
 *
 * Whitespace between the closing "]" and the opening "(" is tolerated.
 * Destinations are returned unfiltered and in document order; links nested in
 * the text of another link (e.g. the badge pattern `[![alt](img)](href)`) are
 * reported before the enclosing link.
 *
 * @param {string} text - Markdown text to scan.
 * @returns {string[]} Non-empty destinations as parsed by
 *   `parseLinkDestination`.
 */
function extractMarkdownDestinations(text) {
  const urls = [];
  for (let i = 0; i < text.length; i++) {
    // Images need no special case: the "!" is skipped like any other
    // character and the "[" after it is handled on the next iteration.
    if (text[i] !== "[") continue;

    const closeBracket = findMatchingPair(text, i, "[", "]");
    if (closeBracket === -1) continue;

    let pointer = closeBracket + 1;
    while (pointer < text.length && /\s/.test(text[pointer])) pointer++;

    // Not a link: move on one character (rather than past the whole bracket
    // span) so links nested inside these brackets are still found.
    if (text[pointer] !== "(") continue;

    const closeParen = findMatchingPair(text, pointer, "(", ")");
    // An unclosed "(" only rules out this bracket; later links may be fine.
    if (closeParen === -1) continue;

    // Link text can itself contain links or images; pick those up first.
    for (const nested of extractMarkdownDestinations(text.slice(i + 1, closeBracket))) {
      urls.push(nested);
    }

    const candidate = parseLinkDestination(text.slice(pointer + 1, closeParen));
    if (candidate) {
      urls.push(candidate);
    }
    i = closeParen;
  }
  return urls;
}
/**
 * Tells whether a link carries a scheme separator ("://").
 *
 * @param {*} link - Value to inspect.
 * @returns {boolean} True only for strings that contain "://".
 */
function isExternalLink(link) {
  if (typeof link !== "string") {
    return false;
  }
  return link.indexOf("://") !== -1;
}
/**
 * Extracts the unique external links found in a piece of markdown text.
 *
 * Three sources are combined, in this order: inline link/image destinations,
 * `<http(s)://...>` autolinks, and bare `http(s)://` URLs. Every candidate is
 * passed through `sanitizeUrlCandidate`, kept only if `isExternalLink`
 * accepts it, and reported once in order of first discovery.
 *
 * @param {*} text - Text to scan; non-strings yield an empty list.
 * @returns {string[]} Sanitized, de-duplicated external links.
 */
function extractLinksFromText(text) {
  // Cheap pre-check. Every link this function can return contains "://", so
  // test for that rather than a lowercase "http": the old check dropped
  // "HTTP://..." and lines whose only link used another scheme.
  if (typeof text !== "string" || !text.includes("://")) {
    return [];
  }
  const results = [];
  const seen = new Set();
  function addCandidate(candidate, options = {}) {
    const sanitized = sanitizeUrlCandidate(candidate, options);
    if (!sanitized) return;
    if (!isExternalLink(sanitized)) return;
    if (seen.has(sanitized)) return;
    seen.add(sanitized);
    results.push(sanitized);
  }
  for (const url of extractMarkdownDestinations(text)) {
    addCandidate(url, { keepTrailingParens: true });
  }
  const angleRegex = /<\s*(https?:\/\/[^>\s]+)\s*>/gi;
  let match;
  while ((match = angleRegex.exec(text)) !== null) {
    addCandidate(match[1]);
  }
  // Stop a bare URL at "](" so it cannot run from a link's text into its
  // destination: "[http://a](http://b)" must yield two URLs, not the single
  // string "http://a](http://b)".
  const autoRegex = /https?:\/\/(?:(?!\]\()[^\s<>"`])+/gi;
  while ((match = autoRegex.exec(text)) !== null) {
    addCandidate(match[0]);
  }
  return results;
}
/**
 * Reads a stream line by line and gathers every external link together with
 * the 1-based number of the line it was found on.
 *
 * If the first line is `---`, the lines up to the next `---` are treated as
 * front matter: both delimiter lines and front-matter lines starting with "#"
 * are skipped, while the remaining front-matter lines are still scanned.
 * The readline interface is always closed, and `stream.close()` is called
 * when the stream provides it.
 *
 * @param {object} stream - Input passed to `readline.createInterface`.
 * @returns {Promise<Array<{url: string, line: number}>>} Links in reading
 *   order.
 */
async function collectMarkdownLinksFromStream(stream) {
  const reader = readline.createInterface({ input: stream, crlfDelay: Infinity });
  const found = [];
  let insideFrontMatter = false;
  let currentLine = 0;
  try {
    for await (const rawLine of reader) {
      currentLine += 1;
      const stripped = rawLine.trim();
      const isDelimiter = stripped === "---";

      if (currentLine === 1 && isDelimiter) {
        insideFrontMatter = true;
        continue;
      }
      if (insideFrontMatter && isDelimiter) {
        insideFrontMatter = false;
        continue;
      }
      if (insideFrontMatter && stripped.startsWith("#")) {
        continue;
      }

      const lineLinks = extractLinksFromText(rawLine).map((url) => ({
        url,
        line: currentLine,
      }));
      found.push(...lineLinks);
    }
  } finally {
    reader.close();
    if (typeof stream.close === "function") {
      stream.close();
    }
  }
  return found;
}
/**
 * Opens a file as UTF-8 and gathers its external links via
 * `collectMarkdownLinksFromStream`.
 *
 * @param {string} filePath - Path handed to `fs.createReadStream`.
 * @returns {Promise<Array<{url: string, line: number}>>} Links with their
 *   1-based line numbers.
 * @throws Rethrows any failure from the collection step after destroying the
 *   file stream.
 */
async function collectMarkdownLinksFromFile(filePath) {
  const fileStream = fs.createReadStream(filePath, { encoding: "utf8" });
  let links;
  try {
    links = await collectMarkdownLinksFromStream(fileStream);
  } catch (failure) {
    fileStream.destroy();
    throw failure;
  }
  return links;
}
// Public API of this module. The remaining functions in this file are
// internal helpers and are not exported.
module.exports = {
collectMarkdownLinksFromFile,
collectMarkdownLinksFromStream,
extractLinksFromText,
sanitizeUrlCandidate,
};