/**
 * Soul创业派对 - content parsing utility.
 * Parses TipTap HTML into segments that the reading page can render.
 *
 * Segment shapes:
 *   { type: 'text', text }
 *   { type: 'mention', userId, nickname }                    - "@someone", tap to add as friend
 *   { type: 'linkTag', label, url, tagType, pagePath, tagId } - "#link tag", tap to navigate
 *   { type: 'image', src, alt }                              - image
 */

/** Entities understood by decodeEntities, keyed by the text between "&" and ";". */
const ENTITY_MAP = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  '#39': "'"
}

/**
 * Tell whether a string looks like HTML (contains at least one opening tag).
 * @param {*} content
 * @returns {boolean}
 */
function isHtmlContent(content) {
  if (!content || typeof content !== 'string') return false
  const trimmed = content.trim()
  return trimmed.includes('<') && trimmed.includes('>') && /<[a-z][^>]*>/i.test(trimmed)
}

/**
 * Decode the common HTML entities listed in ENTITY_MAP.
 *
 * Decoding happens in a single pass, so text that was escaped twice
 * (e.g. "&amp;lt;") is decoded exactly once ("&lt;") instead of collapsing to "<".
 * @param {string} str
 * @returns {string}
 */
function decodeEntities(str) {
  return str.replace(/&(nbsp|amp|lt|gt|quot|apos|#39);/g, (match, name) => ENTITY_MAP[name])
}

/** Remove every HTML tag from a fragment, keeping only its text. */
function stripTags(html) {
  return html.replace(/<[^>]+>/g, '')
}

/**
 * Read a double-quoted attribute from a raw tag string and entity-decode it.
 * The attribute must be preceded by whitespace so that e.g. "src" does not
 * match inside "data-src".
 * @param {string} tag raw tag text, e.g. '<img src="a.png">'
 * @param {string} name attribute name (letters, digits and "-" only)
 * @param {*} [fallback=''] value returned when the attribute is absent
 * @returns {*} decoded attribute value, or fallback
 */
function readAttr(tag, name, fallback = '') {
  const found = tag.match(new RegExp('\\s' + name + '="([^"]*)"', 'i'))
  return found ? decodeEntities(found[1]) : fallback
}

/**
 * Parse one HTML block (a single visual line) into an array of segments.
 * Recognised inline elements: mention span, linkTag span, legacy "#..." anchor, img.
 * Everything else is reduced to plain text.
 * @param {string} block
 * @returns {Array<Object>}
 */
function parseBlockToSegments(block) {
  const segs = []
  // One combined pattern so tokens are found in document order.
  // Capture groups 1 and 2 belong to the legacy anchor alternative only.
  const tokenRe = /<span\b[^>]*data-type="mention"[^>]*>[\s\S]*?<\/span>|<span\b[^>]*data-type="linkTag"[^>]*>[\s\S]*?<\/span>|<a\b[^>]*href="([^"]*)"[^>]*>(#[^<]*)<\/a>|<img\b[^>]*\/?>/gi
  let lastEnd = 0
  let m
  while ((m = tokenRe.exec(block)) !== null) {
    // Plain text between the previous token and this one
    const before = decodeEntities(stripTags(block.slice(lastEnd, m.index)))
    if (before.trim()) segs.push({ type: 'text', text: before })

    const tag = m[0]
    if (/data-type="mention"/i.test(tag)) {
      // @mention - TipTap mention span
      const innerText = decodeEntities(stripTags(tag))
      const userId = readAttr(tag, 'data-id').trim()
      const label = readAttr(tag, 'data-label', null)
      // Fall back to the visible text (without the leading "@") when data-label is missing
      const nickname = label !== null ? label.trim() : innerText.replace(/^@/, '').trim()
      if (userId || nickname) segs.push({ type: 'mention', userId, nickname })
    } else if (/data-type="linkTag"/i.test(tag)) {
      // #linkTag - custom span (data-url / data-tag-type / data-page-path / data-tag-id)
      const innerText = decodeEntities(stripTags(tag)).replace(/^#/, '').trim()
      segs.push({
        type: 'linkTag',
        label: innerText || '#',
        url: readAttr(tag, 'data-url'),
        tagType: readAttr(tag, 'data-tag-type', 'url'),
        pagePath: readAttr(tag, 'data-page-path'),
        tagId: readAttr(tag, 'data-tag-id')
      })
    } else if (/^<a\b/i.test(tag)) {
      // #linkTag - legacy anchor format (produced by the old insertLinkTag; url may be empty)
      // m[1] = href, m[2] = inner text (starts with "#")
      const url = decodeEntities(m[1] || '')
      const label = decodeEntities(m[2] || '').replace(/^#/, '').trim()
      // The legacy format carries no tagType; onLinkTagTap falls back to matching
      // the label against the cached linkTags configuration.
      segs.push({ type: 'linkTag', label: label || '#', url, tagType: '', pagePath: '', tagId: '' })
    } else if (/^<img\b/i.test(tag)) {
      // Image - emitted only when a src is present
      const src = readAttr(tag, 'src')
      if (src) segs.push({ type: 'image', src, alt: readAttr(tag, 'alt') })
    }
    lastEnd = m.index + tag.length
  }

  // Trailing plain text after the last token
  const after = decodeEntities(stripTags(block.slice(lastEnd)))
  if (after.trim()) segs.push({ type: 'text', text: after })
  return segs
}

/**
 * Parse HTML into lines (plain text per line) and segments (rich fragments per line).
 * Both arrays have the same length and are index-aligned.
 * @param {string} html
 * @returns {{ lines: string[], segments: Array<Array<Object>> }}
 */
function parseHtmlToSegments(html) {
  const lines = []
  const segments = []

  // 1. Turn block-level tags into newlines; inline tags are kept for step 2.
  //    "\b" keeps <p>/<div>/<li> from matching longer tag names such as <pre> or <link>.
  const text = html
    .replace(/<\/p>\s*<p\b[^>]*>/gi, '\n\n')
    .replace(/<p\b[^>]*>/gi, '')
    .replace(/<\/p>/gi, '\n')
    .replace(/<div\b[^>]*>/gi, '')
    .replace(/<\/div>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/?h[1-6][^>]*>/gi, '\n')
    .replace(/<\/?blockquote[^>]*>/gi, '\n')
    .replace(/<\/?ul[^>]*>/gi, '\n')
    .replace(/<\/?ol[^>]*>/gi, '\n')
    .replace(/<li\b[^>]*>/gi, '• ')
    .replace(/<\/li>/gi, '\n')

  // 2. Parse block by block
  for (const block of text.split(/\n+/)) {
    if (!block.trim()) continue
    const blockSegs = parseBlockToSegments(block)
    if (!blockSegs.length) continue

    // A line holding only an image becomes its own paragraph with an empty text line
    if (blockSegs.length === 1 && blockSegs[0].type === 'image') {
      lines.push('')
      segments.push(blockSegs)
      continue
    }

    // Plain text of the line, used by lines (previewParagraphs fallback rendering)
    lines.push(decodeEntities(stripTags(block)).trim())
    segments.push(blockSegs)
  }
  return { lines, segments }
}

/**
 * Parse plain text (no HTML tags) line by line; blank lines are dropped.
 * @param {string} text
 * @returns {{ lines: string[], segments: Array<Array<Object>> }}
 */
function parsePlainTextToSegments(text) {
  const lines = text.split('\n').map(l => l.trim()).filter(l => l.length > 0)
  const segments = lines.map(line => [{ type: 'text', text: line }])
  return { lines, segments }
}

/**
 * Parse raw content into contentSegments for the reading page.
 * @param {string} rawContent HTML or plain text
 * @returns {{ lines: string[], segments: Array<Array<Object>> }}
 */
function parseContent(rawContent) {
  if (!rawContent || typeof rawContent !== 'string') {
    return { lines: [], segments: [] }
  }
  if (isHtmlContent(rawContent)) {
    return parseHtmlToSegments(rawContent)
  }
  return parsePlainTextToSegments(rawContent)
}

// Guarded so the file can also be evaluated where no CommonJS `module` object exists.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { parseContent, isHtmlContent }
}