/** * 卡若创业派对 - 内容解析工具 * 解析 TipTap HTML 为阅读页可展示的 segments * * segment 类型: * { type: 'text', text } * { type: 'mention', userId, nickname } — @某人,点击加好友 * { type: 'linkTag', label, url } — #链接标签,点击跳转 * { type: 'image', src, alt } — 图片 */ /** 判断内容是否为 HTML */ function isHtmlContent(content) { if (!content || typeof content !== 'string') return false const trimmed = content.trim() return trimmed.includes('<') && trimmed.includes('>') && /<[a-z][^>]*>/i.test(trimmed) } /** 解码常见 HTML 实体 */ function decodeEntities(str) { return str .replace(/ /g, ' ') .replace(/&/g, '&') .replace(/</g, '<') .replace(/>/g, '>') .replace(/"/g, '"') .replace(/'/g, "'") } /** * 单行展示用:昵称、#标签文案、章节外标题类字段 — 合并换行、
、连续空白(避免 TipTap/粘贴带入异常断行) */ function cleanSingleLineField(s) { if (!s && s !== 0) return '' let t = decodeEntities(String(s)) .replace(//gi, ' ') .replace(/\r\n|\r|\n/g, ' ') .replace(/[\s\u00a0\u200b\u200c\u200d\ufeff\u3000]+/g, ' ') .trim() return t } /** @提及昵称:去首尾空白、零宽、全角空格;合并内部换行/
*/ function cleanMentionNickname(n) { return cleanSingleLineField(n) } /** 纯文本在 mention 节点前若已有「@」,去掉末尾 @,避免渲染成「找@@阿浪」 */ function stripTrailingAtForMention(before) { return before.replace(/[@@][\s\u00a0\u200b]*$/u, '') } /** * 将一个 HTML block 字符串解析为 segments 数组 * 处理三种内联元素:mention / linkTag(span) / linkTag(a) / img */ function parseBlockToSegments(block) { const segs = [] // 合并匹配所有内联元素 const tokenRe = /]*data-type="mention"[^>]*>[\s\S]*?<\/span>|]*data-type="linkTag"[^>]*>[\s\S]*?<\/span>|]*href="([^"]*)"[^>]*>(#[^<]*)<\/a>|]*\/?>/gi let lastEnd = 0 let m while ((m = tokenRe.exec(block)) !== null) { // 前置纯文本(mention 紧挨手写「找@」时去掉重复 @) let before = decodeEntities(block.slice(lastEnd, m.index).replace(/<[^>]+>/g, '')) const tag = m[0] if (/data-type="mention"/i.test(tag)) { before = stripTrailingAtForMention(before) } if (before.trim()) segs.push({ type: 'text', text: before }) if (/data-type="mention"/i.test(tag)) { // @mention — TipTap mention span(span 内常见「@ 昵称」多空格,统一紧挨显示) const idMatch = tag.match(/data-id="([^"]*)"/) const labelMatch = tag.match(/data-label="([^"]*)"/) const innerText = tag.replace(/<[^>]+>/g, '') const userId = idMatch ? idMatch[1].trim() : '' let nickname = labelMatch ? labelMatch[1] : innerText.replace(/^[@@]\s*/, '') nickname = cleanMentionNickname((nickname || '').trim()) if (userId || nickname) { segs.push({ type: 'mention', userId, nickname, mentionDisplay: '@' + nickname }) } } else if (/data-type="linkTag"/i.test(tag)) { // #linkTag — 自定义 span 格式(data-type="linkTag" data-url="..." data-tag-type="..." data-page-path="..." data-app-id="...") const urlMatch = tag.match(/data-url="([^"]*)"/) const tagTypeMatch = tag.match(/data-tag-type="([^"]*)"/) const pagePathMatch = tag.match(/data-page-path="([^"]*)"/) const tagIdMatch = tag.match(/data-tag-id="([^"]*)"/) const appIdMatch = tag.match(/data-app-id="([^"]*)"/) const mpKeyMatch = tag.match(/data-mp-key="([^"]*)"/) const innerText = cleanSingleLineField(tag.replace(/<[^>]+>/g, '').replace(/^#/, '')) const url = urlMatch ? urlMatch[1] : '' const tagType = tagTypeMatch ? tagTypeMatch[1] : 'url' const pagePath = pagePathMatch ? pagePathMatch[1] : '' const tagId = tagIdMatch ? tagIdMatch[1] : '' const appId = appIdMatch ? appIdMatch[1] : '' const mpKey = mpKeyMatch ? mpKeyMatch[1] : '' segs.push({ type: 'linkTag', label: innerText || '#', url, tagType, pagePath, tagId, appId, mpKey }) } else if (/^(insertLinkTag 旧版产生,url 可能为空) // m[1] = href, m[2] = innerText(以 # 开头) const url = m[1] || '' const label = cleanSingleLineField((m[2] || '').replace(/^#/, '')) // 旧格式没有 tagType,在 onLinkTagTap 中会按 label 匹配缓存的 linkTags 配置降级处理 segs.push({ type: 'linkTag', label: label || '#', url, tagType: '', pagePath: '', tagId: '' }) } else if (/^]+>/g, '')) if (after.trim()) segs.push({ type: 'text', text: after }) return segs } /** * 从 HTML 中解析出 lines(纯文本行)和 segments(含富文本片段) * @param {string} html * @param {object} [config] - { persons: [], linkTags: [] },用于对 text 段自动匹配 @人名 / #标签 */ function parseHtmlToSegments(html, config) { const lines = [] const segments = [] // 1. 块级标签换行,保留内联标签供后续解析 let text = html text = text.replace(/<\/p>\s*]*>/gi, '\n\n') text = text.replace(/]*>/gi, '') text = text.replace(/<\/p>/gi, '\n') text = text.replace(/]*>/gi, '') text = text.replace(/<\/div>/gi, '\n') text = text.replace(//gi, '\n') text = text.replace(/<\/?h[1-6][^>]*>/gi, '\n') text = text.replace(/<\/?blockquote[^>]*>/gi, '\n') text = text.replace(/<\/?ul[^>]*>/gi, '\n') text = text.replace(/<\/?ol[^>]*>/gi, '\n') text = text.replace(/]*>/gi, '• ') text = text.replace(/<\/li>/gi, '\n') // 2. 逐段解析 const blocks = text.split(/\n+/) for (const block of blocks) { if (!block.trim()) continue let blockSegs = parseBlockToSegments(block) if (!blockSegs.length) continue // 纯图片行独立成段 if (blockSegs.length === 1 && blockSegs[0].type === 'image') { lines.push('') segments.push(blockSegs) continue } // 对 text 段再跑一遍 @人名 / #标签 自动匹配(处理未用 TipTap 插入而是手打的 @xxx) if (config && (config.persons?.length || config.linkTags?.length)) { const expanded = [] for (const seg of blockSegs) { if (seg.type === 'text' && seg.text) { const sub = matchLineToSegments(seg.text, config) expanded.push(...sub) } else { expanded.push(seg) } } blockSegs = expanded } // 行纯文本用于 lines(previewParagraphs 降级展示) const lineText = decodeEntities(block.replace(/<[^>]+>/g, '')).trim() lines.push(lineText) segments.push(blockSegs) } return { lines, segments } } /** 清理 Markdown 格式标记(**加粗** *斜体* __加粗__ _斜体_ ~~删除线~~ `代码` 等)*/ function stripMarkdownFormatting(text) { if (!text) return text let s = text s = s.replace(/^#{1,6}\s+/gm, '') s = s.replace(/\*\*(.+?)\*\*/g, '$1') s = s.replace(/__(.+?)__/g, '$1') s = s.replace(/(?\s+/gm, '') s = s.replace(/^---$/gm, '') s = s.replace(/^\* /gm, '• ') s = s.replace(/^- /gm, '• ') s = s.replace(/^\d+\.\s/gm, '') return s } /** * 对一行纯文本进行 @人名 / #标签 自动匹配,返回 segments 数组 * config: { persons: [{ personId, token, name, label, aliases }], linkTags: [...] } * 点击加好友时须传 persons.token(与 CKB lead 的 targetUserId 一致),不能用 personId。 */ function matchLineToSegments(line, config) { if (!config || (!config.persons?.length && !config.linkTags?.length)) { return [{ type: 'text', text: line }] } // 编辑器/系统在 @ 与人名之间插入的普通空格,合并为紧挨 @(避免「找@ 阿浪」无法匹配人名) line = line.replace(/([@@])\s+(?=[\u4e00-\u9fffA-Za-z0-9_\u00b7])/g, '$1') const normalize = s => (s || '').trim().toLowerCase() const personMap = {} const tagMap = {} for (const p of (config.persons || [])) { const token = (p.token || '').trim() if (!token) continue const display = (p.name || p.label || '').trim() const aliasStr = p.aliases != null ? String(p.aliases) : '' const keys = [display, p.label, ...(aliasStr ? aliasStr.split(',') : [])] .map((x) => (x != null ? String(x) : '').trim()) .filter(Boolean) .map(normalize) .filter(Boolean) for (const k of keys) { if (!personMap[k]) personMap[k] = p } } for (const t of (config.linkTags || [])) { const keys = [t.label, ...(t.aliases ? t.aliases.split(',') : [])].map(normalize).filter(Boolean) for (const k of keys) { if (!tagMap[k]) tagMap[k] = t } } const esc = n => n.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') const personNames = Object.keys(personMap).sort((a, b) => b.length - a.length).map(esc) const tagLabels = Object.keys(tagMap).sort((a, b) => b.length - a.length).map(esc) if (!personNames.length && !tagLabels.length) return [{ type: 'text', text: line }] const parts = [] if (personNames.length) parts.push('[@@]\\s*(' + personNames.join('|') + ')') if (tagLabels.length) parts.push('[##]\\s*(' + tagLabels.join('|') + ')') const pattern = new RegExp(parts.join('|'), 'gi') const segs = [] let lastEnd = 0 let m while ((m = pattern.exec(line)) !== null) { if (m.index > lastEnd) { segs.push({ type: 'text', text: line.slice(lastEnd, m.index) }) } const full = m[0] if (/^[@@]/u.test(full)) { const body = full.replace(/^[@@]\s*/u, '') const person = personMap[normalize(body)] if (person) { const nick = cleanSingleLineField(person.name || person.label || body) const uid = (person.token || '').trim() if (uid) { segs.push({ type: 'mention', userId: uid, nickname: nick, mentionDisplay: '@' + nick }) } else { segs.push({ type: 'text', text: full }) } } else { segs.push({ type: 'text', text: full }) } } else { const body = full.replace(/^[##]\s*/u, '') const tag = tagMap[normalize(body)] if (tag) { segs.push({ type: 'linkTag', label: tag.label || body, url: tag.url || '', tagType: tag.type || 'url', pagePath: tag.pagePath || '', tagId: tag.tagId || '', appId: tag.appId || '', mpKey: tag.mpKey || '' }) } else { segs.push({ type: 'text', text: full }) } } lastEnd = m.index + full.length } if (lastEnd < line.length) { segs.push({ type: 'text', text: line.slice(lastEnd) }) } return segs.length ? segs : [{ type: 'text', text: line }] } /** 纯文本/Markdown 按行解析 */ function parsePlainTextToSegments(text, config) { const cleaned = stripMarkdownFormatting(text) const lines = cleaned.split('\n').map(l => l.trim()).filter(l => l.length > 0) const segments = lines.map(line => matchLineToSegments(line, config)) return { lines, segments } } /** 清理残留的 Markdown 图片引用文本(如 "image.png![](xxx)" ) */ function stripOrphanImageRefs(text) { if (!text) return text text = text.replace(/[^\s]*\.(?:png|jpg|jpeg|gif|webp|svg|bmp)!\[[^\]]*\]\([^)]*\)/gi, '') text = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '') return text } /** * 将原始内容解析为 contentSegments(用于阅读页展示) * @param {string} rawContent * @param {object} [config] - { persons: [], linkTags: [] } * @returns {{ lines: string[], segments: Array> }} */ function parseContent(rawContent, config) { if (!rawContent || typeof rawContent !== 'string') { return { lines: [], segments: [] } } let content = stripOrphanImageRefs(rawContent) if (isHtmlContent(content)) { return parseHtmlToSegments(content, config) } return parsePlainTextToSegments(content, config) } module.exports = { parseContent, isHtmlContent, cleanSingleLineField, }