/**
 * Soul创业派对 - content parsing utilities
 * Parses TipTap HTML into segments that the reading page can render.
 *
 * Segment types:
 *   { type: 'text', text }
 *   { type: 'mention', userId, nickname }  — @someone; tapping adds them as a friend
 *   { type: 'linkTag', label, url }        — #link tag; tapping navigates
 *   { type: 'image', src, alt }            — image
 */

/**
 * Report whether `content` looks like HTML markup.
 *
 * @param {*} content - Raw content; anything other than a non-empty string yields false.
 * @returns {boolean} True when the string contains at least one tag-like token:
 *   `<`, an ASCII letter, any run of characters other than `>`, then `>`.
 */
function isHtmlContent(content) {
  if (typeof content !== 'string' || content === '') return false
  // A successful match already implies that both '<' and '>' are present,
  // and surrounding whitespace cannot affect it, so no trim/includes pre-checks are needed.
  return /<[a-z][^>]*>/i.test(content)
}

/**
 * Decode common HTML entities in a string.
 *
 * Handles the named entities nbsp, amp, lt, gt, quot and apos, plus decimal
 * (`&#39;`) and hexadecimal (`&#x27;`) numeric references. Everything is
 * decoded in a single pass, so text produced by one replacement is never
 * decoded again (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {string} str - Text that may contain HTML entities.
 * @returns {string} The decoded text; null/undefined input yields ''.
 */
function decodeEntities(str) {
  if (str == null) return ''
  // Non-breaking spaces become ordinary spaces so that later trim() calls treat them as whitespace.
  const NAMED = { nbsp: ' ', amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" }
  const entityRe = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(nbsp|amp|lt|gt|quot|apos));/g
  return String(str).replace(entityRe, (whole, dec, hex, name) => {
    if (name !== undefined) return NAMED[name]
    const code = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16)
    if (code === 0xA0) return ' '
    // Leave references that do not name a usable code point untouched.
    const isSurrogate = code >= 0xD800 && code <= 0xDFFF
    if (code === 0 || code > 0x10FFFF || isSurrogate) return whole
    return String.fromCodePoint(code)
  })
}

/**
 * Parse one HTML block (a single logical line) into an ordered array of segments.
 *
 * Recognised inline elements:
 *   - span with data-type="mention"  -> { type: 'mention', userId, nickname }
 *   - span with data-type="linkTag"  -> { type: 'linkTag', label, url, tagType, pagePath, tagId, appId, mpKey }
 *   - legacy anchor `<a href>#label</a>` -> linkTag with empty tagType/pagePath/tagId/appId/mpKey
 *   - img                             -> { type: 'image', src, alt }
 * Any other markup is stripped and the remaining text becomes `text` segments
 * (whitespace-only runs are dropped).
 *
 * Attribute values and inner text are entity-decoded via decodeEntities, so a
 * serialized `&amp;` inside a URL reaches the caller as `&`.
 *
 * @param {string} block - HTML fragment without block-level line breaks.
 * @returns {Array<object>} Segments in document order; empty for empty input.
 */
function parseBlockToSegments(block) {
  const segs = []
  if (typeof block !== 'string' || !block) return segs

  // Strip tags first, then decode, so an escaped "&lt;b&gt;" survives as literal text.
  const toText = html => decodeEntities(html.replace(/<[^>]+>/g, ''))

  // Read a double-quoted attribute from an opening tag; null when the attribute is absent.
  // The leading \s keeps e.g. "src" from matching inside "data-src".
  const readAttr = (openTag, name) => {
    const hit = openTag.match(new RegExp('\\s' + name + '="([^"]*)"', 'i'))
    return hit ? decodeEntities(hit[1]) : null
  }

  // One combined pattern for every inline element; groups 1/2 are the legacy anchor's href and text.
  const tokenRe = /<span[^>]*data-type="mention"[^>]*>[\s\S]*?<\/span>|<span[^>]*data-type="linkTag"[^>]*>[\s\S]*?<\/span>|<a[^>]*href="([^"]*)"[^>]*>(#[^<]*)<\/a>|<img[^>]*\/?>/gi
  let lastEnd = 0
  let m

  while ((m = tokenRe.exec(block)) !== null) {
    // Plain text that precedes this element
    const before = toText(block.slice(lastEnd, m.index))
    if (before.trim()) segs.push({ type: 'text', text: before })

    const tag = m[0]
    // Attributes are only looked up in the opening tag, never in the element's text content.
    const openTag = tag.slice(0, tag.indexOf('>') + 1)
    const dataType = (readAttr(openTag, 'data-type') || '').toLowerCase()

    if (dataType === 'mention') {
      // @mention — TipTap mention span
      const id = readAttr(openTag, 'data-id')
      const label = readAttr(openTag, 'data-label')
      const userId = id !== null ? id.trim() : ''
      const nickname = label !== null ? label.trim() : toText(tag).replace(/^@/, '').trim()
      if (userId || nickname) segs.push({ type: 'mention', userId, nickname })
    } else if (dataType === 'linktag') {
      // #linkTag — custom span format carrying its routing data in data-* attributes
      segs.push({
        type: 'linkTag',
        label: toText(tag).replace(/^#/, '').trim() || '#',
        url: readAttr(openTag, 'data-url') ?? '',
        tagType: readAttr(openTag, 'data-tag-type') ?? 'url',
        pagePath: readAttr(openTag, 'data-page-path') ?? '',
        tagId: readAttr(openTag, 'data-tag-id') ?? '',
        appId: readAttr(openTag, 'data-app-id') ?? '',
        mpKey: readAttr(openTag, 'data-mp-key') ?? ''
      })
    } else if (/^<a\s/i.test(tag)) {
      // #linkTag — legacy <a href> format (url may be empty).
      // There is no tagType here; the segment keeps the same shape as the span format,
      // with the routing fields left empty.
      const label = decodeEntities(m[2] || '').replace(/^#/, '').trim()
      segs.push({
        type: 'linkTag',
        label: label || '#',
        url: decodeEntities(m[1] || ''),
        tagType: '',
        pagePath: '',
        tagId: '',
        appId: '',
        mpKey: ''
      })
    } else if (/^<img[\s/>]/i.test(tag)) {
      // Image — skipped entirely when it has no src attribute
      const src = readAttr(openTag, 'src')
      if (src !== null) {
        segs.push({ type: 'image', src, alt: readAttr(openTag, 'alt') ?? '' })
      }
    }

    lastEnd = m.index + tag.length
  }

  // Plain text after the last element
  const after = toText(block.slice(lastEnd))
  if (after.trim()) segs.push({ type: 'text', text: after })

  return segs
}

/**
 * Split HTML into display lines and parse each line into rich segments.
 *
 * Block-level tags (p, div, br, h1-h6, blockquote, ul, ol, li) are turned into
 * line breaks first; inline tags are kept so parseBlockToSegments can
 * interpret them. A line consisting of a single image contributes an empty
 * string to `lines`. When `config` lists persons or link tags, every text
 * segment is additionally run through matchLineToSegments so hand-typed
 * @name / #tag text is recognised too.
 *
 * @param {string} html - TipTap HTML.
 * @param {object} [config] - { persons: [], linkTags: [] } used for @name / #tag auto-matching.
 * @returns {{ lines: string[], segments: Array<Array<object>> }} Parallel arrays:
 *   `lines[i]` is the plain text of the line whose segments are `segments[i]`.
 *   Non-string or empty input yields two empty arrays.
 */
function parseHtmlToSegments(html, config) {
  const lines = []
  const segments = []
  if (typeof html !== 'string' || !html) return { lines, segments }

  // 1. Turn block-level tags into line breaks; keep inline tags for the per-line parser.
  //    Order matters: the "</p><p>" pair must be handled before the single p rules.
  const blockRules = [
    [/<\/p>\s*<p[^>]*>/gi, '\n\n'],
    [/<p[^>]*>/gi, ''],
    [/<\/p>/gi, '\n'],
    [/<div[^>]*>/gi, ''],
    [/<\/div>/gi, '\n'],
    [/<br\s*\/?>/gi, '\n'],
    [/<\/?h[1-6][^>]*>/gi, '\n'],
    [/<\/?blockquote[^>]*>/gi, '\n'],
    [/<\/?ul[^>]*>/gi, '\n'],
    [/<\/?ol[^>]*>/gi, '\n'],
    [/<li[^>]*>/gi, '• '],
    [/<\/li>/gi, '\n']
  ]
  const text = blockRules.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), html)

  const autoMatch = Boolean(config && (config.persons?.length || config.linkTags?.length))

  // 2. Parse line by line
  for (const block of text.split(/\n+/)) {
    if (!block.trim()) continue

    let blockSegs = parseBlockToSegments(block)
    if (!blockSegs.length) continue

    // A line holding only an image stands alone, with an empty plain-text line.
    if (blockSegs.length === 1 && blockSegs[0].type === 'image') {
      lines.push('')
      segments.push(blockSegs)
      continue
    }

    // Re-scan text segments for @name / #tag that were typed by hand rather than inserted via TipTap.
    if (autoMatch) {
      blockSegs = blockSegs.flatMap(seg =>
        seg.type === 'text' && seg.text ? matchLineToSegments(seg.text, config) : [seg]
      )
    }

    // Plain text of the line, kept in `lines` as the fallback rendering.
    lines.push(decodeEntities(block.replace(/<[^>]+>/g, '')).trim())
    segments.push(blockSegs)
  }

  return { lines, segments }
}

/**
 * Remove Markdown formatting markers while keeping the text they wrap.
 *
 * Handles ATX headings, bold and italic (asterisk and underscore forms),
 * strikethrough, inline code, blockquote markers, `---` rules, bullet
 * markers (rewritten to "• ") and ordered-list numbers (dropped).
 *
 * Emphasis markers are only removed where they delimit text:
 *   - an opening marker must not be followed by whitespace and a closing
 *     marker must not be preceded by whitespace, so "2 * 3 * 4" and a
 *     leading "* " bullet are left alone;
 *   - underscore markers must not sit inside a word, so snake_case
 *     identifiers and URLs such as foo_bar_baz keep their underscores.
 *
 * @param {string} text - Plain text / Markdown.
 * @returns {string} The text without formatting markers; falsy input is returned unchanged.
 */
function stripMarkdownFormatting(text) {
  if (!text) return text
  let s = text
  // Headings
  s = s.replace(/^#{1,6}\s+/gm, '')
  // Bold must be handled before italic so "**" is not read as two italic markers.
  s = s.replace(/\*\*(.+?)\*\*/g, '$1')
  s = s.replace(/(?<!\w)__(.+?)__(?!\w)/g, '$1')
  // Italic
  s = s.replace(/(?<!\*)\*(?![\s*])(.+?)(?<![\s*])\*(?!\*)/g, '$1')
  s = s.replace(/(?<!\w)_(?![\s_])(.+?)(?<![\s_])_(?!\w)/g, '$1')
  // Strikethrough and inline code
  s = s.replace(/~~(.+?)~~/g, '$1')
  s = s.replace(/`([^`]+)`/g, '$1')
  // Blockquote markers and horizontal rules
  s = s.replace(/^>\s+/gm, '')
  s = s.replace(/^---$/gm, '')
  // List markers
  s = s.replace(/^\* /gm, '• ')
  s = s.replace(/^- /gm, '• ')
  s = s.replace(/^\d+\.\s/gm, '')
  return s
}

/**
 * Scan one line of plain text for @person and #tag references and split it
 * into segments.
 *
 * Names, labels and aliases are matched case-insensitively, longest first,
 * and must directly follow the marker character. Both the ASCII markers
 * (@, #) and their full-width forms (U+FF20, U+FF03) are recognised. When
 * several entries share a key, the first one in config order wins.
 *
 * @param {string} line - Plain text of a single line.
 * @param {object} [config] - {
 *     persons:  [{ personId, name, aliases }],
 *     linkTags: [{ tagId, label, type, pagePath, mpKey, url, aliases }]
 *   } where `aliases` is a comma-separated string or an array of strings.
 * @returns {Array<object>} text / mention / linkTag segments in order. With no
 *   usable config, or no match, the result is a single text segment holding `line`.
 */
function matchLineToSegments(line, config) {
  const asPlainText = () => [{ type: 'text', text: line }]
  if (!config || (!config.persons?.length && !config.linkTags?.length)) return asPlainText()

  const normalize = s => (s == null ? '' : String(s).trim().toLowerCase())
  const aliasList = aliases => {
    if (Array.isArray(aliases)) return aliases
    return typeof aliases === 'string' && aliases ? aliases.split(',') : []
  }

  // A Map (not a plain object) so names like "constructor" cannot collide with Object.prototype members.
  const buildIndex = (entries, nameField) => {
    const index = new Map()
    for (const entry of entries || []) {
      if (!entry) continue
      for (const key of [entry[nameField], ...aliasList(entry.aliases)].map(normalize)) {
        if (key && !index.has(key)) index.set(key, entry)
      }
    }
    return index
  }
  const personIndex = buildIndex(config.persons, 'name')
  const tagIndex = buildIndex(config.linkTags, 'label')
  if (!personIndex.size && !tagIndex.size) return asPlainText()

  const escapeRe = s => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  // Longest keys first so "Tommy" is preferred over its prefix "Tom".
  const alternation = index =>
    [...index.keys()].sort((a, b) => b.length - a.length).map(escapeRe).join('|')

  const parts = []
  if (personIndex.size) parts.push('[@\\uFF20](' + alternation(personIndex) + ')')
  if (tagIndex.size) parts.push('[#\\uFF03](' + alternation(tagIndex) + ')')
  const pattern = new RegExp(parts.join('|'), 'gi')

  const segs = []
  let lastEnd = 0
  for (const m of line.matchAll(pattern)) {
    const full = m[0]
    if (m.index > lastEnd) {
      segs.push({ type: 'text', text: line.slice(lastEnd, m.index) })
    }
    const prefix = full[0]
    const body = full.slice(1)
    if (prefix === '@' || prefix === '\uFF20') {
      const person = personIndex.get(normalize(body))
      if (person) {
        segs.push({ type: 'mention', userId: person.personId || '', nickname: person.name || body })
      } else {
        // Defensive fallback: keep the matched text as-is if the lookup misses.
        segs.push({ type: 'text', text: full })
      }
    } else {
      const tag = tagIndex.get(normalize(body))
      if (tag) {
        segs.push({
          type: 'linkTag',
          label: tag.label || body,
          url: tag.url || '',
          tagType: tag.type || 'url',
          pagePath: tag.pagePath || '',
          tagId: tag.tagId || '',
          appId: tag.appId || '',
          mpKey: tag.mpKey || ''
        })
      } else {
        segs.push({ type: 'text', text: full })
      }
    }
    lastEnd = m.index + full.length
  }
  if (lastEnd < line.length) {
    segs.push({ type: 'text', text: line.slice(lastEnd) })
  }
  return segs.length ? segs : asPlainText()
}

/**
 * Parse plain text / Markdown line by line.
 *
 * Markdown markers are stripped via stripMarkdownFormatting, the text is split
 * on newlines, each line is trimmed and empty lines are dropped; every
 * remaining line is then passed to matchLineToSegments.
 *
 * @param {string} text - Plain text or Markdown.
 * @param {object} [config] - { persons: [], linkTags: [] } forwarded to matchLineToSegments.
 * @returns {{ lines: string[], segments: Array<Array<object>> }} Parallel arrays,
 *   one entry per non-empty line; both empty when there is no text.
 */
function parsePlainTextToSegments(text, config) {
  // Tolerate a missing value instead of throwing on .split
  const cleaned = String(stripMarkdownFormatting(text) ?? '')
  const lines = cleaned
    .split('\n')
    .map(l => l.trim())
    .filter(l => l.length > 0)
  const segments = lines.map(line => matchLineToSegments(line, config))
  return { lines, segments }
}

/**
 * Remove leftover Markdown image references from text.
 *
 * Two passes:
 *   1. a file name ending in an image extension that is glued directly to a
 *      Markdown image, e.g. `image.png![image.png](url)` — removed together;
 *   2. any remaining Markdown image `![alt](url)`.
 *
 * @param {string} text - Raw content.
 * @returns {string} The text without Markdown image references; falsy input is returned unchanged.
 */
function stripOrphanImageRefs(text) {
  if (!text) return text
  // The lookbehind pins the match start to the beginning of a whitespace-delimited
  // token (or right after a ")" such as the end of a previous image). This avoids
  // retrying the greedy file-name scan from every character of a long
  // whitespace-free run (for example a base64 data URL).
  const withoutNamedRefs = text.replace(
    /(?<![^\s)])[^\s]*\.(?:png|jpg|jpeg|gif|webp|svg|bmp)!\[[^\]]*\]\([^)]*\)/gi,
    ''
  )
  return withoutNamedRefs.replace(/!\[[^\]]*\]\([^)]*\)/g, '')
}

/**
 * Parse raw content into the line/segment structure used by the reading page.
 *
 * Leftover Markdown image references are removed first; the result is then
 * parsed as HTML when isHtmlContent says so, otherwise as plain text / Markdown.
 *
 * @param {string} rawContent - Stored content (TipTap HTML or plain text / Markdown).
 * @param {object} [config] - { persons: [], linkTags: [] } forwarded to the parsers.
 * @returns {{ lines: string[], segments: Array<Array<object>> }} Parsed content;
 *   both arrays are empty when rawContent is empty or not a string.
 */
function parseContent(rawContent, config) {
  if (!rawContent || typeof rawContent !== 'string') {
    return { lines: [], segments: [] }
  }
  const content = stripOrphanImageRefs(rawContent)
  return isHtmlContent(content)
    ? parseHtmlToSegments(content, config)
    : parsePlainTextToSegments(content, config)
}

module.exports = {
|
|
|
|
|
|
parseContent,
|
|
|
|
|
|
isHtmlContent
|
|
|
|
|
|
}
|