// File: soul-yongping/miniprogram/utils/contentParser.js
// (169 lines, 5.9 KiB, JavaScript)

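/**
 * Soul创业派对 - content parsing utilities.
 * Parses TipTap HTML into segments that the reading page can render.
 *
 * Segment types:
 *   { type: 'text', text }
 *   { type: 'mention', userId, nickname }                       - @someone; tapping adds them as a friend
 *   { type: 'linkTag', label, url, tagType, pagePath, tagId }   - #link tag; tapping navigates
 *   { type: 'image', src, alt }                                 - image
 */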
/**
 * Tell whether a value looks like HTML markup.
 *
 * @param {*} content Value to inspect; anything that is not a non-empty string yields false.
 * @returns {boolean} True when the trimmed string contains both angle brackets
 *   and at least one tag-like token whose name starts with a letter.
 */
function isHtmlContent(content) {
  if (typeof content !== 'string' || content === '') {
    return false
  }
  const candidate = content.trim()
  const hasBothBrackets = candidate.indexOf('<') !== -1 && candidate.indexOf('>') !== -1
  // The regex is only consulted once both bracket characters are known to exist.
  return hasBothBrackets ? /<[a-z][^>]*>/i.test(candidate) : false
}
/** Entities understood by decodeEntities, mapped to their literal characters. */
const HTML_ENTITY_TABLE = {
  '&nbsp;': ' ',
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'"
}

/**
 * Decode the common HTML entities (&nbsp; &amp; &lt; &gt; &quot; &#39;).
 *
 * Decoding happens in a single pass so that the output of one replacement is
 * never re-scanned: "&amp;lt;" becomes the literal text "&lt;", not "<".
 * Entities outside the table are left untouched.
 *
 * @param {string} str Text that may contain HTML entities.
 * @returns {string} The text with the known entities replaced.
 */
function decodeEntities(str) {
  return str.replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, (entity) => HTML_ENTITY_TABLE[entity])
}
/**
 * Parse a single HTML block (one display line) into an ordered array of segments.
 *
 * Recognized inline elements:
 *   - <span data-type="mention">  -> { type: 'mention', userId, nickname }
 *   - <span data-type="linkTag">  -> { type: 'linkTag', label, url, tagType, pagePath, tagId }
 *   - <a href="...">#label</a>    -> legacy linkTag (tagType / pagePath / tagId are empty)
 *   - <img src="...">             -> { type: 'image', src, alt }
 * Text between those elements becomes { type: 'text', text } with any other
 * tags stripped and entities decoded; whitespace-only text is dropped.
 *
 * Attribute values and tag labels are entity-decoded as well, so an "&amp;"
 * inside a URL or label is delivered as a plain "&", consistent with how
 * text segments are handled.
 *
 * @param {string} block HTML fragment for one line.
 * @returns {Array<Object>} Segments in document order.
 */
function parseBlockToSegments(block) {
  const segs = []

  // Remove every tag, then decode entities, leaving displayable text.
  const toPlainText = (html) => decodeEntities(html.replace(/<[^>]+>/g, ''))

  // Read a double-quoted attribute; returns null when the attribute is absent.
  // The leading \s stops `src` from matching the tail of `data-src`.
  const readAttr = (tag, name) => {
    const found = tag.match(new RegExp(`\\s${name}="([^"]*)"`))
    return found ? decodeEntities(found[1]) : null
  }

  // Same as readAttr, but substitutes a fallback for a missing attribute.
  const readAttrOr = (tag, name, fallback) => {
    const value = readAttr(tag, name)
    return value === null ? fallback : value
  }

  // Emit a text segment unless the text is empty or whitespace-only.
  const pushText = (html) => {
    const text = toPlainText(html)
    if (text.trim()) segs.push({ type: 'text', text })
  }

  // One combined pattern for all inline elements; groups 1 and 2 are only
  // populated by the legacy <a> alternative (href and "#label" text).
  const tokenRe = /<span[^>]*data-type="mention"[^>]*>[\s\S]*?<\/span>|<span[^>]*data-type="linkTag"[^>]*>[\s\S]*?<\/span>|<a[^>]*href="([^"]*)"[^>]*>(#[^<]*)<\/a>|<img[^>]*\/?>/gi
  let lastEnd = 0
  let m
  while ((m = tokenRe.exec(block)) !== null) {
    // Plain text preceding this token.
    pushText(block.slice(lastEnd, m.index))

    const tag = m[0]
    if (/data-type="mention"/i.test(tag)) {
      // @mention: TipTap mention span.
      const userId = readAttrOr(tag, 'data-id', '').trim()
      const label = readAttr(tag, 'data-label')
      // Fall back to the visible text (minus the leading "@") only when the
      // data-label attribute is missing altogether.
      const nickname = label === null
        ? toPlainText(tag).replace(/^@/, '').trim()
        : label.trim()
      if (userId || nickname) segs.push({ type: 'mention', userId, nickname })
    } else if (/data-type="linkTag"/i.test(tag)) {
      // #linkTag: custom span carrying data-url / data-tag-type / data-page-path / data-tag-id.
      const label = toPlainText(tag).replace(/^#/, '').trim()
      segs.push({
        type: 'linkTag',
        label: label || '#',
        url: readAttrOr(tag, 'data-url', ''),
        tagType: readAttrOr(tag, 'data-tag-type', 'url'),
        pagePath: readAttrOr(tag, 'data-page-path', ''),
        tagId: readAttrOr(tag, 'data-tag-id', '')
      })
    } else if (/^<a\s/i.test(tag)) {
      // #linkTag, legacy <a href> format (the url may be empty).
      const url = decodeEntities(m[1] || '')
      const label = decodeEntities(m[2] || '').replace(/^#/, '').trim()
      // The legacy format has no tagType; per the original note, the tap
      // handler falls back to matching the label against cached linkTag config.
      segs.push({ type: 'linkTag', label: label || '#', url, tagType: '', pagePath: '', tagId: '' })
    } else if (/^<img\s/i.test(tag)) {
      // Image: skipped entirely when there is no src attribute.
      const src = readAttr(tag, 'src')
      if (src !== null) {
        segs.push({ type: 'image', src, alt: readAttrOr(tag, 'alt', '') })
      }
    }
    lastEnd = m.index + tag.length
  }

  // Plain text following the last token.
  pushText(block.slice(lastEnd))
  return segs
}
/**
 * Split HTML into display lines and parse each line into segments.
 *
 * `lines` holds the plain text of each line and `segments` holds the parsed
 * rich segments for the same line, so the two arrays stay index-aligned.
 * A line consisting of a single image gets an empty string in `lines`.
 *
 * @param {string} html HTML source.
 * @returns {{ lines: string[], segments: Array<Array<Object>> }}
 */
function parseHtmlToSegments(html) {
  // Step 1: turn block-level tags into newlines, applied strictly in this
  // order; inline tags are left in place for the per-line parser.
  const blockRules = [
    [/<\/p>\s*<p[^>]*>/gi, '\n\n'],
    [/<p[^>]*>/gi, ''],
    [/<\/p>/gi, '\n'],
    [/<div[^>]*>/gi, ''],
    [/<\/div>/gi, '\n'],
    [/<br\s*\/?>/gi, '\n'],
    [/<\/?h[1-6][^>]*>/gi, '\n'],
    [/<\/?blockquote[^>]*>/gi, '\n'],
    [/<\/?ul[^>]*>/gi, '\n'],
    [/<\/?ol[^>]*>/gi, '\n'],
    [/<li[^>]*>/gi, '• '],
    [/<\/li>/gi, '\n']
  ]
  let normalized = html
  for (const [pattern, replacement] of blockRules) {
    normalized = normalized.replace(pattern, replacement)
  }

  // Step 2: parse line by line, skipping blank lines and lines that yield no segments.
  const lines = []
  const segments = []
  normalized.split(/\n+/).forEach((piece) => {
    if (piece.trim() === '') return
    const parsed = parseBlockToSegments(piece)
    if (parsed.length === 0) return

    // An image-only line stands alone and contributes no preview text;
    // otherwise the tag-stripped text is kept for plain-text fallback display.
    const isLoneImage = parsed.length === 1 && parsed[0].type === 'image'
    lines.push(isLoneImage ? '' : decodeEntities(piece.replace(/<[^>]+>/g, '')).trim())
    segments.push(parsed)
  })
  return { lines, segments }
}
/**
 * Parse tag-free text line by line.
 *
 * Each line is trimmed, empty lines are dropped, and every remaining line
 * becomes a single text segment.
 *
 * @param {string} text Plain text, lines separated by "\n".
 * @returns {{ lines: string[], segments: Array<Array<Object>> }}
 */
function parsePlainTextToSegments(text) {
  const lines = []
  const segments = []
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim()
    if (line === '') continue
    lines.push(line)
    segments.push([{ type: 'text', text: line }])
  }
  return { lines, segments }
}
/**
 * Parse raw content into the lines / segments structure used by the reading page.
 *
 * Content detected as HTML goes through the HTML parser; anything else is
 * treated as plain text. Empty or non-string input yields empty arrays.
 *
 * @param {string} rawContent
 * @returns {{ lines: string[], segments: Array<Array<segment>> }}
 */
function parseContent(rawContent) {
  const isUsable = typeof rawContent === 'string' && rawContent.length > 0
  if (!isUsable) {
    return { lines: [], segments: [] }
  }
  const parse = isHtmlContent(rawContent) ? parseHtmlToSegments : parsePlainTextToSegments
  return parse(rawContent)
}
// Public API of this module: the top-level parser and the HTML detector.
// The other helpers defined above are not exported here.
module.exports = {
parseContent,
isHtmlContent
}