Files
soul-yongping/miniprogram/utils/contentParser.js
卡若 5724fba877 feat: 小程序超级个体/个人资料/CKB获客;VIP列表展示过滤;管理端与API联调
- 超级个体:去掉首位特例;列表仅展示有头像且非微信默认昵称(vip.go)
- 个人资料:居中头像、低调联系方式、点头像优先走存客宝 lead(ckbLeadToken)
- 阅读页分享朋友圈复制与 toast 去重
- soul-api: miniprogram users 带 ckbLeadToken;其它 handler 与路由调整
- 脚本:content_upload、miniprogram 上传辅助等

Made-with: Cursor
2026-03-22 08:34:28 +08:00

338 lines
12 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

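/**
* 卡若创业派对 (Karuo Startup Party) - content parsing utilities
* Parses TipTap HTML (or plain text / Markdown) into segments the reading page can render.
*
* Segment shapes produced in this file:
* { type: 'text', text }
* { type: 'mention', userId, nickname, mentionDisplay } — @someone; tap to add as a friend
* { type: 'linkTag', label, url, tagType, pagePath, tagId[, appId, mpKey] } — #link tag; tap to navigate
* { type: 'image', src, alt } — image
*/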
/**
 * Tell whether a content value looks like HTML rather than plain text.
 *
 * @param {*} content - Raw content; anything other than a non-empty string yields false.
 * @returns {boolean} True when the trimmed string contains both "<" and ">" and
 *   at least one tag-like token whose name starts with an ASCII letter (e.g. "<p>").
 */
function isHtmlContent(content) {
  if (typeof content !== 'string' || !content) return false
  const candidate = content.trim()
  const hasBrackets = candidate.indexOf('<') !== -1 && candidate.indexOf('>') !== -1
  if (!hasBrackets) return false
  return /<[a-z][^>]*>/i.test(candidate)
}
/**
 * Decode the common HTML entities: &nbsp; &amp; &lt; &gt; &quot; &#39;.
 *
 * Decoding happens in a single pass so that text produced by one replacement is
 * never decoded again. A chain that handles "&amp;" first turns "&amp;lt;"
 * (the literal text "&lt;") into "<", which is wrong.
 *
 * @param {string} str - Text that may contain entities.
 * @returns {string} The text with the entities above replaced by their characters.
 */
function decodeEntities(str) {
  const entityText = {
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
  }
  return str.replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, (entity) => entityText[entity])
}
/**
 * Normalize a value for single-line display (nicknames, #tag labels, title-like fields).
 *
 * Entities are decoded, <br> tags and line breaks become spaces, and every run of
 * whitespace (including NBSP, zero-width characters, BOM and the ideographic
 * space) collapses to a single space; the result is trimmed.
 *
 * @param {*} s - Value to clean. Falsy values other than the number 0 yield ''.
 * @returns {string} The cleaned single-line string.
 */
function cleanSingleLineField(s) {
  if (s !== 0 && !s) return ''
  const decoded = decodeEntities(String(s))
  const withoutBreaks = decoded
    .replace(/<br\s*\/?>/gi, ' ')
    .replace(/\r\n|\r|\n/g, ' ')
  const collapsed = withoutBreaks.replace(/[\s\u00a0\u200b\u200c\u200d\ufeff\u3000]+/g, ' ')
  return collapsed.trim()
}
/**
 * Clean an @mention nickname for display.
 * Delegates to cleanSingleLineField, so surrounding whitespace, zero-width
 * characters and ideographic spaces are removed and inner line breaks / <br>
 * are merged into single spaces.
 *
 * @param {*} n - Raw nickname.
 * @returns {string} The cleaned nickname.
 */
function cleanMentionNickname(n) {
  const nickname = cleanSingleLineField(n)
  return nickname
}
/**
 * Drop a trailing "@" (and any whitespace / zero-width space after it) from the
 * plain text that precedes a mention node, so a hand-typed "@" followed by a
 * mention does not render as "@@name".
 *
 * @param {string} before - Plain text located right before a mention.
 * @returns {string} The text without the trailing "@".
 */
function stripTrailingAtForMention(before) {
  const trailingAt = /[@][\s\u00a0\u200b]*$/u
  return before.replace(trailingAt, '')
}
/**
 * Parse one HTML block (a single logical line) into an array of segments.
 *
 * Recognised inline tokens:
 *   <span data-type="mention" ...>  -> { type: 'mention', userId, nickname, mentionDisplay }
 *   <span data-type="linkTag" ...>  -> { type: 'linkTag', label, url, tagType, pagePath, tagId, appId, mpKey }
 *   <a href="...">#label</a>        -> legacy linkTag { label, url, tagType: '', pagePath: '', tagId: '' }
 *   <img ...>                       -> { type: 'image', src, alt }
 * Text between tokens becomes { type: 'text', text } after remaining tags are
 * stripped and entities decoded; whitespace-only text is dropped.
 *
 * Attribute values are entity-decoded, because serialized HTML escapes "&" as
 * "&amp;" inside attributes and URLs with query strings would otherwise keep it.
 *
 * @param {string} block - HTML for one block, without block-level tags.
 * @returns {Array<object>} Segments in document order.
 */
function parseBlockToSegments(block) {
  const segs = []

  // Raw value of a double-quoted attribute, or null when absent. The name must not be
  // the tail of a longer attribute name (e.g. "src" must not match "data-src").
  const rawAttr = (tag, name) => {
    const hit = tag.match(new RegExp('(?:^|[^\\w-])' + name + '="([^"]*)"'))
    return hit ? hit[1] : null
  }
  // Entity-decoded attribute value, or `fallback` when the attribute is absent.
  const attr = (tag, name, fallback = '') => {
    const raw = rawAttr(tag, name)
    return raw === null ? fallback : decodeEntities(raw)
  }
  const toPlainText = (html) => decodeEntities(html.replace(/<[^>]+>/g, ''))

  // One combined pattern for every inline token; groups 1 and 2 belong to the legacy <a> form.
  const tokenRe = /<span[^>]*data-type="mention"[^>]*>[\s\S]*?<\/span>|<span[^>]*data-type="linkTag"[^>]*>[\s\S]*?<\/span>|<a[^>]*href="([^"]*)"[^>]*>(#[^<]*)<\/a>|<img[^>]*\/?>/gi
  let lastEnd = 0
  let m
  while ((m = tokenRe.exec(block)) !== null) {
    const tag = m[0]
    const isMention = /data-type="mention"/i.test(tag)

    // Plain text before the token; a hand-typed "@" right before a mention is removed.
    let before = toPlainText(block.slice(lastEnd, m.index))
    if (isMention) before = stripTrailingAtForMention(before)
    if (before.trim()) segs.push({ type: 'text', text: before })

    if (isMention) {
      // TipTap mention span. data-label is preferred; when it is missing or blank, use the
      // span's inner text without its leading "@". cleanMentionNickname decodes entities
      // itself, so the raw attribute value is passed to avoid decoding twice.
      const userId = attr(tag, 'data-id').trim()
      const innerName = tag.replace(/<[^>]+>/g, '').replace(/^[@]\s*/, '')
      const nickname =
        cleanMentionNickname(rawAttr(tag, 'data-label') || '') || cleanMentionNickname(innerName)
      if (userId || nickname) {
        segs.push({ type: 'mention', userId, nickname, mentionDisplay: '@' + nickname })
      }
    } else if (/data-type="linkTag"/i.test(tag)) {
      // Custom span format: data-url / data-tag-type / data-page-path / data-tag-id /
      // data-app-id / data-mp-key. tagType defaults to 'url' when the attribute is absent.
      const label = cleanSingleLineField(tag.replace(/<[^>]+>/g, '').replace(/^#/, ''))
      segs.push({
        type: 'linkTag',
        label: label || '#',
        url: attr(tag, 'data-url'),
        tagType: attr(tag, 'data-tag-type', 'url'),
        pagePath: attr(tag, 'data-page-path'),
        tagId: attr(tag, 'data-tag-id'),
        appId: attr(tag, 'data-app-id'),
        mpKey: attr(tag, 'data-mp-key'),
      })
    } else if (/^<a\s/i.test(tag)) {
      // Legacy <a href> link tag (href may be empty): m[1] = href, m[2] = inner text starting with "#".
      // No tagType is available here; per the original note, the tap handler falls back to
      // matching the label against cached linkTags configuration.
      const url = decodeEntities(m[1] || '')
      const label = cleanSingleLineField((m[2] || '').replace(/^#/, ''))
      segs.push({ type: 'linkTag', label: label || '#', url, tagType: '', pagePath: '', tagId: '' })
    } else if (/^<img\s/i.test(tag)) {
      // Image: emitted only when a src attribute is present.
      const src = attr(tag, 'src', null)
      if (src !== null) {
        segs.push({ type: 'image', src, alt: attr(tag, 'alt') })
      }
    }
    lastEnd = m.index + tag.length
  }

  // Trailing plain text after the last token.
  const after = toPlainText(block.slice(lastEnd))
  if (after.trim()) segs.push({ type: 'text', text: after })
  return segs
}
/**
 * Parse HTML into parallel arrays: `lines` (plain text per block) and
 * `segments` (rich segments per block).
 *
 * @param {string} html - HTML content.
 * @param {object} [config] - { persons: [], linkTags: [] }; when either list is
 *   non-empty, text segments are re-scanned for hand-typed @name / #tag.
 * @returns {{ lines: string[], segments: Array<Array<object>> }} One entry per
 *   non-empty block; a block holding only an image gets '' as its line.
 */
function parseHtmlToSegments(html, config) {
  // Step 1: turn block-level tags into newlines (in this exact order) while
  // keeping inline tags for the per-block parser.
  const blockRules = [
    [/<\/p>\s*<p[^>]*>/gi, '\n\n'],
    [/<p[^>]*>/gi, ''],
    [/<\/p>/gi, '\n'],
    [/<div[^>]*>/gi, ''],
    [/<\/div>/gi, '\n'],
    [/<br\s*\/?>/gi, '\n'],
    [/<\/?h[1-6][^>]*>/gi, '\n'],
    [/<\/?blockquote[^>]*>/gi, '\n'],
    [/<\/?ul[^>]*>/gi, '\n'],
    [/<\/?ol[^>]*>/gi, '\n'],
    [/<li[^>]*>/gi, '• '],
    [/<\/li>/gi, '\n'],
  ]
  const flattened = blockRules.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    html
  )

  const autoMatch = Boolean(config && (config.persons?.length || config.linkTags?.length))
  const lines = []
  const segments = []

  // Step 2: parse block by block.
  flattened.split(/\n+/).forEach((block) => {
    if (!block.trim()) return
    const parsed = parseBlockToSegments(block)
    if (parsed.length === 0) return

    // A block that is just one image stands alone, with an empty text line.
    const isLoneImage = parsed.length === 1 && parsed[0].type === 'image'
    if (isLoneImage) {
      lines.push('')
      segments.push(parsed)
      return
    }

    // Re-scan text segments for @name / #tag typed by hand instead of inserted via TipTap.
    const finalSegs = autoMatch
      ? parsed.flatMap((seg) =>
          seg.type === 'text' && seg.text ? matchLineToSegments(seg.text, config) : [seg]
        )
      : parsed

    // Plain text of the block, kept in `lines` as the fallback display.
    lines.push(decodeEntities(block.replace(/<[^>]+>/g, '')).trim())
    segments.push(finalSegs)
  })

  return { lines, segments }
}
/**
 * Remove Markdown formatting markers while keeping the text:
 * headings (#), **bold**, __bold__, *italic*, _italic_, ~~strike~~, `code`,
 * block quotes (>), horizontal rules (---), list bullets (turned into "• ")
 * and ordered-list numbers.
 *
 * The single-delimiter italic rules are written without lookbehind assertions.
 * Older JavaScriptCore builds (iOS before 16.4), which run the mini-program
 * logic layer on iOS, reject lookbehind when the file is parsed, which would
 * break the whole module. The character before the opening delimiter is
 * captured and written back instead.
 *
 * @param {string} text - Plain text / Markdown.
 * @returns {string} Text without the markers; falsy input is returned unchanged.
 */
function stripMarkdownFormatting(text) {
  if (!text) return text
  let s = text
  s = s.replace(/^#{1,6}\s+/gm, '')
  s = s.replace(/\*\*(.+?)\*\*/g, '$1')
  s = s.replace(/__(.+?)__/g, '$1')
  // *italic*: the opening "*" is not preceded or followed by "*"; the content starts and
  // ends with a character that is neither "*" nor a line terminator; the closing "*" is
  // not followed by "*". $1 restores the character consumed before the opening "*".
  s = s.replace(/(^|[^*])\*(?!\*)([^*\r\n\u2028\u2029](?:.*?[^*\r\n\u2028\u2029])??)\*(?!\*)/g, '$1$2')
  // _italic_: same rule with "_" as the delimiter.
  s = s.replace(/(^|[^_])_(?!_)([^_\r\n\u2028\u2029](?:.*?[^_\r\n\u2028\u2029])??)_(?!_)/g, '$1$2')
  s = s.replace(/~~(.+?)~~/g, '$1')
  s = s.replace(/`([^`]+)`/g, '$1')
  s = s.replace(/^>\s+/gm, '')
  s = s.replace(/^---$/gm, '')
  s = s.replace(/^\* /gm, '• ')
  s = s.replace(/^- /gm, '• ')
  s = s.replace(/^\d+\.\s/gm, '')
  return s
}
/**
 * Scan one line of plain text for @person / #tag references and split it into segments.
 *
 * config: { persons: [{ personId, token, name, label, aliases }], linkTags: [...] }
 * Per the original note, tapping a mention to add a friend must use persons.token
 * (the same value as the CKB lead targetUserId), not personId; persons without a
 * token are therefore ignored.
 *
 * Lookup tables are Maps, so names such as "constructor" or "toString" are matched
 * like any other name instead of colliding with inherited object properties.
 * `aliases` may be a comma-separated string or any value whose String() form is one
 * (for example an array), for both persons and link tags.
 *
 * @param {string} line - One line of plain text.
 * @param {object} [config] - { persons: [], linkTags: [] }.
 * @returns {Array<object>} text / mention / linkTag segments; a single text segment
 *   when nothing is configured or nothing matches.
 */
function matchLineToSegments(line, config) {
  const persons = (config && config.persons) || []
  const linkTags = (config && config.linkTags) || []
  if (!persons.length && !linkTags.length) {
    return [{ type: 'text', text: line }]
  }

  // Editors / IMEs may insert ordinary spaces between "@" and the name; pull the name
  // right up to the "@" so it can still be matched.
  const text = line.replace(/([@])\s+(?=[\u4e00-\u9fffA-Za-z0-9_\u00b7])/g, '$1')

  const normalize = (s) => (s == null ? '' : String(s)).trim().toLowerCase()
  const toKeys = (values) => values.map(normalize).filter(Boolean)

  // First registration wins when two entries share a name or alias.
  const personMap = new Map()
  for (const p of persons) {
    if (!p) continue
    const token = String(p.token || '').trim()
    if (!token) continue
    const aliases = p.aliases != null ? String(p.aliases).split(',') : []
    for (const k of toKeys([p.name || p.label, p.label, ...aliases])) {
      if (!personMap.has(k)) personMap.set(k, p)
    }
  }
  const tagMap = new Map()
  for (const t of linkTags) {
    if (!t) continue
    const aliases = t.aliases ? String(t.aliases).split(',') : []
    for (const k of toKeys([t.label, ...aliases])) {
      if (!tagMap.has(k)) tagMap.set(k, t)
    }
  }

  // Longest names first so the alternation prefers the most specific match.
  const esc = (n) => n.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  const longestFirst = (map) => [...map.keys()].sort((a, b) => b.length - a.length).map(esc)
  const personNames = longestFirst(personMap)
  const tagLabels = longestFirst(tagMap)
  if (!personNames.length && !tagLabels.length) return [{ type: 'text', text }]

  const parts = []
  if (personNames.length) parts.push('[@]\\s*(' + personNames.join('|') + ')')
  if (tagLabels.length) parts.push('[#]\\s*(' + tagLabels.join('|') + ')')
  const pattern = new RegExp(parts.join('|'), 'gi')

  const segs = []
  let lastEnd = 0
  let m
  while ((m = pattern.exec(text)) !== null) {
    if (m.index > lastEnd) {
      segs.push({ type: 'text', text: text.slice(lastEnd, m.index) })
    }
    const full = m[0]
    if (/^[@]/u.test(full)) {
      const body = full.replace(/^[@]\s*/u, '')
      const person = personMap.get(normalize(body))
      const uid = person ? String(person.token || '').trim() : ''
      if (uid) {
        const nick = cleanSingleLineField(person.name || person.label || body)
        segs.push({ type: 'mention', userId: uid, nickname: nick, mentionDisplay: '@' + nick })
      } else {
        // No usable person behind the match: keep the original text.
        segs.push({ type: 'text', text: full })
      }
    } else {
      const body = full.replace(/^[#]\s*/u, '')
      const tag = tagMap.get(normalize(body))
      if (tag) {
        segs.push({
          type: 'linkTag',
          label: tag.label || body,
          url: tag.url || '',
          tagType: tag.type || 'url',
          pagePath: tag.pagePath || '',
          tagId: tag.tagId || '',
          appId: tag.appId || '',
          mpKey: tag.mpKey || '',
        })
      } else {
        segs.push({ type: 'text', text: full })
      }
    }
    lastEnd = m.index + full.length
  }
  if (lastEnd < text.length) {
    segs.push({ type: 'text', text: text.slice(lastEnd) })
  }
  return segs.length ? segs : [{ type: 'text', text }]
}
/**
 * Parse plain text / Markdown line by line.
 *
 * Markdown markers are stripped first; each remaining non-blank line is trimmed
 * and scanned for @person / #tag references.
 *
 * @param {string} text - Plain text or Markdown.
 * @param {object} [config] - { persons: [], linkTags: [] }.
 * @returns {{ lines: string[], segments: Array<Array<object>> }} Parallel arrays,
 *   one entry per non-blank line.
 */
function parsePlainTextToSegments(text, config) {
  const lines = []
  for (const rawLine of stripMarkdownFormatting(text).split('\n')) {
    const line = rawLine.trim()
    if (line.length > 0) lines.push(line)
  }
  const segments = []
  for (const line of lines) {
    segments.push(matchLineToSegments(line, config))
  }
  return { lines, segments }
}
/**
 * Remove leftover Markdown image references from text.
 *
 * Two passes: first a file name glued to an image reference
 * (e.g. "image.png![](xxx)"), then any remaining "![alt](url)".
 *
 * @param {string} text - Raw content.
 * @returns {string} Text without image references; falsy input is returned unchanged.
 */
function stripOrphanImageRefs(text) {
  if (!text) return text
  const namedImageRef = /[^\s]*\.(?:png|jpg|jpeg|gif|webp|svg|bmp)!\[[^\]]*\]\([^)]*\)/gi
  const bareImageRef = /!\[[^\]]*\]\([^)]*\)/g
  return text.replace(namedImageRef, '').replace(bareImageRef, '')
}
/**
 * Parse raw chapter content into the lines / segments structure used by the reading page.
 *
 * Leftover Markdown image references are removed first; the result is then
 * parsed as HTML when it looks like HTML, otherwise as plain text / Markdown.
 *
 * @param {string} rawContent - Raw content.
 * @param {object} [config] - { persons: [], linkTags: [] }.
 * @returns {{ lines: string[], segments: Array<Array<object>> }} Empty arrays
 *   when rawContent is not a non-empty string.
 */
function parseContent(rawContent, config) {
  const isUsable = typeof rawContent === 'string' && rawContent.length > 0
  if (!isUsable) {
    return { lines: [], segments: [] }
  }
  const content = stripOrphanImageRefs(rawContent)
  return isHtmlContent(content)
    ? parseHtmlToSegments(content, config)
    : parsePlainTextToSegments(content, config)
}
// CommonJS exports: only these three functions are exposed here.
module.exports = {
parseContent,
isHtmlContent,
cleanSingleLineField,
}