// soul-yongping/miniprogram/utils/contentParser.js

/**
 * Soul创业派对 - 内容解析工具
 * 解析 TipTap HTML 为阅读页可展示的 segments
 *
 * segment 类型：
 *   { type: 'text',    text }
 *   { type: 'mention', userId, nickname }        — @某人，点击加好友
 *   { type: 'linkTag', label, url, tagType, pagePath, tagId[, appId, mpKey] } — #链接标签，点击跳转
 *   { type: 'image',   src, alt }               — 图片
 */

/**
 * Report whether `content` looks like HTML markup.
 *
 * A value qualifies when it is a non-empty string that contains both angle
 * brackets and at least one opening tag whose name starts with a letter.
 *
 * @param {*} content - value to inspect
 * @returns {boolean} true when the value looks like HTML
 */
function isHtmlContent(content) {
  if (typeof content !== 'string' || content === '') {
    return false
  }
  const candidate = content.trim()
  // Cheap character checks first; the regex only runs when both brackets exist.
  if (candidate.indexOf('<') === -1 || candidate.indexOf('>') === -1) {
    return false
  }
  return /<[a-z][^>]*>/i.test(candidate)
}

/** Entities understood by decodeEntities, keyed by their exact source text. */
const HTML_ENTITY_MAP = {
  '&nbsp;': ' ',
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'"
}

/**
 * Decode the common HTML entities (&nbsp; &amp; &lt; &gt; &quot; &#39;).
 *
 * All entities are replaced in a single pass so that already-escaped text is
 * decoded exactly once: `&amp;lt;` becomes `&lt;`, not `<`. Unknown entities
 * are left untouched. `&nbsp;` is mapped to a regular space.
 *
 * @param {string} str - text that may contain HTML entities
 * @returns {string} the decoded text
 */
function decodeEntities(str) {
  return str.replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, (entity) => HTML_ENTITY_MAP[entity])
}

/**
 * Read a double-quoted attribute from a serialized tag and entity-decode it.
 *
 * The attribute name must be preceded by whitespace so that `src` does not
 * match inside `data-src`, `id` inside `data-id`, and so on.
 *
 * @param {string} tag - the full tag text, e.g. `<img src="a.png">`
 * @param {string} name - attribute name to look up
 * @param {*} [fallback=''] - value returned when the attribute is absent
 * @returns {*} the decoded attribute value, or `fallback`
 */
function readTagAttr(tag, name, fallback = '') {
  const found = tag.match(new RegExp('\\s' + name + '="([^"]*)"', 'i'))
  return found ? decodeEntities(found[1]) : fallback
}

/**
 * Strip every tag from `html` and entity-decode the remaining text.
 *
 * @param {string} html
 * @returns {string}
 */
function readTagText(html) {
  return decodeEntities(html.replace(/<[^>]+>/g, ''))
}

/**
 * Parse one HTML block (a single logical line) into an array of segments.
 *
 * Recognised inline elements:
 *   - <span data-type="mention">  -> { type: 'mention', userId, nickname }
 *   - <span data-type="linkTag">  -> { type: 'linkTag', label, url, tagType, pagePath, tagId, appId, mpKey }
 *   - <a href="...">#label</a>    -> legacy linkTag; same shape, with an empty tagType
 *   - <img ...>                   -> { type: 'image', src, alt }
 *
 * Text between those elements becomes `{ type: 'text', text }` segments with
 * any remaining tags stripped and entities decoded; whitespace-only runs are
 * dropped. Attribute values and labels are entity-decoded as well, because
 * serialized HTML escapes characters such as `&` inside attributes.
 *
 * @param {string} block - HTML fragment without block-level line breaks
 * @returns {Array<Object>} segments in document order
 */
function parseBlockToSegments(block) {
  const segs = []
  // One combined pattern for every supported inline element. Groups 1 and 2
  // are only set by the legacy <a> alternative (href and "#label").
  const tokenRe = /<span[^>]*data-type="mention"[^>]*>[\s\S]*?<\/span>|<span[^>]*data-type="linkTag"[^>]*>[\s\S]*?<\/span>|<a[^>]*href="([^"]*)"[^>]*>(#[^<]*)<\/a>|<img[^>]*\/?>/gi
  let lastEnd = 0
  let m

  while ((m = tokenRe.exec(block)) !== null) {
    // Plain text preceding the current inline element.
    const before = readTagText(block.slice(lastEnd, m.index))
    if (before.trim()) segs.push({ type: 'text', text: before })

    const tag = m[0]

    if (/data-type="mention"/i.test(tag)) {
      // @mention span: prefer data-label, fall back to the visible text.
      const userId = readTagAttr(tag, 'data-id').trim()
      const nickname =
        readTagAttr(tag, 'data-label').trim() ||
        readTagText(tag).replace(/^@/, '').trim()
      if (userId || nickname) segs.push({ type: 'mention', userId, nickname })

    } else if (/data-type="linkTag"/i.test(tag)) {
      // #linkTag span carrying its configuration in data-* attributes.
      const label = readTagText(tag).replace(/^#/, '').trim()
      segs.push({
        type: 'linkTag',
        label: label || '#',
        url: readTagAttr(tag, 'data-url'),
        // A missing data-tag-type means a plain URL tag.
        tagType: readTagAttr(tag, 'data-tag-type', 'url'),
        pagePath: readTagAttr(tag, 'data-page-path'),
        tagId: readTagAttr(tag, 'data-tag-id'),
        appId: readTagAttr(tag, 'data-app-id'),
        mpKey: readTagAttr(tag, 'data-mp-key')
      })

    } else if (/^<a\s/i.test(tag)) {
      // Legacy #linkTag written as <a href>; the href may be empty. There is
      // no tagType here, so it is left empty for the consumer to resolve.
      const url = decodeEntities(m[1] || '')
      const label = decodeEntities(m[2] || '').replace(/^#/, '').trim()
      segs.push({
        type: 'linkTag',
        label: label || '#',
        url,
        tagType: '',
        pagePath: '',
        tagId: '',
        appId: '',
        mpKey: ''
      })

    } else if (/^<img\b/i.test(tag)) {
      // Image: only emitted when a src attribute is present.
      const src = readTagAttr(tag, 'src', null)
      if (src !== null) {
        segs.push({ type: 'image', src, alt: readTagAttr(tag, 'alt') })
      }
    }

    lastEnd = m.index + tag.length
  }

  // Plain text after the last inline element.
  const after = readTagText(block.slice(lastEnd))
  if (after.trim()) segs.push({ type: 'text', text: after })

  return segs
}

/**
 * Split an HTML document into display lines and their rich segments.
 *
 * Block-level tags are first turned into newlines (inline tags are kept for
 * parseBlockToSegments), then every non-empty line is parsed on its own.
 * `lines[i]` is the plain text of `segments[i]`; a line that consists of a
 * single image gets an empty string in `lines`.
 *
 * Tag names are matched with a word boundary so that `<p ...>` does not also
 * swallow `<pre>` / `<picture>` and `<li ...>` does not match `<link>`.
 * `<pre>` is treated as a block boundary so a code block does not run into
 * the paragraph that follows it.
 *
 * @param {string} html - HTML source
 * @returns {{ lines: string[], segments: Array<Array<Object>> }}
 */
function parseHtmlToSegments(html) {
  const lines = []
  const segments = []

  // 1. Convert block-level structure to newlines.
  let text = html
  text = text.replace(/<\/p>\s*<p\b[^>]*>/gi, '\n\n')
  text = text.replace(/<p\b[^>]*>/gi, '')
  text = text.replace(/<\/p>/gi, '\n')
  text = text.replace(/<div\b[^>]*>/gi, '')
  text = text.replace(/<\/div>/gi, '\n')
  text = text.replace(/<br\b[^>]*>/gi, '\n')
  text = text.replace(/<\/?h[1-6]\b[^>]*>/gi, '\n')
  text = text.replace(/<\/?(?:blockquote|pre|ul|ol)\b[^>]*>/gi, '\n')
  text = text.replace(/<li\b[^>]*>/gi, '• ')
  text = text.replace(/<\/li>/gi, '\n')

  // 2. Parse each resulting line independently.
  const blocks = text.split(/\n+/)
  for (const block of blocks) {
    if (!block.trim()) continue

    const blockSegs = parseBlockToSegments(block)
    if (!blockSegs.length) continue

    // A line holding only an image stands alone and has no text fallback.
    if (blockSegs.length === 1 && blockSegs[0].type === 'image') {
      lines.push('')
      segments.push(blockSegs)
      continue
    }

    // Plain-text rendition of the line, kept alongside the rich segments.
    const lineText = decodeEntities(block.replace(/<[^>]+>/g, '')).trim()
    lines.push(lineText)
    segments.push(blockSegs)
  }

  return { lines, segments }
}

/**
 * Remove common Markdown formatting markers and return the bare text.
 *
 * Handles ATX headings, **bold**, __bold__, *italic*, _italic_,
 * ~~strikethrough~~, `inline code`, block quotes, `---` rules, bullet lists
 * (rewritten to "• ") and ordered-list numbering.
 *
 * Single `*` / `_` markers are only treated as emphasis when they hug the
 * emphasised text: the opening marker must be followed by, and the closing
 * marker preceded by, a non-space character, and `_` must not sit inside a
 * word. This keeps `snake_case_names`, URLs and expressions like `2 * 3 * 4`
 * intact. The patterns deliberately avoid regex lookbehind, which older
 * JavaScript engines (e.g. iOS before 16.4) reject at parse time.
 *
 * @param {string} text - Markdown or plain text
 * @returns {string} text without formatting markers; falsy input is returned as-is
 */
function stripMarkdownFormatting(text) {
  if (!text) return text
  let s = text
  s = s.replace(/^#{1,6}\s+/gm, '')
  s = s.replace(/\*\*(.+?)\*\*/g, '$1')
  s = s.replace(/__(.+?)__/g, '$1')
  // *italic*: $1 re-emits the character in front of the opening marker,
  // standing in for a "not preceded by *" lookbehind.
  s = s.replace(/(^|[^*])\*(?![*\s])(.*?[^*\s])\*(?!\*)/gm, '$1$2')
  // _italic_: the marker pair must not be glued to word characters.
  s = s.replace(/(^|\W)_(?![_\s])(.*?[^_\s])_(?!\w)/gm, '$1$2')
  s = s.replace(/~~(.+?)~~/g, '$1')
  s = s.replace(/`([^`]+)`/g, '$1')
  s = s.replace(/^>\s+/gm, '')
  s = s.replace(/^---$/gm, '')
  s = s.replace(/^\* /gm, '• ')
  s = s.replace(/^- /gm, '• ')
  s = s.replace(/^\d+\.\s/gm, '')
  return s
}

/**
 * Parse plain text / Markdown line by line.
 *
 * Markdown markers are stripped first; each remaining non-blank line is
 * trimmed and becomes one entry in `lines` plus a single-text-segment entry
 * in `segments`.
 *
 * @param {string} text - plain text or Markdown source
 * @returns {{ lines: string[], segments: Array<Array<Object>> }}
 */
function parsePlainTextToSegments(text) {
  const lines = []
  const segments = []
  for (const rawLine of stripMarkdownFormatting(text).split('\n')) {
    const line = rawLine.trim()
    if (line.length === 0) continue
    lines.push(line)
    segments.push([{ type: 'text', text: line }])
  }
  return { lines, segments }
}

/**
 * Parse raw chapter content into the structure the reading page renders.
 *
 * HTML input goes through the HTML parser, anything else is treated as plain
 * text / Markdown. Empty or non-string input yields empty arrays.
 *
 * @param {string} rawContent
 * @returns {{ lines: string[], segments: Array<Array<segment>> }}
 */
function parseContent(rawContent) {
  const isUsable = typeof rawContent === 'string' && rawContent !== ''
  if (!isUsable) {
    return { lines: [], segments: [] }
  }
  const parse = isHtmlContent(rawContent) ? parseHtmlToSegments : parsePlainTextToSegments
  return parse(rawContent)
}

module.exports = {
  parseContent,
  isHtmlContent
}