soul-yongping/miniprogram/utils/contentParser.js

/**
 * Soul创业派对 - 内容解析工具
 * 解析 TipTap HTML 为阅读页可展示的 segments
 *
 * segment 类型：
 *   { type: 'text',    text }
 *   { type: 'mention', userId, nickname }        — @某人，点击加好友
 *   { type: 'linkTag', label, url, tagType, pagePath, tagId, appId?, mpKey? } — #链接标签，点击跳转（appId / mpKey 仅部分来源提供）
 *   { type: 'image',   src, alt }               — 图片
 */

/**
 * Tell whether a piece of content looks like HTML.
 *
 * Content counts as HTML when it is a non-empty string that contains both
 * angle brackets and at least one opening tag whose name starts with a letter
 * (closing-only tags such as `</p>` or comparisons like `a < b` do not count).
 *
 * @param {*} content - value to inspect
 * @returns {boolean} true when the content should be parsed as HTML
 */
function isHtmlContent(content) {
  if (typeof content !== 'string' || content === '') return false

  const candidate = content.trim()
  const hasBothBrackets = candidate.indexOf('<') !== -1 && candidate.indexOf('>') !== -1
  if (!hasBothBrackets) return false

  return /<[a-z][^>]*>/i.test(candidate)
}

/**
 * Decode the small set of HTML entities this parser handles:
 * `&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;`.
 *
 * All entities are replaced in a single pass so that the output of one
 * replacement is never re-interpreted as another entity. (Chained replaces
 * would turn the escaped text `&amp;lt;` into `<` instead of `&lt;`.)
 *
 * @param {string} str - text that may contain HTML entities
 * @returns {string} the text with the supported entities decoded
 */
function decodeEntities(str) {
  const ENTITY_TO_CHAR = {
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'"
  }
  return str.replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, entity => ENTITY_TO_CHAR[entity])
}

/**
 * Parse one HTML block (a single line after block-level tags were flattened)
 * into an array of segments.
 *
 * Recognised inline elements:
 *   - `<span data-type="mention">`  -> { type: 'mention', userId, nickname }
 *   - `<span data-type="linkTag">`  -> { type: 'linkTag', label, url, tagType, pagePath, tagId, appId, mpKey }
 *   - `<a href="...">#label</a>`    -> legacy linkTag (no tagType / pagePath / tagId information)
 *   - `<img>`                       -> { type: 'image', src, alt }
 * Everything between those elements becomes a `text` segment with any other
 * tags stripped and entities decoded; whitespace-only text is dropped.
 *
 * Attribute values and inner labels are entity-decoded as well, so a URL
 * serialized as `?a=1&amp;b=2` comes out as `?a=1&b=2` and a label written as
 * `A&amp;B` comes out as `A&B`, consistent with how text segments are handled.
 *
 * @param {string} block - HTML fragment without block-level structure
 * @returns {Array<object>} segments in document order
 */
function parseBlockToSegments(block) {
  const segs = []
  // One combined pattern so inline elements are found in document order.
  // Capture groups 1 and 2 belong to the legacy <a> alternative (href, "#label").
  const tokenRe = /<span[^>]*data-type="mention"[^>]*>[\s\S]*?<\/span>|<span[^>]*data-type="linkTag"[^>]*>[\s\S]*?<\/span>|<a[^>]*href="([^"]*)"[^>]*>(#[^<]*)<\/a>|<img[^>]*\/?>/gi

  const stripTags = html => html.replace(/<[^>]+>/g, '')

  // Read a double-quoted attribute and decode its entities; null when absent.
  // The leading \s keeps `src` from matching inside `data-src`, etc.
  const readAttr = (tag, name) => {
    const found = tag.match(new RegExp('\\s' + name + '="([^"]*)"'))
    return found ? decodeEntities(found[1]) : null
  }

  // Emit the plain text found between two inline elements (if not blank).
  const pushText = rawHtml => {
    const text = decodeEntities(stripTags(rawHtml))
    if (text.trim()) segs.push({ type: 'text', text })
  }

  let lastEnd = 0
  let m

  while ((m = tokenRe.exec(block)) !== null) {
    pushText(block.slice(lastEnd, m.index))

    const tag = m[0]

    if (/data-type="mention"/i.test(tag)) {
      // @mention span: prefer data-label, fall back to the inner text without "@".
      const userId = (readAttr(tag, 'data-id') ?? '').trim()
      const label = readAttr(tag, 'data-label')
      const nickname = label !== null
        ? label.trim()
        : decodeEntities(stripTags(tag)).replace(/^@/, '').trim()
      if (userId || nickname) segs.push({ type: 'mention', userId, nickname })

    } else if (/data-type="linkTag"/i.test(tag)) {
      // #linkTag span carrying data-url / data-tag-type / data-page-path /
      // data-tag-id / data-app-id / data-mp-key attributes.
      const label = decodeEntities(stripTags(tag)).replace(/^#/, '').trim()
      segs.push({
        type: 'linkTag',
        label: label || '#',
        url: readAttr(tag, 'data-url') ?? '',
        tagType: readAttr(tag, 'data-tag-type') ?? 'url',
        pagePath: readAttr(tag, 'data-page-path') ?? '',
        tagId: readAttr(tag, 'data-tag-id') ?? '',
        appId: readAttr(tag, 'data-app-id') ?? '',
        mpKey: readAttr(tag, 'data-mp-key') ?? ''
      })

    } else if (/^<a /i.test(tag)) {
      // Legacy #linkTag written as <a href>; the href may be empty.
      // m[1] = href, m[2] = inner text starting with "#".
      const url = decodeEntities(m[1] || '')
      const label = decodeEntities(m[2] || '').replace(/^#/, '').trim()
      // The legacy markup has no tag type; tagType is left empty so the consumer
      // can resolve the tag by its label (resolution logic is not in this file).
      // appId / mpKey are included so every linkTag segment has the same shape.
      segs.push({ type: 'linkTag', label: label || '#', url, tagType: '', pagePath: '', tagId: '', appId: '', mpKey: '' })

    } else if (/^<img /i.test(tag)) {
      // Image: only emitted when a src attribute is present.
      const src = readAttr(tag, 'src')
      if (src !== null) {
        segs.push({ type: 'image', src, alt: readAttr(tag, 'alt') ?? '' })
      }
    }

    lastEnd = m.index + tag.length
  }

  // Trailing text after the last inline element.
  pushText(block.slice(lastEnd))

  return segs
}

/**
 * Turn an HTML document into parallel arrays: `lines` (plain text per block,
 * usable as a fallback preview) and `segments` (rich segments per block).
 *
 * Block-level tags are first flattened into newlines while inline tags are
 * kept, then every non-empty line is handed to parseBlockToSegments.
 *
 * @param {string} html
 * @param {object} [config] - { persons: [], linkTags: [] }; when non-empty, text
 *   segments are re-scanned by matchLineToSegments for hand-typed @name / #tag
 * @returns {{ lines: string[], segments: Array<Array<object>> }}
 */
function parseHtmlToSegments(html, config) {
  // Ordered rewrite rules: the order matters (e.g. "</p><p>" must be handled
  // before the individual <p> / </p> rules).
  const blockRules = [
    [/<\/p>\s*<p[^>]*>/gi, '\n\n'],
    [/<p[^>]*>/gi, ''],
    [/<\/p>/gi, '\n'],
    [/<div[^>]*>/gi, ''],
    [/<\/div>/gi, '\n'],
    [/<br\s*\/?>/gi, '\n'],
    [/<\/?h[1-6][^>]*>/gi, '\n'],
    [/<\/?blockquote[^>]*>/gi, '\n'],
    [/<\/?ul[^>]*>/gi, '\n'],
    [/<\/?ol[^>]*>/gi, '\n'],
    [/<li[^>]*>/gi, '• '],
    [/<\/li>/gi, '\n']
  ]
  const flattened = blockRules.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    html
  )

  const shouldAutoMatch = Boolean(config && (config.persons?.length || config.linkTags?.length))
  const lines = []
  const segments = []

  flattened.split(/\n+/).forEach(block => {
    if (!block.trim()) return

    const parsed = parseBlockToSegments(block)
    if (parsed.length === 0) return

    // A block holding nothing but one image stands alone, with an empty text line.
    const isLoneImage = parsed.length === 1 && parsed[0].type === 'image'
    if (isLoneImage) {
      lines.push('')
      segments.push(parsed)
      return
    }

    // Re-scan text segments for @name / #tag typed by hand rather than
    // inserted through the editor; other segment types pass through as-is.
    const finalSegs = shouldAutoMatch
      ? parsed.flatMap(seg =>
          seg.type === 'text' && seg.text ? matchLineToSegments(seg.text, config) : [seg]
        )
      : parsed

    // Plain text of the block, kept for the lines-based fallback display.
    lines.push(decodeEntities(block.replace(/<[^>]+>/g, '')).trim())
    segments.push(finalSegs)
  })

  return { lines, segments }
}

/**
 * Remove Markdown formatting markers from plain text: headings, bold, italic,
 * strikethrough, inline code, blockquote markers, horizontal rules and ordered
 * list numbers. Unordered list markers (`* ` / `- `) are turned into `• `.
 *
 * Emphasis is only stripped when the delimiters hug their content:
 *   - a `*` followed by whitespace is never an opener, so a leading list bullet
 *     (`* item with *emphasis*`) and expressions like `2 * 3 * 4` are left for
 *     the later rules / left untouched;
 *   - a `_` directly next to an ASCII letter or digit on its outer side is
 *     treated as part of a word, so `snake_case_names` and URLs survive.
 *
 * @param {string} text
 * @returns {string} the text without formatting markers (falsy input is returned as-is)
 */
function stripMarkdownFormatting(text) {
  if (!text) return text
  let s = text
  s = s.replace(/^#{1,6}\s+/gm, '')
  s = s.replace(/\*\*(.+?)\*\*/g, '$1')
  s = s.replace(/__(.+?)__/g, '$1')
  // *italic*: opener must not be followed by whitespace, closer must not be preceded by it.
  s = s.replace(/(?<!\*)\*(?![*\s])(.+?)(?<![*\s])\*(?!\*)/g, '$1')
  // _italic_: additionally skip underscores that sit inside an ASCII word.
  s = s.replace(/(?<![_A-Za-z0-9])_(?![_\s])(.+?)(?<![_\s])_(?![_A-Za-z0-9])/g, '$1')
  s = s.replace(/~~(.+?)~~/g, '$1')
  s = s.replace(/`([^`]+)`/g, '$1')
  s = s.replace(/^>\s+/gm, '')
  s = s.replace(/^---$/gm, '')
  s = s.replace(/^\* /gm, '• ')
  s = s.replace(/^- /gm, '• ')
  s = s.replace(/^\d+\.\s/gm, '')
  return s
}

/**
 * Scan one line of plain text for `@name` / `#label` occurrences that match
 * the configured persons / link tags and split the line into segments.
 *
 * Matching is case-insensitive, accepts the full-width prefixes `＠` / `＃`,
 * and prefers longer names when several could match at the same position.
 * When two entries share a name or alias, the first one in the config wins.
 *
 * @param {string} line - plain text line
 * @param {object} [config]
 * @param {Array<object>} [config.persons]  - entries read here: { personId, name, aliases }
 * @param {Array<object>} [config.linkTags] - entries read here: { tagId, label, type, pagePath, mpKey, appId, url, aliases }
 *   `aliases` may be a string separated by `,` or `，`, or an array of strings.
 * @returns {Array<object>} `text`, `mention` and `linkTag` segments in order;
 *   a single text segment when nothing is configured or nothing matches
 */
function matchLineToSegments(line, config) {
  if (!config || (!config.persons?.length && !config.linkTags?.length)) {
    return [{ type: 'text', text: line }]
  }

  const normalize = s => (typeof s === 'string' ? s.trim().toLowerCase() : '')

  // Accept either an array of aliases or a comma-separated string
  // (ASCII or full-width comma); anything else means "no aliases".
  const splitAliases = aliases => {
    if (Array.isArray(aliases)) return aliases
    return typeof aliases === 'string' ? aliases.split(/[,，]/) : []
  }

  // Map instead of a plain object so names such as "constructor" or
  // "toString" cannot collide with inherited Object.prototype properties.
  const buildLookup = (items, nameField) => {
    const lookup = new Map()
    for (const item of items || []) {
      if (!item) continue
      for (const candidate of [item[nameField], ...splitAliases(item.aliases)]) {
        const key = normalize(candidate)
        if (key && !lookup.has(key)) lookup.set(key, item)
      }
    }
    return lookup
  }

  const personMap = buildLookup(config.persons, 'name')
  const tagMap = buildLookup(config.linkTags, 'label')
  if (!personMap.size && !tagMap.size) return [{ type: 'text', text: line }]

  const esc = n => n.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  // Longest keys first so "@alice" wins over "@ali" in the alternation.
  const toAlternation = lookup =>
    [...lookup.keys()].sort((a, b) => b.length - a.length).map(esc).join('|')

  const parts = []
  if (personMap.size) parts.push('[@＠](' + toAlternation(personMap) + ')')
  if (tagMap.size) parts.push('[#＃](' + toAlternation(tagMap) + ')')
  const pattern = new RegExp(parts.join('|'), 'gi')

  const segs = []
  let lastEnd = 0
  let m
  while ((m = pattern.exec(line)) !== null) {
    if (m.index > lastEnd) {
      segs.push({ type: 'text', text: line.slice(lastEnd, m.index) })
    }
    const full = m[0]
    const prefix = full[0]
    const body = full.slice(1)
    if (prefix === '@' || prefix === '＠') {
      const person = personMap.get(normalize(body))
      if (person) {
        segs.push({ type: 'mention', userId: person.personId || '', nickname: person.name || body })
      } else {
        // Defensive: keep the raw text if the lookup unexpectedly misses.
        segs.push({ type: 'text', text: full })
      }
    } else {
      const tag = tagMap.get(normalize(body))
      if (tag) {
        segs.push({
          type: 'linkTag',
          label: tag.label || body,
          url: tag.url || '',
          tagType: tag.type || 'url',
          pagePath: tag.pagePath || '',
          tagId: tag.tagId || '',
          appId: tag.appId || '',
          mpKey: tag.mpKey || ''
        })
      } else {
        segs.push({ type: 'text', text: full })
      }
    }
    lastEnd = m.index + full.length
  }
  if (lastEnd < line.length) {
    segs.push({ type: 'text', text: line.slice(lastEnd) })
  }
  return segs.length ? segs : [{ type: 'text', text: line }]
}

/**
 * Parse plain text / Markdown line by line.
 *
 * Markdown markers are stripped first; every remaining non-blank line is
 * trimmed and converted to segments via matchLineToSegments.
 *
 * @param {string} text
 * @param {object} [config] - { persons: [], linkTags: [] }
 * @returns {{ lines: string[], segments: Array<Array<object>> }}
 */
function parsePlainTextToSegments(text, config) {
  const lines = []
  const segments = []

  for (const rawLine of stripMarkdownFormatting(text).split('\n')) {
    const line = rawLine.trim()
    if (line.length === 0) continue
    lines.push(line)
    segments.push(matchLineToSegments(line, config))
  }

  return { lines, segments }
}

/**
 * Remove leftover Markdown image references from text.
 *
 * Two shapes are removed, in this order:
 *   1. a file name glued to an image reference, e.g. `image.png![](xxx)`
 *   2. any remaining bare image reference, e.g. `![alt](url)`
 *
 * @param {string} text
 * @returns {string} the cleaned text (falsy input is returned unchanged)
 */
function stripOrphanImageRefs(text) {
  if (!text) return text

  const fileNameWithImageRef = /[^\s]*\.(?:png|jpg|jpeg|gif|webp|svg|bmp)!\[[^\]]*\]\([^)]*\)/gi
  const bareImageRef = /!\[[^\]]*\]\([^)]*\)/g

  return text.replace(fileNameWithImageRef, '').replace(bareImageRef, '')
}

/**
 * Parse raw article content into lines and segments for the reading page.
 *
 * Leftover Markdown image references are removed first; the cleaned content
 * is then parsed as HTML when it looks like HTML, otherwise as plain text.
 *
 * @param {string} rawContent
 * @param {object} [config] - { persons: [], linkTags: [] }
 * @returns {{ lines: string[], segments: Array<Array<object>> }}
 *   empty arrays when rawContent is empty or not a string
 */
function parseContent(rawContent, config) {
  const isUsable = typeof rawContent === 'string' && rawContent.length > 0
  if (!isUsable) {
    return { lines: [], segments: [] }
  }

  const cleaned = stripOrphanImageRefs(rawContent)
  const parse = isHtmlContent(cleaned) ? parseHtmlToSegments : parsePlainTextToSegments
  return parse(cleaned, config)
}

// Public API of this module: the main entry point plus the HTML detector.
// The remaining functions in this file are internal helpers and are not exported.
module.exports = {
  parseContent,
  isHtmlContent
}