Files
soul-yongping/scripts/feishu_wiki_upload.py
2026-03-07 22:58:43 +08:00

565 lines
23 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
将《一场soul的创业实验》全书含图片上传到飞书知识库子节点下。
飞书知识库链接https://cunkebao.feishu.cn/wiki/FNP6wdvNKij7yMkb3xCce0CYnpd
需在该链接对应的「创业实验」节点下创建子页面并写入内容。
环境变量(必填):
FEISHU_APP_ID 飞书应用 App ID找卡若AI拿卡若AI/02_卡人/水桥_平台对接/飞书管理/
FEISHU_APP_SECRET 飞书应用 App Secret同上
FEISHU_WIKI_NODE_TOKEN 知识库父节点 token即链接中的 IDFNP6wdvNKij7yMkb3xCce0CYnpd
可选:将上述变量写在 scripts/.env.feishu 中(每行 KEY=VALUE本脚本会自动加载。
权限要求:应用需加入该知识库为成员(管理员),并开通 知识库、云文档 权限。
用法:
python3 feishu_wiki_upload.py [--dry-run] [--only 4.6]
python3 feishu_wiki_upload.py --full 按目录结构上传全书(同名节点复用,不重复建)
--dry-run 仅检查配置与本地文件,不上传
--only 4.6 仅上传 4.6 一节(用于测试)
--full 全书同步:建齐目录层级,有同名则覆盖该页正文
"""
import argparse
import json
import os
import re
import sys
import time
import webbrowser
from pathlib import Path
try:
import requests
except ImportError:
print("请安装 requests: pip install requests")
sys.exit(1)
# Path to the cached Feishu user-token JSON for 卡若AI (used when the user
# identity has wiki edit permission; read/refreshed by get_user_token below).
FEISHU_USER_TOKENS_JSON = Path("/Users/karuo/Documents/个人/卡若AI/02_卡人/水桥_平台对接/飞书管理/脚本/.feishu_tokens.json")
def load_env(env_path=None):
    """Load KEY=VALUE pairs from an env file into os.environ.

    Existing environment variables are never overwritten (setdefault wins).
    Blank lines, comment lines starting with '#', and lines without '=' are
    ignored.

    Args:
        env_path: optional Path to the env file; defaults to ".env.feishu"
            next to this script (backward compatible with the no-arg call).
    """
    if env_path is None:
        env_path = Path(__file__).resolve().parent / ".env.feishu"
    if not env_path.exists():
        return
    for raw in env_path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        value = value.strip()
        # Strip exactly one matching pair of surrounding quotes. The previous
        # code removed every quote character anywhere in the value, which
        # corrupted values that legitimately contain quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        os.environ.setdefault(key.strip(), value)
def _refresh_user_token(app_id: str, app_secret: str) -> str:
    """Refresh the Feishu user access token via the cached refresh_token.

    Flow: read refresh_token from FEISHU_USER_TOKENS_JSON, obtain an
    app_access_token with the app credentials, exchange the refresh_token via
    the OIDC refresh endpoint, then write the new token pair back to the
    cache file.

    Best-effort by contract: returns "" on any failure (missing cache,
    malformed JSON, HTTP/JSON errors, non-zero API code, empty token).
    The original implementation let requests/JSON exceptions propagate,
    crashing the caller instead of falling back to the tenant token.
    """
    if not FEISHU_USER_TOKENS_JSON.exists():
        return ""
    try:
        d = json.loads(FEISHU_USER_TOKENS_JSON.read_text(encoding="utf-8"))
        refresh = (d.get("refresh_token") or "").strip()
    except Exception:
        return ""
    if not refresh:
        return ""
    try:
        r = requests.post(
            "https://open.feishu.cn/open-apis/auth/v3/app_access_token/internal",
            json={"app_id": app_id, "app_secret": app_secret},
            timeout=10,
        )
        app_token = (r.json() or {}).get("app_access_token")
        if not app_token:
            return ""
        r2 = requests.post(
            "https://open.feishu.cn/open-apis/authen/v1/oidc/refresh_access_token",
            headers={"Authorization": f"Bearer {app_token}", "Content-Type": "application/json"},
            json={"grant_type": "refresh_token", "refresh_token": refresh},
            timeout=10,
        )
        out = r2.json() or {}
    except Exception:
        # Network/JSON failure: report "no user token" so the caller can fall
        # back to the tenant token instead of crashing the upload run.
        return ""
    if out.get("code") != 0:
        return ""
    data = out.get("data") or {}
    new_access = (data.get("access_token") or "").strip()
    new_refresh = (data.get("refresh_token") or "").strip() or refresh
    if not new_access:
        return ""
    d["access_token"] = new_access
    d["refresh_token"] = new_refresh
    try:
        FEISHU_USER_TOKENS_JSON.write_text(json.dumps(d, ensure_ascii=False, indent=2), encoding="utf-8")
    except OSError:
        pass  # the token we hold is still valid even if the cache write fails
    return new_access
def get_user_token(app_id: str = "", app_secret: str = "") -> str:
    """Return a Feishu user token, preferring explicit env configuration.

    Order of precedence: the FEISHU_USER_TOKEN env var, then a freshly
    refreshed token (when app credentials and a cached refresh_token are
    available), then the cached access_token. Returns "" when nothing usable
    is found; any error while reading/refreshing is swallowed and treated as
    "no user token" (wiki editing then falls back to the tenant token).
    """
    env_token = os.environ.get("FEISHU_USER_TOKEN")
    if env_token:
        return env_token.strip()
    if not FEISHU_USER_TOKENS_JSON.exists():
        return ""
    try:
        cached = json.loads(FEISHU_USER_TOKENS_JSON.read_text(encoding="utf-8"))
        if app_id and app_secret and cached.get("refresh_token"):
            refreshed = _refresh_user_token(app_id, app_secret)
            if refreshed:
                return refreshed
        return (cached.get("access_token") or "").strip()
    except Exception:
        return ""
# Feishu Open API base URL.
BASE = "https://open.feishu.cn/open-apis"
# Default wiki parent node token (the ID in the wiki link from the module
# docstring); can be overridden via the FEISHU_WIKI_NODE_TOKEN env var.
WIKI_NODE_TOKEN = "FNP6wdvNKij7yMkb3xCce0CYnpd"
# Local root directory of the book manuscript; override with SOUL_BOOK_ROOT.
BOOK_ROOT = Path(os.environ.get("SOUL_BOOK_ROOT", "/Users/karuo/Documents/个人/2、我写的书/《一场soul的创业实验》"))
def get_tenant_access_token(app_id: str, app_secret: str) -> str:
    """Exchange app credentials for a tenant_access_token.

    Raises RuntimeError when the API returns a non-zero code.
    """
    resp = requests.post(
        f"{BASE}/auth/v3/tenant_access_token/internal",
        json={"app_id": app_id, "app_secret": app_secret},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") == 0:
        return payload["tenant_access_token"]
    raise RuntimeError(f"获取 tenant_access_token 失败: {payload}")
def get_node_info(token: str, node_token: str) -> dict:
    """Fetch wiki node metadata (space_id, obj_token, ...) for a node token.

    Raises RuntimeError when the API returns a non-zero code.
    """
    resp = requests.get(
        f"{BASE}/wiki/v2/spaces/get_node",
        params={"token": node_token},
        headers={"Authorization": f"Bearer {token}"},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"获取节点信息失败: {payload}")
    return payload["data"]["node"]
def list_wiki_children(token: str, space_id: str, parent_node_token: str) -> list:
    """List the direct children of a wiki node.

    Paginates until the API stops returning a page_token. Returns a list of
    dicts: [{node_token, obj_token, title}, ...]; an empty title is replaced
    with "未命名". Raises RuntimeError on a non-zero API code.
    """
    children = []
    page_token = None
    while True:
        params = {"parent_node_token": parent_node_token, "page_size": 50}
        if page_token:
            params["page_token"] = page_token
        resp = requests.get(
            f"{BASE}/wiki/v2/spaces/{space_id}/nodes",
            headers={"Authorization": f"Bearer {token}"},
            params=params,
            timeout=15,
        )
        payload = resp.json()
        if payload.get("code") != 0:
            raise RuntimeError(f"列出子节点失败: {payload}")
        data = payload.get("data") or {}
        for item in data.get("items") or []:
            children.append({
                "node_token": item.get("node_token"),
                "obj_token": item.get("obj_token"),
                "title": (item.get("title") or "").strip() or "未命名",
            })
        page_token = data.get("page_token")
        if not page_token:
            return children
        time.sleep(0.15)  # gentle pacing between pages to avoid rate limits
def get_or_create_node(token: str, space_id: str, parent_node_token: str, title: str) -> tuple:
    """Return (node_token, obj_token, created) for the child named `title`.

    An existing child whose stripped title matches is reused (created=False);
    otherwise a new docx node is created under the parent (created=True).
    """
    wanted = title.strip()
    for child in list_wiki_children(token, space_id, parent_node_token):
        if (child.get("title") or "").strip() == wanted:
            return (child["node_token"], child["obj_token"], False)
    node = create_wiki_node(token, space_id, parent_node_token, title)
    return (node["node_token"], node["obj_token"], True)
def create_wiki_node(token: str, space_id: str, parent_node_token: str, title: str, obj_type: str = "docx") -> dict:
    """Create a new origin node (default: a docx page) under a wiki node.

    Returns the created node dict; raises RuntimeError on a non-zero API code.
    """
    body = {
        "obj_type": obj_type,
        "node_type": "origin",
        "parent_node_token": parent_node_token,
        "title": title,
    }
    resp = requests.post(
        f"{BASE}/wiki/v2/spaces/{space_id}/nodes",
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
        json=body,
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"创建节点失败: {payload}")
    return payload["data"]["node"]
def get_docx_block_children(token: str, document_id: str) -> list:
    """Fetch the first page (up to 50) of blocks of a docx document.

    Used to discover the root block id; for a freshly created docx document
    the document_id itself can be used as the root block id.
    """
    resp = requests.get(
        f"{BASE}/docx/v1/documents/{document_id}/blocks",
        params={"document_revision_id": -1, "page_size": 50},
        headers={"Authorization": f"Bearer {token}"},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"获取文档块失败: {payload}")
    return payload.get("data", {}).get("items", [])
DOCX_CHILDREN_BATCH = 50  # Feishu's per-request limit when creating child blocks
def create_docx_block_children(token: str, document_id: str, block_id: str, children: list, index: int = 0) -> dict:
    """Create child blocks under `block_id` at position `index`.

    `children` follows the Feishu docx "create block children" API schema;
    at most 50 children per call (DOCX_CHILDREN_BATCH). Raises RuntimeError
    on a non-zero API code; returns the API's data payload.
    """
    resp = requests.post(
        f"{BASE}/docx/v1/documents/{document_id}/blocks/{block_id}/children",
        params={"document_revision_id": -1},
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
        json={"children": children, "index": index},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"创建块失败: {payload}")
    return payload.get("data", {})
def create_docx_block_children_batched(token: str, document_id: str, block_id: str, children: list) -> None:
    """Create child blocks in order, in batches of DOCX_CHILDREN_BATCH."""
    offset = 0
    total = len(children)
    while offset < total:
        batch = children[offset : offset + DOCX_CHILDREN_BATCH]
        # index=offset places each batch right after the previously inserted one.
        create_docx_block_children(token, document_id, block_id, batch, index=offset)
        offset += DOCX_CHILDREN_BATCH
def clear_docx_children(token: str, document_id: str) -> bool:
    """Delete every direct child of the document's root block.

    Used before rewriting a page body so same-name pages are overwritten.
    Best-effort: returns False on any API error, True when the body was
    already empty or was fully cleared.
    """
    # 1) Collect all blocks across pages.
    blocks = []
    page_token = None
    while True:
        params = {"document_revision_id": -1, "page_size": 200}
        if page_token:
            params["page_token"] = page_token
        resp = requests.get(
            f"{BASE}/docx/v1/documents/{document_id}/blocks",
            headers={"Authorization": f"Bearer {token}"},
            params=params,
            timeout=15,
        )
        payload = resp.json()
        if payload.get("code") != 0:
            return False
        data = payload.get("data") or {}
        blocks.extend(data.get("items") or [])
        page_token = data.get("page_token")
        if not page_token:
            break
    # 2) Direct children of the root block (root block id == document_id).
    child_ids = [b["block_id"] for b in blocks if b.get("parent_id") == document_id and b.get("block_id")]
    if not child_ids:
        return True
    # 3) Delete in chunks of 50.
    for start in range(0, len(child_ids), 50):
        chunk = child_ids[start : start + 50]
        # NOTE(review): the documented batch_delete body uses
        # start_index/end_index rather than block_id_list — verify this
        # payload against the current Feishu docx API.
        resp = requests.delete(
            f"{BASE}/docx/v1/documents/{document_id}/blocks/{document_id}/children/batch_delete",
            headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
            json={"block_id_list": chunk},
            timeout=15,
        )
        if (resp.json() or {}).get("code") != 0:
            return False
        time.sleep(0.1)
    return True
def write_docx_content(token: str, doc_id: str, blocks: list, overwrite: bool = True) -> None:
    """Write body blocks into a docx document.

    With overwrite=True the root block's existing children are removed first
    (best-effort) so the page body is replaced rather than appended to.
    """
    if overwrite:
        # Failures here are tolerated; the new content is appended regardless.
        clear_docx_children(token, doc_id)
    create_docx_block_children_batched(token, doc_id, doc_id, blocks)
# When a section's file name differs from the desired Feishu page title, map
# file stem -> page title here (prevents creating duplicate pages on re-sync).
TITLE_OVERRIDE = {"4.6 Soul被封号了解决方案和干货": "4.6 Soul被封号了如何处理"}
def _normalize_part(name: str) -> str:
"""目录名转飞书页名_第一篇真实的人 -> 第一篇|真实的人"""
s = (name or "").strip()
if s.startswith("_"):
s = s[1:]
return s
def build_book_entries(book_root: Path) -> list:
    """Walk the book tree and produce (parent_key, title, md_path) entries
    in Feishu directory order.

    parent_key is a "/"-joined chain of page names ("" means directly under
    the wiki root). md_path is a Path relative to book_root for content
    pages, or None for pure directory pages. Hidden files and the
    "飞书同步说明" helper doc are skipped; TITLE_OVERRIDE renames sections.
    """
    book_root = book_root.resolve()
    dirs_seen = set()  # (parent_key, title) pairs already emitted as directories
    entries = []  # raw (parent_key, title, md_path or None) before ordering

    def ensure_dir(parent_key: str, title: str) -> None:
        # Emit a directory entry once per (parent_key, title).
        k = (parent_key, title)
        if k in dirs_seen:
            return
        dirs_seen.add(k)
        entries.append((parent_key, title, None))

    for rel in sorted(book_root.rglob("*.md"), key=lambda p: str(p)):
        try:
            rel = rel.relative_to(book_root)
        except ValueError:
            # Not under book_root (e.g. a resolved symlink) — skip.
            continue
        if rel.name.startswith(".") or rel.stem == "飞书同步说明":
            continue
        parts = rel.parts
        name_stem = rel.stem
        title = TITLE_OVERRIDE.get(name_stem, name_stem)
        if len(parts) == 1:
            # NOTE(review): a root-level file emits both a directory entry and
            # a file entry with the same title — confirm this is intentional.
            ensure_dir("", title)
            entries.append(("", title, rel))
            continue
        # Multi-level: 第一篇/第1章/1.1.md or 第二篇/第4章/4.6.md
        p0 = _normalize_part(parts[0])
        if len(parts) == 2:
            ensure_dir("", p0)
            entries.append((p0, title, rel))
            continue
        p1 = parts[1]
        ensure_dir("", p0)
        ensure_dir(p0, p1)
        if len(parts) == 3:
            entries.append((f"{p0}/{p1}", title, rel))
            continue
        # Four levels deep: 附录/附录1...md
        p2 = parts[2]
        ensure_dir("", p0)  # already ensured above; harmless no-op
        ensure_dir(p0, p1)
        ensure_dir(f"{p0}/{p1}", p2)
        entries.append((f"{p0}/{p1}/{p2}", title, rel))
    # Ordering guarantee: all directory pages first (so parents exist before
    # children are created), then content pages; within a parent, by title.
    dir_entries = [(pk, t, None) for (pk, t, p) in entries if p is None]
    file_entries = [(pk, t, p) for (pk, t, p) in entries if p is not None]

    # Directories sorted by depth: root-level first, then one level down, etc.
    def depth(k):
        return len([x for x in k.split("/") if x]) if k else 0

    dir_entries.sort(key=lambda x: (depth(x[0]), x[0], x[1]))
    file_entries.sort(key=lambda x: (x[0], x[1]))
    seen_dir = set()
    out = []
    for pk, t, _ in dir_entries:
        if (pk, t) in seen_dir:
            continue
        seen_dir.add((pk, t))
        out.append((pk, t, None))
    for pk, t, p in file_entries:
        out.append((pk, t, p))
    return out
def _strip_md_bold(text: str) -> str:
"""去掉 Markdown 粗体星号,飞书正文不保留 **。"""
return re.sub(r"\*\*", "", text)
def text_to_docx_blocks(md_text: str, assets_dir: Path) -> list:
    """Convert markdown text into a list of Feishu docx child blocks.

    Only plain text blocks (block_type 2) are produced. An image line
    "![alt](path)" becomes a "[图片: alt]" placeholder paragraph — actual
    images must be uploaded inside Feishu afterwards. Bold markers ("**")
    are stripped from body lines, and each body line gets a trailing newline.

    Note: `assets_dir` is currently unused; the parameter is kept for
    interface stability with existing callers.
    """
    def _text_block(content: str) -> dict:
        return {
            "block_type": 2,
            "text": {
                "elements": [{"type": "text_run", "text_run": {"content": content, "style": {}}}]
            },
        }

    blocks = []
    # Paragraphs are separated by one or more blank lines.
    for segment in re.split(r"\n\n+", md_text):
        segment = segment.strip()
        if not segment:
            continue
        if segment.startswith("!["):
            m = re.match(r'!\[([^\]]*)\]\(([^)]+)\)', segment)
            if m:
                blocks.append(_text_block(f"[图片: {m.group(1)}]"))
                continue
        for raw_line in segment.split("\n"):
            stripped = raw_line.strip()
            if stripped:
                # "**" removal inlined from _strip_md_bold (same result).
                blocks.append(_text_block(stripped.replace("**", "") + "\n"))
    return blocks
def main():
    """CLI entry point: load config, resolve tokens, and sync pages to the wiki.

    Modes (mutually exclusive in practice):
      --dry-run  inspect local files and config, no API calls
      --only 4.6 / --only 112场  sync a single section
      --full     sync the whole book, creating the directory hierarchy and
                 overwriting bodies of same-name pages
    """
    load_env()
    parser = argparse.ArgumentParser(description="上传书稿到飞书知识库")
    parser.add_argument("--dry-run", action="store_true", help="仅检查配置与文件,不上传")
    parser.add_argument("--only", default="", help="仅上传指定节,如 4.6")
    parser.add_argument("--full", action="store_true", help="按目录结构上传全书,同名则覆盖该页")
    args = parser.parse_args()
    app_id = os.environ.get("FEISHU_APP_ID", "").strip()
    app_secret = os.environ.get("FEISHU_APP_SECRET", "").strip()
    node_token = os.environ.get("FEISHU_WIKI_NODE_TOKEN", WIKI_NODE_TOKEN).strip()
    if args.dry_run:
        # Dry run: report local files and the target link; never hit the API.
        print("dry-run: 检查本地文件与目标飞书链接。")
        print(" 飞书链接: https://cunkebao.feishu.cn/wiki/FNP6wdvNKij7yMkb3xCce0CYnpd")
        print(f" 书稿根目录: {BOOK_ROOT} (存在={BOOK_ROOT.exists()})")
        if args.full:
            entries = build_book_entries(BOOK_ROOT)
            print(f" 全书条目数: {len(entries)}(含目录页)")
            for i, (pk, t, p) in enumerate(entries[:12]):
                print(f" [{i+1}] {pk!r} / {t!r} {'-> ' + str(p) if p else '(目录)'}")
            if len(entries) > 12:
                print(f" ... 等共 {len(entries)}")
        else:
            section_path = BOOK_ROOT / "第二篇|真实的行业" / "第4章内容商业篇" / "4.6 Soul被封号了解决方案和干货.md"
            assets_dir = section_path.parent / "assets"
            print(f" 4.6 正文: {section_path} (存在={section_path.exists()})")
            print(f" 4.6 图片目录: {assets_dir} (存在={assets_dir.exists()})")
            if assets_dir.exists():
                for f in assets_dir.iterdir():
                    if f.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif"):
                        print(f" - {f.name}")
        if not app_id or not app_secret:
            print(" 未配置 FEISHU_APP_ID/FEISHU_APP_SECRET实际上传前请在 scripts/.env.feishu 中配置。")
        return 0
    if not app_id or not app_secret:
        print("错误: 未配置飞书应用凭证。")
        print("请设置环境变量 FEISHU_APP_ID、FEISHU_APP_SECRET或在 scripts/.env.feishu 中配置。")
        print("飞书开放平台: https://open.feishu.cn/app 创建应用并开通「知识库」「云文档」权限,")
        print("将应用添加为知识库成员后,把 App ID 与 App Secret 填入 .env.feishu。")
        sys.exit(1)
    tenant_token = get_tenant_access_token(app_id, app_secret)
    # Creating wiki nodes needs edit permission: prefer the user token
    # (refreshed via app credentials when expired), fall back to tenant token.
    user_token = get_user_token(app_id, app_secret)
    token = user_token if user_token else tenant_token
    if user_token:
        print("使用用户 token 操作知识库")
    node = get_node_info(token, node_token)
    space_id = node["space_id"]
    parent_token = node_token
    print(f"知识库 space_id: {space_id}, 父节点: {parent_token}")
    write_token = user_token if user_token else tenant_token
    cache = {"": parent_token}  # parent_key -> node_token; "" is the wiki root node
    # --only 4.6: sync just section 4.6 (used as a smoke test).
    if args.only == "4.6":
        section_path = BOOK_ROOT / "第二篇|真实的行业" / "第4章内容商业篇" / "4.6 Soul被封号了解决方案和干货.md"
        assets_dir = section_path.parent / "assets"
        if not section_path.exists():
            print(f"文件不存在: {section_path}")
            sys.exit(1)
        content = section_path.read_text(encoding="utf-8")
        title = "4.6 Soul被封号了如何处理"
        _, doc_id, created = get_or_create_node(token, space_id, parent_token, title)
        blocks = text_to_docx_blocks(content, assets_dir)
        write_docx_content(write_token, doc_id, blocks, overwrite=True)
        print(f"已同步子页面: {title}, document_id={doc_id}" + (" (新建)" if created else " (覆盖)"))
        print("图片需在飞书该文档内手动上传并插入到 [图片: xxx] 位置;本地路径:", list(assets_dir.iterdir()) if assets_dir.exists() else [])
        doc_url = f"https://cunkebao.feishu.cn/docx/{doc_id}"
        webbrowser.open(doc_url)
        print(f"已打开: {doc_url}")
        return 0
    # --only 112场 (or any value containing "112"): sync the single
    # "第112场" section under chapter 9.
    if args.only == "112场" or "112" in args.only:
        section_path = BOOK_ROOT / "第四篇|真实的赚钱" / "第9章我在Soul上亲访的赚钱案例" / "第112场一个人起头维权挣了大半套房.md"
        if not section_path.exists():
            print(f"文件不存在: {section_path}")
            sys.exit(1)
        entries = build_book_entries(BOOK_ROOT)
        target = next((e for e in entries if e[1] and "112场" in e[1]), None)
        if not target:
            print("未在全书条目中找到 112场")
            sys.exit(1)
        parent_key, title, md_path = target
        if not md_path:
            print("112场 对应的是目录项,无正文")
            sys.exit(1)
        # Make sure the chain of parent nodes exists and is recorded in cache.
        parts = [p for p in parent_key.split("/") if p]
        for i in range(len(parts) + 1):
            pk = "/".join(parts[:i]) if i else ""
            p_token = cache.get(pk)
            if p_token is None:
                print(f" 跳过(父未就绪): {pk!r}")
                continue
            # The final iteration (i == len(parts)) is a no-op: the leaf page
            # itself is created below, not here.
            need_title = parts[i] if i < len(parts) else title
            if i < len(parts):
                node_tok, _, _ = get_or_create_node(token, space_id, p_token, need_title)
                cache[pk + "/" + need_title if pk else need_title] = node_tok
        p_token = cache.get(parent_key)
        if p_token is None:
            print("父节点未解析到,请先执行 --full 一次或检查目录结构")
            sys.exit(1)
        _, doc_id, created = get_or_create_node(token, space_id, p_token, title)
        content = section_path.read_text(encoding="utf-8")
        assets_dir = section_path.parent / "assets"
        blocks = text_to_docx_blocks(content, assets_dir)
        write_docx_content(write_token, doc_id, blocks, overwrite=True)
        print(f"已同步子页面: {title}, document_id={doc_id}" + (" (新建)" if created else " (覆盖)"))
        doc_url = f"https://cunkebao.feishu.cn/docx/{doc_id}"
        webbrowser.open(doc_url)
        print(f"已打开: {doc_url}")
        return 0
    # --full: create the whole directory hierarchy; same-name pages are reused
    # and their bodies overwritten.
    if args.full:
        entries = build_book_entries(BOOK_ROOT)
        print(f"全书共 {len(entries)} 项(含目录页),开始同步…")
        created_count = 0
        updated_count = 0
        for parent_key, title, md_path in entries:
            p_token = cache.get(parent_key)
            if p_token is None:
                # Parent page was never created (entries are parent-first, so
                # this indicates an earlier failure for that branch).
                print(f" 跳过(父未就绪): {parent_key!r} / {title!r}")
                continue
            node_token, obj_token, created = get_or_create_node(token, space_id, p_token, title)
            current_key = f"{parent_key}/{title}" if parent_key else title
            cache[current_key] = node_token
            if md_path is not None:
                full_path = BOOK_ROOT / md_path
                # NOTE(review): the next line recomputes the same path and
                # shadows the assignment above — looks redundant; confirm
                # and drop one of the two.
                full_path = BOOK_ROOT / (md_path if isinstance(md_path, str) else str(md_path))
                if not full_path.exists():
                    print(f" 跳过(文件不存在): {md_path}")
                    continue
                try:
                    content = full_path.read_text(encoding="utf-8")
                except Exception as e:
                    print(f" 跳过(读文件失败): {md_path} -> {e}")
                    continue
                assets_dir = full_path.parent / "assets"
                blocks = text_to_docx_blocks(content, assets_dir)
                write_docx_content(write_token, obj_token, blocks, overwrite=True)
                if created:
                    created_count += 1
                else:
                    updated_count += 1
                print(f" 写入: {current_key}")
            else:
                if created:
                    created_count += 1
                print(f" 目录: {current_key}")
        print(f"完成。新建 {created_count} 个页面,覆盖 {updated_count} 个页面。")
        return 0
    print("请使用 --only 4.6 或 --full 指定上传范围。")
    sys.exit(0)
# Script entry point guard.
if __name__ == "__main__":
    main()