Files
soul-yongping/scripts/feishu_wiki_upload.py
2026-03-07 22:58:43 +08:00

565 lines
23 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
将《一场soul的创业实验》全书含图片上传到飞书知识库子节点下。
飞书知识库链接https://cunkebao.feishu.cn/wiki/FNP6wdvNKij7yMkb3xCce0CYnpd
需在该链接对应的「创业实验」节点下创建子页面并写入内容。
环境变量(必填):
FEISHU_APP_ID 飞书应用 App ID找卡若AI拿卡若AI/02_卡人/水桥_平台对接/飞书管理/
FEISHU_APP_SECRET 飞书应用 App Secret同上
FEISHU_WIKI_NODE_TOKEN 知识库父节点 token即链接中的 IDFNP6wdvNKij7yMkb3xCce0CYnpd
可选:将上述变量写在 scripts/.env.feishu 中(每行 KEY=VALUE本脚本会自动加载。
权限要求:应用需加入该知识库为成员(管理员),并开通 知识库、云文档 权限。
用法:
python3 feishu_wiki_upload.py [--dry-run] [--only 4.6]
python3 feishu_wiki_upload.py --full 按目录结构上传全书(同名节点复用,不重复建)
--dry-run 仅检查配置与本地文件,不上传
--only 4.6 仅上传 4.6 一节(用于测试)
--full 全书同步:建齐目录层级,有同名则覆盖该页正文
"""
import argparse
import json
import os
import re
import sys
import time
import webbrowser
from pathlib import Path
try:
import requests
except ImportError:
print("请安装 requests: pip install requests")
sys.exit(1)
# Path to the cached Feishu user-token JSON for 卡若AI (used when the user
# identity has wiki edit permission; read/refreshed by get_user_token below).
FEISHU_USER_TOKENS_JSON = Path("/Users/karuo/Documents/个人/卡若AI/02_卡人/水桥_平台对接/飞书管理/脚本/.feishu_tokens.json")
def load_env(env_path=None):
    """Load KEY=VALUE pairs from an env file into os.environ.

    Existing environment variables are never overwritten (setdefault wins).
    Blank lines, comment lines starting with '#', and lines without '=' are
    ignored.

    Args:
        env_path: optional Path to the env file; defaults to ".env.feishu"
            next to this script (backward compatible with the no-arg call).
    """
    if env_path is None:
        env_path = Path(__file__).resolve().parent / ".env.feishu"
    if not env_path.exists():
        return
    for raw in env_path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        value = value.strip()
        # Strip exactly one matching pair of surrounding quotes. The previous
        # code removed every quote character anywhere in the value, which
        # corrupted values that legitimately contain quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        os.environ.setdefault(key.strip(), value)
def _refresh_user_token(app_id: str, app_secret: str) -> str:
    """Refresh the Feishu user access token via the cached refresh_token.

    Flow: read refresh_token from FEISHU_USER_TOKENS_JSON, obtain an
    app_access_token with the app credentials, exchange the refresh_token via
    the OIDC refresh endpoint, then write the new token pair back to the
    cache file.

    Best-effort by contract: returns "" on any failure (missing cache,
    malformed JSON, HTTP/JSON errors, non-zero API code, empty token).
    The original implementation let requests/JSON exceptions propagate,
    crashing the caller instead of falling back to the tenant token.
    """
    if not FEISHU_USER_TOKENS_JSON.exists():
        return ""
    try:
        d = json.loads(FEISHU_USER_TOKENS_JSON.read_text(encoding="utf-8"))
        refresh = (d.get("refresh_token") or "").strip()
    except Exception:
        return ""
    if not refresh:
        return ""
    try:
        r = requests.post(
            "https://open.feishu.cn/open-apis/auth/v3/app_access_token/internal",
            json={"app_id": app_id, "app_secret": app_secret},
            timeout=10,
        )
        app_token = (r.json() or {}).get("app_access_token")
        if not app_token:
            return ""
        r2 = requests.post(
            "https://open.feishu.cn/open-apis/authen/v1/oidc/refresh_access_token",
            headers={"Authorization": f"Bearer {app_token}", "Content-Type": "application/json"},
            json={"grant_type": "refresh_token", "refresh_token": refresh},
            timeout=10,
        )
        out = r2.json() or {}
    except Exception:
        # Network/JSON failure: report "no user token" so the caller can fall
        # back to the tenant token instead of crashing the upload run.
        return ""
    if out.get("code") != 0:
        return ""
    data = out.get("data") or {}
    new_access = (data.get("access_token") or "").strip()
    new_refresh = (data.get("refresh_token") or "").strip() or refresh
    if not new_access:
        return ""
    d["access_token"] = new_access
    d["refresh_token"] = new_refresh
    try:
        FEISHU_USER_TOKENS_JSON.write_text(json.dumps(d, ensure_ascii=False, indent=2), encoding="utf-8")
    except OSError:
        pass  # the token we hold is still valid even if the cache write fails
    return new_access
def get_user_token(app_id: str = "", app_secret: str = "") -> str:
    """Return a Feishu user token, preferring explicit env configuration.

    Order of precedence: the FEISHU_USER_TOKEN env var, then a freshly
    refreshed token (when app credentials and a cached refresh_token are
    available), then the cached access_token. Returns "" when nothing usable
    is found; any error while reading/refreshing is swallowed and treated as
    "no user token" (wiki editing then falls back to the tenant token).
    """
    env_token = os.environ.get("FEISHU_USER_TOKEN")
    if env_token:
        return env_token.strip()
    if not FEISHU_USER_TOKENS_JSON.exists():
        return ""
    try:
        cached = json.loads(FEISHU_USER_TOKENS_JSON.read_text(encoding="utf-8"))
        if app_id and app_secret and cached.get("refresh_token"):
            refreshed = _refresh_user_token(app_id, app_secret)
            if refreshed:
                return refreshed
        return (cached.get("access_token") or "").strip()
    except Exception:
        return ""
# Feishu Open API base URL.
BASE = "https://open.feishu.cn/open-apis"
# Default wiki parent node token (the ID in the wiki link from the module
# docstring); can be overridden via the FEISHU_WIKI_NODE_TOKEN env var.
WIKI_NODE_TOKEN = "FNP6wdvNKij7yMkb3xCce0CYnpd"
# Local root directory of the book manuscript; override with SOUL_BOOK_ROOT.
BOOK_ROOT = Path(os.environ.get("SOUL_BOOK_ROOT", "/Users/karuo/Documents/个人/2、我写的书/《一场soul的创业实验》"))
def get_tenant_access_token(app_id: str, app_secret: str) -> str:
    """Exchange app credentials for a tenant_access_token.

    Raises RuntimeError when the API returns a non-zero code.
    """
    resp = requests.post(
        f"{BASE}/auth/v3/tenant_access_token/internal",
        json={"app_id": app_id, "app_secret": app_secret},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") == 0:
        return payload["tenant_access_token"]
    raise RuntimeError(f"获取 tenant_access_token 失败: {payload}")
def get_node_info(token: str, node_token: str) -> dict:
    """Fetch wiki node metadata (space_id, obj_token, ...) for a node token.

    Raises RuntimeError when the API returns a non-zero code.
    """
    resp = requests.get(
        f"{BASE}/wiki/v2/spaces/get_node",
        params={"token": node_token},
        headers={"Authorization": f"Bearer {token}"},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"获取节点信息失败: {payload}")
    return payload["data"]["node"]
def list_wiki_children(token: str, space_id: str, parent_node_token: str) -> list:
    """List the direct children of a wiki node.

    Paginates until the API stops returning a page_token. Returns a list of
    dicts: [{node_token, obj_token, title}, ...]; an empty title is replaced
    with "未命名". Raises RuntimeError on a non-zero API code.
    """
    children = []
    page_token = None
    while True:
        params = {"parent_node_token": parent_node_token, "page_size": 50}
        if page_token:
            params["page_token"] = page_token
        resp = requests.get(
            f"{BASE}/wiki/v2/spaces/{space_id}/nodes",
            headers={"Authorization": f"Bearer {token}"},
            params=params,
            timeout=15,
        )
        payload = resp.json()
        if payload.get("code") != 0:
            raise RuntimeError(f"列出子节点失败: {payload}")
        data = payload.get("data") or {}
        for item in data.get("items") or []:
            children.append({
                "node_token": item.get("node_token"),
                "obj_token": item.get("obj_token"),
                "title": (item.get("title") or "").strip() or "未命名",
            })
        page_token = data.get("page_token")
        if not page_token:
            return children
        time.sleep(0.15)  # gentle pacing between pages to avoid rate limits
def get_or_create_node(token: str, space_id: str, parent_node_token: str, title: str) -> tuple:
    """Return (node_token, obj_token, created) for the child named `title`.

    An existing child whose stripped title matches is reused (created=False);
    otherwise a new docx node is created under the parent (created=True).
    """
    wanted = title.strip()
    for child in list_wiki_children(token, space_id, parent_node_token):
        if (child.get("title") or "").strip() == wanted:
            return (child["node_token"], child["obj_token"], False)
    node = create_wiki_node(token, space_id, parent_node_token, title)
    return (node["node_token"], node["obj_token"], True)
def create_wiki_node(token: str, space_id: str, parent_node_token: str, title: str, obj_type: str = "docx") -> dict:
    """Create a new origin node (default: a docx page) under a wiki node.

    Returns the created node dict; raises RuntimeError on a non-zero API code.
    """
    body = {
        "obj_type": obj_type,
        "node_type": "origin",
        "parent_node_token": parent_node_token,
        "title": title,
    }
    resp = requests.post(
        f"{BASE}/wiki/v2/spaces/{space_id}/nodes",
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
        json=body,
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"创建节点失败: {payload}")
    return payload["data"]["node"]
def get_docx_block_children(token: str, document_id: str) -> list:
    """Fetch the first page (up to 50) of blocks of a docx document.

    Used to discover the root block id; for a freshly created docx document
    the document_id itself can be used as the root block id.
    """
    resp = requests.get(
        f"{BASE}/docx/v1/documents/{document_id}/blocks",
        params={"document_revision_id": -1, "page_size": 50},
        headers={"Authorization": f"Bearer {token}"},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"获取文档块失败: {payload}")
    return payload.get("data", {}).get("items", [])
DOCX_CHILDREN_BATCH = 50  # Feishu's per-request limit when creating child blocks
def create_docx_block_children(token: str, document_id: str, block_id: str, children: list, index: int = 0) -> dict:
    """Create child blocks under `block_id` at position `index`.

    `children` follows the Feishu docx "create block children" API schema;
    at most 50 children per call (DOCX_CHILDREN_BATCH). Raises RuntimeError
    on a non-zero API code; returns the API's data payload.
    """
    resp = requests.post(
        f"{BASE}/docx/v1/documents/{document_id}/blocks/{block_id}/children",
        params={"document_revision_id": -1},
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
        json={"children": children, "index": index},
        timeout=10,
    )
    payload = resp.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"创建块失败: {payload}")
    return payload.get("data", {})
def create_docx_block_children_batched(token: str, document_id: str, block_id: str, children: list) -> None:
    """Create child blocks in order, in batches of DOCX_CHILDREN_BATCH."""
    offset = 0
    total = len(children)
    while offset < total:
        batch = children[offset : offset + DOCX_CHILDREN_BATCH]
        # index=offset places each batch right after the previously inserted one.
        create_docx_block_children(token, document_id, block_id, batch, index=offset)
        offset += DOCX_CHILDREN_BATCH
def clear_docx_children(token: str, document_id: str) -> bool:
    """Delete every direct child of the document's root block.

    Used before rewriting a page body so same-name pages are overwritten.
    Best-effort: returns False on any API error, True when the body was
    already empty or was fully cleared.
    """
    # 1) Collect all blocks across pages.
    blocks = []
    page_token = None
    while True:
        params = {"document_revision_id": -1, "page_size": 200}
        if page_token:
            params["page_token"] = page_token
        resp = requests.get(
            f"{BASE}/docx/v1/documents/{document_id}/blocks",
            headers={"Authorization": f"Bearer {token}"},
            params=params,
            timeout=15,
        )
        payload = resp.json()
        if payload.get("code") != 0:
            return False
        data = payload.get("data") or {}
        blocks.extend(data.get("items") or [])
        page_token = data.get("page_token")
        if not page_token:
            break
    # 2) Direct children of the root block (root block id == document_id).
    child_ids = [b["block_id"] for b in blocks if b.get("parent_id") == document_id and b.get("block_id")]
    if not child_ids:
        return True
    # 3) Delete in chunks of 50.
    for start in range(0, len(child_ids), 50):
        chunk = child_ids[start : start + 50]
        # NOTE(review): the documented batch_delete body uses
        # start_index/end_index rather than block_id_list — verify this
        # payload against the current Feishu docx API.
        resp = requests.delete(
            f"{BASE}/docx/v1/documents/{document_id}/blocks/{document_id}/children/batch_delete",
            headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
            json={"block_id_list": chunk},
            timeout=15,
        )
        if (resp.json() or {}).get("code") != 0:
            return False
        time.sleep(0.1)
    return True
def write_docx_content(token: str, doc_id: str, blocks: list, overwrite: bool = True) -> None:
    """Write body blocks into a docx document.

    With overwrite=True the root block's existing children are removed first
    (best-effort) so the page body is replaced rather than appended to.
    """
    if overwrite:
        # Failures here are tolerated; the new content is appended regardless.
        clear_docx_children(token, doc_id)
    create_docx_block_children_batched(token, doc_id, doc_id, blocks)
# When a section's file name differs from the desired Feishu page title, map
# file stem -> page title here (prevents creating duplicate pages on re-sync).
TITLE_OVERRIDE = {"4.6 Soul被封号了解决方案和干货": "4.6 Soul被封号了如何处理"}
def _normalize_part(name: str) -> str:
"""目录名转飞书页名_第一篇真实的人 -> 第一篇|真实的人"""
s = (name or "").strip()
if s.startswith("_"):
s = s[1:]
return s
def build_book_entries(book_root: Path) -> list:
    """Walk the book tree and produce (parent_key, title, md_path) entries
    in Feishu directory order.

    parent_key is a "/"-joined chain of page names ("" means directly under
    the wiki root). md_path is a Path relative to book_root for content
    pages, or None for pure directory pages. Hidden files and the
    "飞书同步说明" helper doc are skipped; TITLE_OVERRIDE renames sections.
    """
    book_root = book_root.resolve()
    dirs_seen = set()  # (parent_key, title) pairs already emitted as directories
    entries = []  # raw (parent_key, title, md_path or None) before ordering

    def ensure_dir(parent_key: str, title: str) -> None:
        # Emit a directory entry once per (parent_key, title).
        k = (parent_key, title)
        if k in dirs_seen:
            return
        dirs_seen.add(k)
        entries.append((parent_key, title, None))

    for rel in sorted(book_root.rglob("*.md"), key=lambda p: str(p)):
        try:
            rel = rel.relative_to(book_root)
        except ValueError:
            # Not under book_root (e.g. a resolved symlink) — skip.
            continue
        if rel.name.startswith(".") or rel.stem == "飞书同步说明":
            continue
        parts = rel.parts
        name_stem = rel.stem
        title = TITLE_OVERRIDE.get(name_stem, name_stem)
        if len(parts) == 1:
            # NOTE(review): a root-level file emits both a directory entry and
            # a file entry with the same title — confirm this is intentional.
            ensure_dir("", title)
            entries.append(("", title, rel))
            continue
        # Multi-level: 第一篇/第1章/1.1.md or 第二篇/第4章/4.6.md
        p0 = _normalize_part(parts[0])
        if len(parts) == 2:
            ensure_dir("", p0)
            entries.append((p0, title, rel))
            continue
        p1 = parts[1]
        ensure_dir("", p0)
        ensure_dir(p0, p1)
        if len(parts) == 3:
            entries.append((f"{p0}/{p1}", title, rel))
            continue
        # Four levels deep: 附录/附录1...md
        p2 = parts[2]
        ensure_dir("", p0)  # already ensured above; harmless no-op
        ensure_dir(p0, p1)
        ensure_dir(f"{p0}/{p1}", p2)
        entries.append((f"{p0}/{p1}/{p2}", title, rel))
    # Ordering guarantee: all directory pages first (so parents exist before
    # children are created), then content pages; within a parent, by title.
    dir_entries = [(pk, t, None) for (pk, t, p) in entries if p is None]
    file_entries = [(pk, t, p) for (pk, t, p) in entries if p is not None]

    # Directories sorted by depth: root-level first, then one level down, etc.
    def depth(k):
        return len([x for x in k.split("/") if x]) if k else 0

    dir_entries.sort(key=lambda x: (depth(x[0]), x[0], x[1]))
    file_entries.sort(key=lambda x: (x[0], x[1]))
    seen_dir = set()
    out = []
    for pk, t, _ in dir_entries:
        if (pk, t) in seen_dir:
            continue
        seen_dir.add((pk, t))
        out.append((pk, t, None))
    for pk, t, p in file_entries:
        out.append((pk, t, p))
    return out
def _strip_md_bold(text: str) -> str:
"""去掉 Markdown 粗体星号,飞书正文不保留 **。"""
return re.sub(r"\*\*", "", text)
def text_to_docx_blocks(md_text: str, assets_dir: Path) -> list:
    """Convert markdown text into a list of Feishu docx child blocks.

    Only plain text blocks (block_type 2) are produced. An image line
    "![alt](path)" becomes a "[图片: alt]" placeholder paragraph — actual
    images must be uploaded inside Feishu afterwards. Bold markers ("**")
    are stripped from body lines, and each body line gets a trailing newline.

    Note: `assets_dir` is currently unused; the parameter is kept for
    interface stability with existing callers.
    """
    def _text_block(content: str) -> dict:
        return {
            "block_type": 2,
            "text": {
                "elements": [{"type": "text_run", "text_run": {"content": content, "style": {}}}]
            },
        }

    blocks = []
    # Paragraphs are separated by one or more blank lines.
    for segment in re.split(r"\n\n+", md_text):
        segment = segment.strip()
        if not segment:
            continue
        if segment.startswith("!["):
            m = re.match(r'!\[([^\]]*)\]\(([^)]+)\)', segment)
            if m:
                blocks.append(_text_block(f"[图片: {m.group(1)}]"))
                continue
        for raw_line in segment.split("\n"):
            stripped = raw_line.strip()
            if stripped:
                # "**" removal inlined from _strip_md_bold (same result).
                blocks.append(_text_block(stripped.replace("**", "") + "\n"))
    return blocks
def main():
    """CLI entry point: load config, resolve tokens, and sync pages to the wiki.

    Modes (mutually exclusive in practice):
      --dry-run  inspect local files and config, no API calls
      --only 4.6 / --only 112场  sync a single section
      --full     sync the whole book, creating the directory hierarchy and
                 overwriting bodies of same-name pages
    """
    load_env()
    parser = argparse.ArgumentParser(description="上传书稿到飞书知识库")
    parser.add_argument("--dry-run", action="store_true", help="仅检查配置与文件,不上传")
    parser.add_argument("--only", default="", help="仅上传指定节,如 4.6")
    parser.add_argument("--full", action="store_true", help="按目录结构上传全书,同名则覆盖该页")
    args = parser.parse_args()
    app_id = os.environ.get("FEISHU_APP_ID", "").strip()
    app_secret = os.environ.get("FEISHU_APP_SECRET", "").strip()
    node_token = os.environ.get("FEISHU_WIKI_NODE_TOKEN", WIKI_NODE_TOKEN).strip()
    if args.dry_run:
        # Dry run: report local files and the target link; never hit the API.
        print("dry-run: 检查本地文件与目标飞书链接。")
        print(" 飞书链接: https://cunkebao.feishu.cn/wiki/FNP6wdvNKij7yMkb3xCce0CYnpd")
        print(f" 书稿根目录: {BOOK_ROOT} (存在={BOOK_ROOT.exists()})")
        if args.full:
            entries = build_book_entries(BOOK_ROOT)
            print(f" 全书条目数: {len(entries)}(含目录页)")
            for i, (pk, t, p) in enumerate(entries[:12]):
                print(f" [{i+1}] {pk!r} / {t!r} {'-> ' + str(p) if p else '(目录)'}")
            if len(entries) > 12:
                print(f" ... 等共 {len(entries)}")
        else:
            section_path = BOOK_ROOT / "第二篇|真实的行业" / "第4章内容商业篇" / "4.6 Soul被封号了解决方案和干货.md"
            assets_dir = section_path.parent / "assets"
            print(f" 4.6 正文: {section_path} (存在={section_path.exists()})")
            print(f" 4.6 图片目录: {assets_dir} (存在={assets_dir.exists()})")
            if assets_dir.exists():
                for f in assets_dir.iterdir():
                    if f.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif"):
                        print(f" - {f.name}")
        if not app_id or not app_secret:
            print(" 未配置 FEISHU_APP_ID/FEISHU_APP_SECRET实际上传前请在 scripts/.env.feishu 中配置。")
        return 0
    if not app_id or not app_secret:
        print("错误: 未配置飞书应用凭证。")
        print("请设置环境变量 FEISHU_APP_ID、FEISHU_APP_SECRET或在 scripts/.env.feishu 中配置。")
        print("飞书开放平台: https://open.feishu.cn/app 创建应用并开通「知识库」「云文档」权限,")
        print("将应用添加为知识库成员后,把 App ID 与 App Secret 填入 .env.feishu。")
        sys.exit(1)
    tenant_token = get_tenant_access_token(app_id, app_secret)
    # Creating wiki nodes needs edit permission: prefer the user token
    # (refreshed via app credentials when expired), fall back to tenant token.
    user_token = get_user_token(app_id, app_secret)
    token = user_token if user_token else tenant_token
    if user_token:
        print("使用用户 token 操作知识库")
    node = get_node_info(token, node_token)
    space_id = node["space_id"]
    parent_token = node_token
    print(f"知识库 space_id: {space_id}, 父节点: {parent_token}")
    write_token = user_token if user_token else tenant_token
    cache = {"": parent_token}  # parent_key -> node_token; "" is the wiki root node
    # --only 4.6: sync just section 4.6 (used as a smoke test).
    if args.only == "4.6":
        section_path = BOOK_ROOT / "第二篇|真实的行业" / "第4章内容商业篇" / "4.6 Soul被封号了解决方案和干货.md"
        assets_dir = section_path.parent / "assets"
        if not section_path.exists():
            print(f"文件不存在: {section_path}")
            sys.exit(1)
        content = section_path.read_text(encoding="utf-8")
        title = "4.6 Soul被封号了如何处理"
        _, doc_id, created = get_or_create_node(token, space_id, parent_token, title)
        blocks = text_to_docx_blocks(content, assets_dir)
        write_docx_content(write_token, doc_id, blocks, overwrite=True)
        print(f"已同步子页面: {title}, document_id={doc_id}" + (" (新建)" if created else " (覆盖)"))
        print("图片需在飞书该文档内手动上传并插入到 [图片: xxx] 位置;本地路径:", list(assets_dir.iterdir()) if assets_dir.exists() else [])
        doc_url = f"https://cunkebao.feishu.cn/docx/{doc_id}"
        webbrowser.open(doc_url)
        print(f"已打开: {doc_url}")
        return 0
    # --only 112场 (or any value containing "112"): sync the single
    # "第112场" section under chapter 9.
    if args.only == "112场" or "112" in args.only:
        section_path = BOOK_ROOT / "第四篇|真实的赚钱" / "第9章我在Soul上亲访的赚钱案例" / "第112场一个人起头维权挣了大半套房.md"
        if not section_path.exists():
            print(f"文件不存在: {section_path}")
            sys.exit(1)
        entries = build_book_entries(BOOK_ROOT)
        target = next((e for e in entries if e[1] and "112场" in e[1]), None)
        if not target:
            print("未在全书条目中找到 112场")
            sys.exit(1)
        parent_key, title, md_path = target
        if not md_path:
            print("112场 对应的是目录项,无正文")
            sys.exit(1)
        # Make sure the chain of parent nodes exists and is recorded in cache.
        parts = [p for p in parent_key.split("/") if p]
        for i in range(len(parts) + 1):
            pk = "/".join(parts[:i]) if i else ""
            p_token = cache.get(pk)
            if p_token is None:
                print(f" 跳过(父未就绪): {pk!r}")
                continue
            # The final iteration (i == len(parts)) is a no-op: the leaf page
            # itself is created below, not here.
            need_title = parts[i] if i < len(parts) else title
            if i < len(parts):
                node_tok, _, _ = get_or_create_node(token, space_id, p_token, need_title)
                cache[pk + "/" + need_title if pk else need_title] = node_tok
        p_token = cache.get(parent_key)
        if p_token is None:
            print("父节点未解析到,请先执行 --full 一次或检查目录结构")
            sys.exit(1)
        _, doc_id, created = get_or_create_node(token, space_id, p_token, title)
        content = section_path.read_text(encoding="utf-8")
        assets_dir = section_path.parent / "assets"
        blocks = text_to_docx_blocks(content, assets_dir)
        write_docx_content(write_token, doc_id, blocks, overwrite=True)
        print(f"已同步子页面: {title}, document_id={doc_id}" + (" (新建)" if created else " (覆盖)"))
        doc_url = f"https://cunkebao.feishu.cn/docx/{doc_id}"
        webbrowser.open(doc_url)
        print(f"已打开: {doc_url}")
        return 0
    # --full: create the whole directory hierarchy; same-name pages are reused
    # and their bodies overwritten.
    if args.full:
        entries = build_book_entries(BOOK_ROOT)
        print(f"全书共 {len(entries)} 项(含目录页),开始同步…")
        created_count = 0
        updated_count = 0
        for parent_key, title, md_path in entries:
            p_token = cache.get(parent_key)
            if p_token is None:
                # Parent page was never created (entries are parent-first, so
                # this indicates an earlier failure for that branch).
                print(f" 跳过(父未就绪): {parent_key!r} / {title!r}")
                continue
            node_token, obj_token, created = get_or_create_node(token, space_id, p_token, title)
            current_key = f"{parent_key}/{title}" if parent_key else title
            cache[current_key] = node_token
            if md_path is not None:
                full_path = BOOK_ROOT / md_path
                # NOTE(review): the next line recomputes the same path and
                # shadows the assignment above — looks redundant; confirm
                # and drop one of the two.
                full_path = BOOK_ROOT / (md_path if isinstance(md_path, str) else str(md_path))
                if not full_path.exists():
                    print(f" 跳过(文件不存在): {md_path}")
                    continue
                try:
                    content = full_path.read_text(encoding="utf-8")
                except Exception as e:
                    print(f" 跳过(读文件失败): {md_path} -> {e}")
                    continue
                assets_dir = full_path.parent / "assets"
                blocks = text_to_docx_blocks(content, assets_dir)
                write_docx_content(write_token, obj_token, blocks, overwrite=True)
                if created:
                    created_count += 1
                else:
                    updated_count += 1
                print(f" 写入: {current_key}")
            else:
                if created:
                    created_count += 1
                print(f" 目录: {current_key}")
        print(f"完成。新建 {created_count} 个页面,覆盖 {updated_count} 个页面。")
        return 0
    print("请使用 --only 4.6 或 --full 指定上传范围。")
    sys.exit(0)
# Script entry point guard.
if __name__ == "__main__":
    main()