🔄 卡若AI 同步 2026-03-13 10:42 | 更新:水桥平台对接、卡木、火炬、运营中枢、运营中枢工作台 | 排除 >20MB: 11 个
This commit is contained in:
182
02_卡人(水)/水桥_平台对接/智能纪要/脚本/sync_missing_txt.py
Normal file
182
02_卡人(水)/水桥_平台对接/智能纪要/脚本/sync_missing_txt.py
Normal file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""对比 聊天记录/soul 下已有 txt 与妙记列表,只下载缺失场次的文字。"""
|
||||
import re
|
||||
import sys
|
||||
import sqlite3
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
TXT_DIR = Path("/Users/karuo/Documents/聊天记录/soul")
|
||||
COOKIE_PATH = Path.home() / "Library/Application Support/Cursor/Partitions/cursor-browser/Cookies"
|
||||
LIST_URL = "https://cunkebao.feishu.cn/minutes/api/space/list"
|
||||
EXPORT_URL = "https://cunkebao.feishu.cn/minutes/api/export"
|
||||
MAX_PAGES = 30
|
||||
PAGE_SIZE = 50
|
||||
|
||||
|
||||
def get_cookie():
|
||||
if not COOKIE_PATH.exists():
|
||||
return "", ""
|
||||
tmp = tempfile.mktemp(suffix=".db")
|
||||
shutil.copy2(COOKIE_PATH, tmp)
|
||||
try:
|
||||
conn = sqlite3.connect(tmp)
|
||||
rows = conn.execute(
|
||||
"SELECT name, value FROM cookies WHERE (host_key LIKE '%feishu%' OR host_key LIKE '%cunkebao%') AND value != ''"
|
||||
).fetchall()
|
||||
conn.close()
|
||||
finally:
|
||||
Path(tmp).unlink(missing_ok=True)
|
||||
cookie_str = "; ".join([f"{n}={v}" for n, v in rows])
|
||||
bv = ""
|
||||
for key in ("bv_csrf_token=", "minutes_csrf_token="):
|
||||
i = cookie_str.find(key)
|
||||
if i != -1:
|
||||
s = i + len(key)
|
||||
e = cookie_str.find(";", s)
|
||||
val = cookie_str[s : e if e != -1 else len(cookie_str)].strip()
|
||||
if len(val) == 36:
|
||||
bv = val
|
||||
break
|
||||
return cookie_str, bv
|
||||
|
||||
|
||||
def have_pairs_from_dir():
|
||||
have = set()
|
||||
for f in TXT_DIR.iterdir():
|
||||
if not f.is_file() or f.suffix.lower() != ".txt":
|
||||
continue
|
||||
nums = re.findall(r"(\d+)场", f.name)
|
||||
dates = re.findall(r"(20\d{6})", f.name)
|
||||
for n in nums:
|
||||
for d in dates:
|
||||
have.add((int(n), d))
|
||||
if not dates:
|
||||
have.add((int(n), ""))
|
||||
return have
|
||||
|
||||
|
||||
def topic_to_pair(topic):
|
||||
nums = re.findall(r"(\d+)场", topic)
|
||||
dates = re.findall(r"(20\d{6})", topic)
|
||||
if not nums:
|
||||
return None, None
|
||||
return int(nums[0]), (dates[0] if dates else "")
|
||||
|
||||
|
||||
def sanitize(topic):
|
||||
s = topic.strip()
|
||||
for c in r'\/:*?"<>|':
|
||||
s = s.replace(c, "_")
|
||||
return s[:85].strip()
|
||||
|
||||
|
||||
def fetch_list(headers):
|
||||
all_items = []
|
||||
last_ts = ""
|
||||
for page in range(1, MAX_PAGES + 1):
|
||||
url = f"{LIST_URL}?size={PAGE_SIZE}&space_name=1"
|
||||
if last_ts:
|
||||
url += f"&last_time={last_ts}"
|
||||
r = requests.get(url, headers=headers, timeout=30)
|
||||
if r.status_code != 200:
|
||||
break
|
||||
data = r.json()
|
||||
if data.get("code") != 0:
|
||||
break
|
||||
items = data.get("data", {}).get("list", [])
|
||||
if not items:
|
||||
break
|
||||
all_items.extend(items)
|
||||
last_ts = items[-1].get("create_time", "")
|
||||
if len(items) < PAGE_SIZE:
|
||||
break
|
||||
time.sleep(0.25)
|
||||
return all_items
|
||||
|
||||
|
||||
def export_txt(headers, object_token):
|
||||
params = {"object_token": object_token, "format": 2, "add_speaker": "true", "add_timestamp": "false"}
|
||||
r = requests.post(EXPORT_URL, params=params, headers=headers, timeout=25)
|
||||
r.encoding = "utf-8"
|
||||
if r.status_code == 200 and (r.text or "").strip():
|
||||
return (r.text or "").strip()
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
ap = argparse.ArgumentParser(description="同步缺失场次文字到 聊天记录/soul")
|
||||
ap.add_argument("--max-download", type=int, default=0, help="最多下载条数,0=全部")
|
||||
ap.add_argument("--dry-run", action="store_true", help="只列缺失不下载")
|
||||
args = ap.parse_args()
|
||||
|
||||
TXT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
cookie_str, bv = get_cookie()
|
||||
if len(cookie_str) < 100:
|
||||
print("无法获取 Cookie(请用 Cursor 打开过飞书妙记)", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0",
|
||||
"Cookie": cookie_str,
|
||||
"Referer": "https://cunkebao.feishu.cn/minutes/",
|
||||
}
|
||||
if bv:
|
||||
headers["bv-csrf-token"] = bv
|
||||
|
||||
have = have_pairs_from_dir()
|
||||
print(f"目录已有场次对: {len(have)}", flush=True)
|
||||
|
||||
all_items = fetch_list(headers)
|
||||
print(f"API 拉取: {len(all_items)} 条", flush=True)
|
||||
|
||||
missing = []
|
||||
seen_tokens = set()
|
||||
for it in all_items:
|
||||
topic = it.get("topic", "")
|
||||
token = it.get("object_token", "")
|
||||
if not token or token in seen_tokens:
|
||||
continue
|
||||
n, d = topic_to_pair(topic)
|
||||
if n is None:
|
||||
continue
|
||||
if (n, d) in have or (n, "") in have:
|
||||
continue
|
||||
seen_tokens.add(token)
|
||||
missing.append({"topic": topic, "object_token": token, "n": n, "d": d})
|
||||
|
||||
print(f"缺失需下载: {len(missing)} 条(已按 object_token 去重)", flush=True)
|
||||
if not missing:
|
||||
return 0
|
||||
if args.dry_run:
|
||||
for m in missing[:50]:
|
||||
print(f" {m['n']}场 {m['d']} {m['topic'][:55]}", flush=True)
|
||||
if len(missing) > 50:
|
||||
print(f" ... 共 {len(missing)} 条", flush=True)
|
||||
return 0
|
||||
|
||||
to_do = missing[: args.max_download] if args.max_download else missing
|
||||
ok = 0
|
||||
for i, m in enumerate(to_do):
|
||||
topic = m["topic"]
|
||||
token = m["object_token"]
|
||||
body = export_txt(headers, token)
|
||||
if body and len(body) > 50:
|
||||
base = sanitize(topic) + ".txt"
|
||||
path = TXT_DIR / base
|
||||
path.write_text("标题: " + topic + "\n\n" + body, encoding="utf-8")
|
||||
print(f" [{i+1}/{len(to_do)}] OK {m['n']}场 -> {base[:50]}", flush=True)
|
||||
ok += 1
|
||||
else:
|
||||
print(f" [{i+1}/{len(to_do)}] 跳过(无转写) {m['n']}场 {topic[:40]}", flush=True)
|
||||
time.sleep(0.4)
|
||||
print(f"完成: 新写入 {ok} 个 txt(本次处理 {len(to_do)},剩余 {len(missing)-len(to_do)} 可再次运行本脚本补全)", flush=True)
|
||||
return ok
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user