Files
soul-yongping/scripts/sync_chapter_images_from_md.py

191 lines
6.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从书稿 Markdown 中仅解析图片引用并上传到现网，再生成 HTML 写入 chapters.content。
- 只处理 `![](相对/绝对路径.png)` 等常见图片后缀；**不解析、不上传视频/附件**。
- 已是 `http(s)://` 的地址不重复上传，原样写入 `<img src="...">`。
- 非图片后缀的 `![]()`：当作普通正文一行输出，不尝试上传。
用法:
cd 一场soul的创业实验-永平
python3 scripts/sync_chapter_images_from_md.py --id 10.22 \\
--md "/path/to/第130场….md"
依赖: pip install pymysql requests
环境变量: SOUL_API_BASE 默认 https://soulapi.quwanzhi.com
"""
from __future__ import annotations
import argparse
import html
import importlib.util
import os
import re
import sys
from pathlib import Path
# Directory two levels above this script file (the parent of the directory
# that contains the script).
ROOT = Path(__file__).resolve().parent.parent

# Lower-case file suffixes that are treated as uploadable images.
IMAGE_EXT = frozenset((".png", ".jpg", ".jpeg", ".gif", ".webp"))

# Matches a line that contains nothing but a single Markdown image:
# ![alt](target).  Group 1 is the alt text, group 2 the target.
LINE_IMAGE_ONLY = re.compile(r"^\s*!\[([^\]]*)\]\(([^)]+)\)\s*$")
# Third-party dependencies are imported behind a guard so that a missing
# package prints an install hint to stderr and exits with status 1.
try:
    import pymysql
    import requests
except ImportError as exc:
    print("需要: pip install pymysql requests", exc, file=sys.stderr)
    sys.exit(1)
def load_db_config() -> dict:
mig = ROOT / "scripts" / "migrate_2026_sections.py"
spec = importlib.util.spec_from_file_location("_mig_db", mig)
mod = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(mod)
cfg = getattr(mod, "DB_CONFIG", None)
if not isinstance(cfg, dict):
sys.exit("migrate_2026_sections.py 中无有效 DB_CONFIG")
return cfg
def resolve_local_path(md_path: Path, ref: str) -> Path | None:
ref = ref.strip()
if not ref or ref.startswith(("http://", "https://")):
return None
p = (md_path.parent / ref).expanduser().resolve()
if p.is_file():
return p
return None
def guess_mime(path: Path) -> str:
    """Return the MIME type for *path*, judged by its suffix alone.

    The comparison is case-insensitive.  Suffixes other than the known
    image ones map to ``application/octet-stream``.
    """
    suffix = path.suffix.lower()
    if suffix == ".png":
        return "image/png"
    if suffix in (".jpg", ".jpeg"):
        return "image/jpeg"
    if suffix == ".gif":
        return "image/gif"
    if suffix == ".webp":
        return "image/webp"
    return "application/octet-stream"
def upload_image(local: Path, api_base: str) -> str:
    """Upload one local image and return the URL reported by the server.

    The file is sent as a multipart ``file`` field, together with the form
    field ``folder=book-images``, to ``<api_base>/api/upload``.

    Args:
        local: Path of the image file to upload.
        api_base: API root address; trailing slashes are ignored.

    Returns:
        The URL found in the JSON response, taken from the top-level
        ``url`` key or, failing that, from ``data.url``.

    Raises:
        RuntimeError: If the response does not report success, or carries
            no URL.
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    endpoint = api_base.rstrip("/") + "/api/upload"
    content_type = guess_mime(local)

    with local.open("rb") as fh:
        resp = requests.post(
            endpoint,
            files={"file": (local.name, fh, content_type)},
            data={"folder": "book-images"},
            timeout=120,
        )
    resp.raise_for_status()

    payload = resp.json()
    if not payload.get("success"):
        reason = payload.get("error") or payload.get("message") or str(payload)
        raise RuntimeError(reason)

    remote = payload.get("url")
    if not remote:
        # Fall back to the nested form {"data": {"url": ...}}.
        nested = payload.get("data") or {}
        remote = nested.get("url")
    if not remote:
        raise RuntimeError("响应无 url: " + str(payload)[:500])
    return str(remote)
def md_to_html(md_path: Path, api_base: str) -> str:
    """Convert a Markdown manuscript into simple paragraph-based HTML.

    Processing is line oriented:

    * A first line starting with ``#`` is dropped.
    * A line that is exactly ``---`` and blank lines become paragraph
      breaks; runs of breaks are collapsed into one.
    * A line consisting only of ``![alt](target)`` becomes an ``<img>``
      paragraph.  ``http(s)://`` targets are used as-is; local image files
      are uploaded via ``upload_image`` (once per resolved path) and the
      returned URL is used.
    * A local target whose suffix is not an image suffix is emitted as
      plain escaped text and never uploaded.
    * Every other non-blank line becomes an HTML-escaped ``<p>``.

    Args:
        md_path: Path of the Markdown file to convert.
        api_base: API root address passed on to ``upload_image``.

    Returns:
        The generated HTML, stripped and terminated by a single newline.
    """
    # "utf-8-sig" reads plain UTF-8 as well, but also removes a leading BOM,
    # which would otherwise hide the "#" of the title line from the check below.
    raw = md_path.read_text(encoding="utf-8-sig")
    lines = raw.splitlines()
    if lines and lines[0].lstrip().startswith("#"):
        lines = lines[1:]

    chunks: list[str] = []
    upload_cache: dict[str, str] = {}  # resolved local path -> uploaded URL
    for line in lines:
        text = line.strip()
        if text == "---":
            chunks.append("")
            continue

        m = LINE_IMAGE_ONLY.match(line)
        if m is None:
            chunks.append(f"<p>{html.escape(text)}</p>" if text else "")
            continue

        alt, ref = m.group(1), m.group(2).strip()
        if ref.startswith(("http://", "https://")):
            chunks.append(
                f'<p><img src="{html.escape(ref)}" alt="{html.escape(alt)}"/></p>'
            )
            continue

        loc = resolve_local_path(md_path, ref)
        if loc is None:
            # A reference with an explicit non-image suffix (e.g. a video)
            # is passed through as text even when the file is absent; it is
            # not an "invalid image path".
            ref_ext = Path(ref).suffix.lower()
            if ref_ext and ref_ext not in IMAGE_EXT:
                chunks.append(f"<p>{html.escape(text)}</p>")
            else:
                chunks.append(f"<p>(图片路径无效:{html.escape(ref)}</p>")
            continue

        if loc.suffix.lower() not in IMAGE_EXT:
            # Not an image (e.g. a video): do not upload, keep the line as text.
            chunks.append(f"<p>{html.escape(text)}</p>")
            continue

        key = str(loc)
        if key not in upload_cache:
            print(f"上传图片: {loc.name}", flush=True)
            upload_cache[key] = upload_image(loc, api_base)
        src = upload_cache[key]
        chunks.append(
            f'<p><img src="{html.escape(src)}" alt="{html.escape(alt)}"/></p>'
        )

    # Collapse runs of empty chunks into a single separator and drop leading
    # ones, so that no redundant blank lines end up in the output.
    html_parts: list[str] = []
    for c in chunks:
        if c:
            html_parts.append(c)
        elif html_parts and html_parts[-1] != "":
            html_parts.append("")
    return "\n".join(html_parts).strip() + "\n"
def main() -> None:
    """Command-line entry point: render a Markdown file and store it.

    Parses ``--id``, ``--md``, ``--api-base`` and ``--dry-run``, converts
    the Markdown file with ``md_to_html`` and, unless ``--dry-run`` is
    given, writes the result into the ``chapters`` row with the given id.

    Raises:
        SystemExit: If the Markdown file does not exist, or the UPDATE did
            not affect exactly one row (the transaction is rolled back).
    """
    parser = argparse.ArgumentParser(description="MD 内图片上传并写回 chapters仅图片")
    parser.add_argument("--id", required=True, help="章节 id如 10.22")
    parser.add_argument("--md", type=Path, required=True, help="文章 .md 路径")
    parser.add_argument(
        "--api-base",
        default=os.environ.get("SOUL_API_BASE", "https://soulapi.quwanzhi.com"),
        help="API 根地址",
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="只打印 HTML 前 800 字,不写库"
    )
    args = parser.parse_args()

    md_path = args.md.expanduser().resolve()
    if not md_path.is_file():
        sys.exit(f"文件不存在: {md_path}")

    body = md_to_html(md_path, args.api_base)
    # NOTE(review): this is the character length of the generated HTML
    # (tags included), stored in the word_count column — confirm intended.
    word_count = len(body)

    if args.dry_run:
        print(body[:800])
        print("… dry-runword_count=", word_count)
        return

    cfg = load_db_config()
    conn = pymysql.connect(**cfg)
    try:
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE chapters SET content = %s, word_count = %s, updated_at = NOW() WHERE id = %s",
                (body, word_count, args.id),
            )
            affected = cur.rowcount
        if affected != 1:
            conn.rollback()
            sys.exit(f"更新失败id={args.id} rowcount={affected}")
        conn.commit()
    finally:
        # Release the connection on every path, including errors and the
        # SystemExit above; skip if the driver already closed it.
        if conn.open:
            conn.close()
    print(f"已更新 {args.id} | word_count={word_count}")


if __name__ == "__main__":
    main()