#!/usr/bin/env python3
"""
reformat_parse.py — Convert an existing document into content.json,
then hand off to the CREATE pipeline (render_body.py).

Supported input formats:
  .md / .markdown / .txt — Markdown / plain text
  .pdf                   — Extract text from existing PDF (layout preserved as best-effort)
  .json                  — Pass-through if already content.json format

Usage:
  python3 reformat_parse.py --input doc.md    --out content.json
  python3 reformat_parse.py --input old.pdf   --out content.json
  python3 reformat_parse.py --input data.json --out content.json

Then pipe into the CREATE pipeline:
  python3 render_body.py --tokens tokens.json --content content.json --out body.pdf

Or use make.sh reformat which does both steps:
  bash make.sh reformat --input doc.md --type report --title "My Report" --out output.pdf

Exit codes: 0 success, 1 bad args / unsupported format, 2 dep missing, 3 parse error
"""
import argparse
import importlib.util
import json
import os
import re
import string
import sys
from pathlib import Path
  26. def ensure_deps():
  27. missing = []
  28. if importlib.util.find_spec("pypdf") is None:
  29. missing.append("pypdf")
  30. if missing:
  31. import subprocess
  32. subprocess.check_call(
  33. [sys.executable, "-m", "pip", "install", "--break-system-packages", "-q"] + missing
  34. )
  35. ensure_deps()
  36. # ── Markdown / plain text parser ───────────────────────────────────────────────
  37. def parse_markdown(text: str) -> list:
  38. """
  39. Convert Markdown to content.json blocks.
  40. Supports: # headings, **bold**, bullet lists, > blockquotes (→ callout),
  41. | tables |, plain paragraphs.
  42. """
  43. blocks = []
  44. lines = text.splitlines()
  45. i = 0
  46. def flush_para(buf: list):
  47. t = " ".join(buf).strip()
  48. if t:
  49. blocks.append({"type": "body", "text": _md_inline(t)})
  50. para_buf = []
  51. while i < len(lines):
  52. line = lines[i]
  53. stripped = line.strip()
  54. # Blank line — flush paragraph buffer
  55. if not stripped:
  56. flush_para(para_buf)
  57. para_buf = []
  58. i += 1
  59. continue
  60. # ATX Headings: # ## ###
  61. m = re.match(r'^(#{1,3})\s+(.*)', stripped)
  62. if m:
  63. flush_para(para_buf)
  64. para_buf = []
  65. level = len(m.group(1))
  66. htype = {1: "h1", 2: "h2", 3: "h3"}.get(level, "h3")
  67. blocks.append({"type": htype, "text": _md_inline(m.group(2))})
  68. i += 1
  69. continue
  70. # Display math block: $$expr$$ on one line, or opening $$ ... closing $$
  71. if stripped.startswith("$$"):
  72. flush_para(para_buf)
  73. para_buf = []
  74. inline_expr = stripped[2:].rstrip("$").strip()
  75. if inline_expr:
  76. # Single-line: $$E = mc^2$$
  77. blocks.append({"type": "math", "text": inline_expr})
  78. i += 1
  79. else:
  80. # Multi-line: opening $$ alone, then expression lines, then closing $$
  81. math_lines = []
  82. i += 1
  83. while i < len(lines) and lines[i].strip() != "$$":
  84. math_lines.append(lines[i])
  85. i += 1
  86. if i < len(lines):
  87. i += 1 # skip closing $$
  88. blocks.append({"type": "math", "text": "\n".join(math_lines).strip()})
  89. continue
  90. # Fenced code block: ``` or ~~~
  91. if stripped.startswith("```") or stripped.startswith("~~~"):
  92. flush_para(para_buf)
  93. para_buf = []
  94. fence = stripped[:3]
  95. code_lines = []
  96. i += 1
  97. while i < len(lines) and not lines[i].strip().startswith(fence):
  98. code_lines.append(lines[i])
  99. i += 1
  100. if i < len(lines):
  101. i += 1 # skip closing fence
  102. blocks.append({"type": "code", "text": "\n".join(code_lines)})
  103. continue
  104. # Blockquote → callout
  105. if stripped.startswith(">"):
  106. flush_para(para_buf)
  107. para_buf = []
  108. qt = re.sub(r'^>\s*', '', stripped)
  109. blocks.append({"type": "callout", "text": _md_inline(qt)})
  110. i += 1
  111. continue
  112. # Unordered bullet: -, *, +
  113. if re.match(r'^[-*+]\s+', stripped):
  114. flush_para(para_buf)
  115. para_buf = []
  116. text_part = re.sub(r'^[-*+]\s+', '', stripped)
  117. blocks.append({"type": "bullet", "text": _md_inline(text_part)})
  118. i += 1
  119. continue
  120. # Ordered list: 1. 2. etc. → numbered (preserves counter in render_body)
  121. if re.match(r'^\d+\.\s+', stripped):
  122. flush_para(para_buf)
  123. para_buf = []
  124. text_part = re.sub(r'^\d+\.\s+', '', stripped)
  125. blocks.append({"type": "numbered", "text": _md_inline(text_part)})
  126. i += 1
  127. continue
  128. # Table: | col | col |
  129. if stripped.startswith("|"):
  130. flush_para(para_buf)
  131. para_buf = []
  132. table_lines = []
  133. while i < len(lines) and lines[i].strip().startswith("|"):
  134. table_lines.append(lines[i].strip())
  135. i += 1
  136. # Remove separator rows (|---|---|)
  137. data_rows = [r for r in table_lines if not re.match(r'^\|[-:| ]+\|$', r)]
  138. parsed = []
  139. for row in data_rows:
  140. cells = [c.strip() for c in row.strip("|").split("|")]
  141. parsed.append(cells)
  142. if len(parsed) >= 2:
  143. blocks.append({
  144. "type": "table",
  145. "headers": parsed[0],
  146. "rows": parsed[1:],
  147. })
  148. elif len(parsed) == 1:
  149. # Single row — treat as paragraph
  150. blocks.append({"type": "body", "text": " | ".join(parsed[0])})
  151. continue
  152. # Horizontal rule → spacer
  153. if re.match(r'^[-*_]{3,}$', stripped):
  154. flush_para(para_buf)
  155. para_buf = []
  156. blocks.append({"type": "spacer", "pt": 16})
  157. i += 1
  158. continue
  159. # Plain text → accumulate into paragraph
  160. para_buf.append(stripped)
  161. i += 1
  162. flush_para(para_buf)
  163. return blocks
  164. def _md_inline(text: str) -> str:
  165. """Convert inline Markdown to ReportLab XML markup."""
  166. # Bold: **text** or __text__
  167. text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
  168. text = re.sub(r'__(.+?)__', r'<b>\1</b>', text)
  169. # Italic: *text* or _text_
  170. text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)
  171. text = re.sub(r'_(.+?)_', r'<i>\1</i>', text)
  172. # Inline code: `code`
  173. text = re.sub(r'`(.+?)`', r'<font name="Courier">\1</font>', text)
  174. # Strip markdown links, keep text
  175. text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)
  176. return text
  177. # ── PDF text extractor ─────────────────────────────────────────────────────────
  178. def parse_pdf(pdf_path: str) -> list:
  179. """
  180. Extract text from an existing PDF and convert to content.json blocks.
  181. Best-effort: detects headings by font size heuristics if available,
  182. otherwise falls back to paragraph splitting.
  183. """
  184. from pypdf import PdfReader
  185. reader = PdfReader(pdf_path)
  186. all_text = []
  187. for page in reader.pages:
  188. text = page.extract_text()
  189. if text:
  190. all_text.append(text.strip())
  191. full_text = "\n\n".join(all_text)
  192. # Treat extracted PDF text as plain text / light markdown
  193. # (most PDFs lose formatting — we do our best)
  194. return parse_plain(full_text)
  195. def parse_plain(text: str) -> list:
  196. """
  197. Heuristic plain-text parser.
  198. Short ALL-CAPS or title-case lines → headings.
  199. Everything else → paragraphs.
  200. """
  201. blocks = []
  202. paragraphs = re.split(r'\n{2,}', text.strip())
  203. for para in paragraphs:
  204. para = para.strip()
  205. if not para:
  206. continue
  207. lines = para.splitlines()
  208. # Single short line that looks like a heading
  209. if len(lines) == 1 and len(para) < 80:
  210. if para.isupper() or re.match(r'^[A-Z][^.!?]*$', para):
  211. blocks.append({"type": "h1", "text": para.title()})
  212. continue
  213. # Bullet lists
  214. if lines[0].startswith(("- ", "• ", "* ")):
  215. for line in lines:
  216. text_part = re.sub(r'^[-•*]\s+', '', line.strip())
  217. if text_part:
  218. blocks.append({"type": "bullet", "text": text_part})
  219. continue
  220. # Regular paragraph
  221. blocks.append({"type": "body", "text": " ".join(lines)})
  222. return blocks
  223. # ── Pass-through validator ─────────────────────────────────────────────────────
  224. VALID_TYPES = {"h1","h2","h3","body","bullet","numbered","callout","table",
  225. "image","code","math","divider","caption","pagebreak","spacer"}
  226. def validate_content_json(data: list) -> tuple[list, list]:
  227. """Return (valid_blocks, warnings)."""
  228. valid, warnings = [], []
  229. for i, block in enumerate(data):
  230. if not isinstance(block, dict):
  231. warnings.append(f"Block {i}: not a dict, skipped")
  232. continue
  233. btype = block.get("type")
  234. if btype not in VALID_TYPES:
  235. warnings.append(f"Block {i}: unknown type '{btype}', kept as-is")
  236. valid.append(block)
  237. return valid, warnings
  238. # ── Dispatcher ─────────────────────────────────────────────────────────────────
  239. def parse_file(input_path: str) -> tuple[list, list]:
  240. """Return (blocks, warnings)."""
  241. ext = Path(input_path).suffix.lower()
  242. if ext in (".md", ".txt", ".markdown"):
  243. with open(input_path, encoding="utf-8", errors="replace") as f:
  244. text = f.read()
  245. blocks = parse_markdown(text)
  246. return blocks, []
  247. if ext == ".pdf":
  248. blocks = parse_pdf(input_path)
  249. return blocks, ["PDF text extraction is best-effort — review content.json before rendering"]
  250. if ext == ".json":
  251. with open(input_path) as f:
  252. data = json.load(f)
  253. if isinstance(data, list):
  254. return validate_content_json(data)
  255. # Maybe it's a meta-wrapper {"content": [...]}
  256. if isinstance(data, dict) and "content" in data:
  257. return validate_content_json(data["content"])
  258. return [], [f"JSON file does not contain a list of content blocks"]
  259. return [], [f"Unsupported file type: {ext}. Supported: .md .txt .pdf .json"]
  260. # ── CLI ────────────────────────────────────────────────────────────────────────
  261. def main():
  262. parser = argparse.ArgumentParser(description="Parse a document into content.json")
  263. parser.add_argument("--input", required=True, help="Input file (.md, .txt, .pdf, .json)")
  264. parser.add_argument("--out", default="content.json", help="Output content.json path")
  265. args = parser.parse_args()
  266. if not os.path.exists(args.input):
  267. print(json.dumps({"status": "error", "error": f"File not found: {args.input}"}),
  268. file=sys.stderr)
  269. sys.exit(1)
  270. try:
  271. blocks, warnings = parse_file(args.input)
  272. except Exception as e:
  273. import traceback
  274. print(json.dumps({"status": "error", "error": str(e),
  275. "trace": traceback.format_exc()}), file=sys.stderr)
  276. sys.exit(3)
  277. if not blocks:
  278. print(json.dumps({
  279. "status": "error",
  280. "error": "No content blocks extracted",
  281. "warnings": warnings,
  282. }), file=sys.stderr)
  283. sys.exit(3)
  284. with open(args.out, "w", encoding="utf-8") as f:
  285. json.dump(blocks, f, indent=2, ensure_ascii=False)
  286. result = {
  287. "status": "ok",
  288. "out": args.out,
  289. "block_count": len(blocks),
  290. "warnings": warnings,
  291. }
  292. print(json.dumps(result, indent=2))
  293. print(f"\n── Parsed {args.input} ─────────────────────────────────────",
  294. file=sys.stderr)
  295. print(f" Blocks : {len(blocks)}", file=sys.stderr)
  296. type_counts: dict = {}
  297. for b in blocks:
  298. type_counts[b.get("type","?")] = type_counts.get(b.get("type","?"), 0) + 1
  299. for t, n in sorted(type_counts.items()):
  300. print(f" {t:12} × {n}", file=sys.stderr)
  301. if warnings:
  302. print(f" Warnings:", file=sys.stderr)
  303. for w in warnings:
  304. print(f" ⚠ {w}", file=sys.stderr)
  305. print(f"\n Next: bash make.sh run --content {args.out} --title '...' --type ...",
  306. file=sys.stderr)
  307. print("", file=sys.stderr)
  308. if __name__ == "__main__":
  309. main()