cli.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. """Unified sidecar debug CLI for native / mineru / docling parsers.
  2. Drives ``LightRAG.parse_<engine>`` against a single source file and writes
  3. the resulting sidecar (and raw bundle, for mineru/docling) into a flat
  4. layout — no ``__parsed__/`` middle layer, source file never archived —
  5. so the artifacts can be inspected next to the input file.

Invocation::

    python -m lightrag.parser.cli path/to/sample.docx --engine native
    python -m lightrag.parser.cli path/to/sample.pdf --engine mineru
    python -m lightrag.parser.cli path/to/sample.pdf --engine docling --force-reparse

  10. See ``docs/ParserDebugCLI-zh.md`` for the full reference.
  11. """
  12. from __future__ import annotations
  13. import argparse
  14. import asyncio
  15. import json
  16. import sys
  17. from contextlib import ExitStack
  18. from pathlib import Path
  19. from typing import Any
  20. from unittest import mock
  21. ENGINES = ("native", "mineru", "docling")
  22. def _build_parser() -> argparse.ArgumentParser:
  23. parser = argparse.ArgumentParser(
  24. prog="parse_sidecar",
  25. description=(
  26. "Run LightRAG.parse_<engine> on a single file and emit sidecar "
  27. "artifacts (plus a raw bundle for mineru/docling) into a flat "
  28. "layout alongside the source. No __parsed__/ middle layer; the "
  29. "source file is never moved."
  30. ),
  31. )
  32. parser.add_argument("input_file", type=Path, help="Source file to parse.")
  33. parser.add_argument(
  34. "--engine",
  35. required=True,
  36. choices=ENGINES,
  37. help="Parser engine to drive.",
  38. )
  39. parser.add_argument(
  40. "-o",
  41. "--sidecar-parent-dir",
  42. type=Path,
  43. default=None,
  44. help=(
  45. "Parent directory for <name>.parsed/ and <name>.<engine>_raw/. "
  46. "Default: the source file's parent directory."
  47. ),
  48. )
  49. parser.add_argument(
  50. "--doc-id",
  51. default=None,
  52. help="Override the doc id. Default: doc-<md5(absolute input path)>.",
  53. )
  54. parser.add_argument(
  55. "--force-reparse",
  56. action="store_true",
  57. help=(
  58. "Only affects mineru/docling. By default a non-empty raw_dir is "
  59. "treated as a valid cache and reused without manifest checks; "
  60. "this flag clears raw_dir and forces a fresh download/parse."
  61. ),
  62. )
  63. parser.add_argument(
  64. "--preview",
  65. type=int,
  66. default=5,
  67. metavar="N",
  68. help="Number of block rows to preview after parsing (0 disables).",
  69. )
  70. return parser
  71. def _print_summary(blocks_path: Path, raw_dir: Path | None, preview: int) -> None:
  72. with blocks_path.open("r", encoding="utf-8") as fh:
  73. meta_line = fh.readline().strip()
  74. if not meta_line:
  75. raise SystemExit(f"empty blocks file at {blocks_path}")
  76. meta = json.loads(meta_line)
  77. rows = [json.loads(line) for line in fh if line.strip()]
  78. parsed_dir = blocks_path.parent
  79. print(f"parsed dir : {parsed_dir} (exists={parsed_dir.exists()})")
  80. if raw_dir is not None:
  81. print(f"raw dir : {raw_dir} (exists={raw_dir.exists()})")
  82. print(f"document : {meta.get('document_name')}")
  83. print(f"doc_id : {meta.get('doc_id')}")
  84. print(f"engine : {meta.get('parse_engine')}")
  85. print(f"blocks : {meta.get('blocks')}")
  86. print(
  87. f"sidecars : tables={meta.get('table_file')} "
  88. f"drawings={meta.get('drawing_file')} "
  89. f"equations={meta.get('equation_file')} "
  90. f"asset_dir={meta.get('asset_dir')}"
  91. )
  92. if preview > 0 and rows:
  93. shown = min(preview, len(rows))
  94. print(f"--- preview (first {shown} of {len(rows)} blocks) ---")
  95. for row in rows[:preview]:
  96. heading = row.get("heading") or ""
  97. content = (row.get("content") or "").replace("\n", " ")
  98. snippet = content if len(content) <= 80 else content[:77] + "..."
  99. print(
  100. f" [{row.get('blockid', '')[:8]}] " f"heading={heading!r} :: {snippet}"
  101. )
  102. async def _run(args: argparse.Namespace) -> int:
  103. # Pipeline + heavy parser imports are deferred so ``--help`` and the
  104. # input-file existence check don't pay for them.
  105. from lightrag.constants import (
  106. FULL_DOCS_FORMAT_PENDING_PARSE,
  107. PARSER_ENGINE_SUFFIX_CAPABILITIES,
  108. )
  109. from lightrag.parser.debug import build_debug_rag
  110. from lightrag.utils import compute_mdhash_id
  111. import lightrag.pipeline as pipeline_mod
  112. import lightrag.utils_pipeline as utils_pipeline_mod
  113. source = args.input_file.resolve()
  114. if not source.is_file():
  115. print(f"error: input file does not exist: {source}", file=sys.stderr)
  116. return 1
  117. # Reject suffix/engine mismatches up-front: the pipeline would otherwise
  118. # fail deep inside the IR builder with a less helpful message.
  119. suffix = source.suffix.lstrip(".").lower()
  120. supported = PARSER_ENGINE_SUFFIX_CAPABILITIES.get(args.engine, frozenset())
  121. if suffix not in supported:
  122. print(
  123. f"error: engine '{args.engine}' does not support .{suffix or '<no suffix>'} "
  124. f"files (supported: {', '.join(sorted(supported))})",
  125. file=sys.stderr,
  126. )
  127. return 1
  128. sidecar_parent = (args.sidecar_parent_dir or source.parent).resolve()
  129. sidecar_parent.mkdir(parents=True, exist_ok=True)
  130. parsed_dir = sidecar_parent / f"{source.name}.parsed"
  131. raw_dir = (
  132. sidecar_parent / f"{source.name}.{args.engine}_raw"
  133. if args.engine in ("mineru", "docling")
  134. else None
  135. )
  136. doc_id = args.doc_id or compute_mdhash_id(str(source), prefix="doc-")
  137. def _patched_artifact_dir(
  138. file_path: str | None = None,
  139. *,
  140. parent_hint: Any | None = None,
  141. ) -> Path:
  142. # Flatten the production "<INPUT_DIR>/__parsed__/<base>.parsed/"
  143. # layout to "<sidecar_parent>/<source.name>.parsed/" so the sidecar
  144. # and the source file sit side by side.
  145. return parsed_dir
  146. def _lenient_bundle(raw_dir_arg: Path, _source_file: Path) -> bool:
  147. return raw_dir_arg.exists() and any(raw_dir_arg.iterdir())
  148. def _force_miss(*_args: Any, **_kwargs: Any) -> bool:
  149. return False
  150. bundle_check = _force_miss if args.force_reparse else _lenient_bundle
  151. async def _noop_archive(*_args: Any, **_kwargs: Any) -> None:
  152. return None
  153. rag = build_debug_rag()
  154. parse_method = getattr(rag, f"parse_{args.engine}")
  155. with ExitStack() as stack:
  156. # Patch 1: redirect sidecar output to the flat layout.
  157. # parsed_artifact_dir_for is from-imported into pipeline at
  158. # module load, so patch both namespaces.
  159. stack.enter_context(
  160. mock.patch.object(
  161. utils_pipeline_mod,
  162. "parsed_artifact_dir_for",
  163. _patched_artifact_dir,
  164. )
  165. )
  166. stack.enter_context(
  167. mock.patch.object(
  168. pipeline_mod,
  169. "parsed_artifact_dir_for",
  170. _patched_artifact_dir,
  171. )
  172. )
  173. # Patch 2: raw cache strategy. parse_mineru / parse_docling do a
  174. # function-local ``from lightrag.parser.external.<eng> import
  175. # is_bundle_valid``, so we replace the name on the facade module.
  176. if args.engine == "mineru":
  177. import lightrag.parser.external.mineru as mineru_pkg
  178. stack.enter_context(
  179. mock.patch.object(mineru_pkg, "is_bundle_valid", bundle_check)
  180. )
  181. elif args.engine == "docling":
  182. import lightrag.parser.external.docling as docling_pkg
  183. stack.enter_context(
  184. mock.patch.object(docling_pkg, "is_bundle_valid", bundle_check)
  185. )
  186. # Patch 3: keep the source file in place. All three parse_* methods
  187. # call archive_docx_source_after_full_docs_sync at the end.
  188. stack.enter_context(
  189. mock.patch.object(
  190. pipeline_mod,
  191. "archive_docx_source_after_full_docs_sync",
  192. _noop_archive,
  193. )
  194. )
  195. result = await parse_method(
  196. doc_id,
  197. str(source),
  198. {
  199. "parse_format": FULL_DOCS_FORMAT_PENDING_PARSE,
  200. "content": "",
  201. },
  202. )
  203. blocks_path = Path(result["blocks_path"])
  204. _print_summary(blocks_path, raw_dir, args.preview)
  205. return 0
  206. def main(argv: list[str] | None = None) -> int:
  207. args = _build_parser().parse_args(argv)
  208. return asyncio.run(_run(args))
  209. if __name__ == "__main__":
  210. sys.exit(main())