debug.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
"""Shared debug LightRAG stand-in for the parse_* entry points.

A minimal ``LightRAG`` stand-in plus a deterministic ``datetime`` shim,
shared by the unified parser debug CLI (``lightrag/parser/cli.py``),
the golden-fixture regen script (``scripts/regen_native_docx_golden.py``),
and the byte-equivalence golden tests
(``tests/parser/docx/test_native_docx_golden.py``).

All three engines (``native`` / ``mineru`` / ``docling``) read the same
``self`` surface (``_persist_parsed_full_docs``, ``_resolve_source_file_for_parser``,
``self.full_docs``, ``self.doc_status``), so a single stand-in covers every
``parse_*`` method — when one of them grows a new dependency, extend
this module rather than copy-pasting parallel stubs into each call site.
"""
  13. from __future__ import annotations
  14. from datetime import datetime, timezone
  15. from typing import Any
  16. class DebugFullDocs:
  17. """In-memory ``full_docs`` shim — captures the persisted record."""
  18. def __init__(self) -> None:
  19. self.data: dict[str, Any] = {}
  20. async def upsert(self, payload: dict[str, Any]) -> None:
  21. self.data.update(payload)
  22. async def get_by_id(self, doc_id: str) -> Any:
  23. return self.data.get(doc_id)
  24. async def index_done_callback(self) -> None:
  25. return None
  26. class DebugDocStatus:
  27. """No-op ``doc_status`` shim — the parse_* methods never read/write it."""
  28. async def get_by_id(self, doc_id: str) -> Any:
  29. return None
  30. async def upsert(self, data: dict[str, Any]) -> None:
  31. return None
  32. def build_debug_rag():
  33. """Build a minimal LightRAG stand-in that exposes what ``parse_*`` reads.
  34. The import of ``LightRAG`` is intentionally function-local: deferring
  35. it avoids a circular import when this helper is loaded during package
  36. init (the parser CLI resolves ``lightrag.parser.debug`` before
  37. ``lightrag`` itself is fully bound).
  38. LightRAG-side attributes the three ``parse_*`` methods read off ``self`` —
  39. every entry MUST be provided by this stand-in, or the debug CLI / golden
  40. tests / regen script will all break in sync:
  41. - **methods** (rebound from :class:`LightRAG`):
  42. - ``_persist_parsed_full_docs(doc_id, payload)`` — async; touches
  43. ``self.full_docs``.
  44. - ``_resolve_source_file_for_parser(file_path)`` — returns the
  45. on-disk source path. Stubbed to identity here since the CLI / tests
  46. feed an already-resolved path.
  47. - **storages**:
  48. - ``self.full_docs.upsert(...)`` / ``.get_by_id(...)`` /
  49. ``.index_done_callback()`` — :class:`DebugFullDocs` covers all three.
  50. - ``self.doc_status.get_by_id(...)`` / ``.upsert(...)`` —
  51. :class:`DebugDocStatus` covers both.
  52. When any of the three ``LightRAG.parse_*`` methods grows a new
  53. dependency on ``self``, extend this stand-in (and update the list
  54. above) rather than copy-pasting a parallel stub into the call sites.
  55. """
  56. from lightrag import LightRAG
  57. class _DebugRag:
  58. _persist_parsed_full_docs = LightRAG._persist_parsed_full_docs
  59. parse_native = LightRAG.parse_native
  60. parse_mineru = LightRAG.parse_mineru
  61. parse_docling = LightRAG.parse_docling
  62. def __init__(self) -> None:
  63. self.full_docs = DebugFullDocs()
  64. self.doc_status = DebugDocStatus()
  65. def _resolve_source_file_for_parser(self, file_path: str) -> str:
  66. return file_path
  67. return _DebugRag()
  68. _FROZEN_NOW = datetime(2026, 1, 1, tzinfo=timezone.utc)
  69. class FrozenDateTime(datetime):
  70. """Pin ``datetime.now`` so ``write_sidecar`` stamps a deterministic time."""
  71. @classmethod
  72. def now(cls, tz=None): # noqa: D401
  73. return _FROZEN_NOW if tz is None else _FROZEN_NOW.astimezone(tz)