manifest.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. """Helpers for building ``_manifest.json`` for docling raw bundles.
  2. Wraps the generic :class:`Manifest` schema with docling-specific knowledge:
  3. - the critical file is the main ``<stem>.json`` produced by docling-serve,
  4. - non-critical files are the markdown + every entry under ``artifacts/``,
  5. - ``extras`` carries the fixed pipeline constants so the options signature
  6. remains reproducible across runs.
  7. """
  8. from __future__ import annotations
  9. from datetime import datetime, timezone
  10. from pathlib import Path
  11. from lightrag.parser.external._common import compute_size_and_hash
  12. from lightrag.parser.external._manifest import (
  13. MANIFEST_FILENAME,
  14. Manifest,
  15. ManifestFile,
  16. write_manifest,
  17. )
  18. from lightrag.parser.external.docling import MANIFEST_ENGINE
  19. def select_main_json(raw_dir: Path, source_file_path: Path) -> Path:
  20. """Locate the primary docling JSON inside ``raw_dir``.
  21. Priority: ``<source_stem>.json`` if present, else the single ``*.json``
  22. sitting at ``raw_dir`` root (excluding ``_manifest.json``, which always
  23. sits in the bundle once a download has completed and would otherwise
  24. collide with the bundle JSON in the fallback). Raises ``RuntimeError``
  25. if zero or multiple candidates exist.
  26. """
  27. preferred = raw_dir / f"{source_file_path.stem}.json"
  28. if preferred.is_file():
  29. return preferred
  30. candidates = sorted(
  31. p for p in raw_dir.glob("*.json") if p.is_file() and p.name != MANIFEST_FILENAME
  32. )
  33. if len(candidates) == 1:
  34. return candidates[0]
  35. if not candidates:
  36. raise RuntimeError(f"Docling raw bundle at {raw_dir} contains no .json file")
  37. names = ", ".join(p.name for p in candidates)
  38. raise RuntimeError(
  39. f"Docling raw bundle at {raw_dir} has multiple .json candidates ({names}); "
  40. f"expected exactly one to derive the critical file from"
  41. )
  42. def select_main_md(raw_dir: Path, source_file_path: Path) -> Path | None:
  43. """Locate the markdown twin of the main JSON. Returns ``None`` if no
  44. markdown was produced (defensive — docling-serve always emits one for
  45. ``to_formats=["json","md"]`` but we don't want to crash if it is
  46. missing)."""
  47. preferred = raw_dir / f"{source_file_path.stem}.md"
  48. if preferred.is_file():
  49. return preferred
  50. candidates = sorted(p for p in raw_dir.glob("*.md") if p.is_file())
  51. return candidates[0] if candidates else None
  52. def build_and_write_docling_manifest(
  53. raw_dir: Path,
  54. *,
  55. source_file_path: Path,
  56. task_id: str,
  57. endpoint_signature: str,
  58. engine_version: str,
  59. options_signature: str,
  60. fixed_constants: dict[str, object],
  61. recorded_filename: str | None = None,
  62. ) -> Manifest:
  63. """Construct the manifest for a freshly downloaded docling bundle and
  64. persist it atomically. Returns the in-memory manifest for callers that
  65. need the task_id / signatures for logging.
  66. ``recorded_filename`` is the name passed to docling-serve at upload
  67. time (canonical, hint-stripped form when called from the pipeline).
  68. It governs both the preferred-path lookup for the bundle JSON and the
  69. value persisted as ``source_filename_at_parse``. When ``None``, falls
  70. back to ``source_file_path.name`` for backward compatibility.
  71. """
  72. lookup_path = Path(recorded_filename) if recorded_filename else source_file_path
  73. main_json = select_main_json(raw_dir, lookup_path)
  74. crit_size, crit_hash = compute_size_and_hash(main_json)
  75. critical = ManifestFile(
  76. path=main_json.relative_to(raw_dir).as_posix(),
  77. size=crit_size,
  78. sha256=crit_hash,
  79. )
  80. others: list[ManifestFile] = []
  81. for path in sorted(raw_dir.rglob("*")):
  82. if not path.is_file():
  83. continue
  84. rel = path.relative_to(raw_dir).as_posix()
  85. if rel == critical.path or rel.startswith("_manifest"):
  86. continue
  87. others.append(ManifestFile(path=rel, size=path.stat().st_size))
  88. source_size, source_hash = compute_size_and_hash(source_file_path)
  89. total = crit_size + sum(f.size for f in others)
  90. manifest = Manifest(
  91. engine=MANIFEST_ENGINE,
  92. source_content_hash=source_hash,
  93. source_size_bytes=source_size,
  94. source_filename_at_parse=recorded_filename or source_file_path.name,
  95. critical_file=critical,
  96. files=others,
  97. total_size_bytes=total,
  98. task_id=task_id,
  99. endpoint_signature=endpoint_signature,
  100. engine_version=engine_version,
  101. options_signature=options_signature,
  102. downloaded_at=datetime.now(timezone.utc).isoformat(timespec="seconds"),
  103. extras={"fixed_constants": dict(fixed_constants)},
  104. )
  105. write_manifest(raw_dir, manifest)
  106. return manifest
# Public API of this module: the manifest builder and the two bundle-file
# selectors defined in this file.
__all__ = [
    "build_and_write_docling_manifest",
    "select_main_json",
    "select_main_md",
]