_manifest.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. """Shared ``_manifest.json`` schema for ``parser/external/<engine>/`` bundles.
  2. The manifest is the *atomic success marker* for a raw bundle. Its presence
  3. implies "all files in this directory finished downloading"; its content is
  4. the cache key for "is this bundle for the same source file, the same engine
  5. version, the same endpoint, and the same option signature we are using right
  6. now?".
  7. Write path: :func:`write_manifest` writes a temp file then atomically renames
  8. to ``_manifest.json``. A crash mid-download leaves no manifest, so the next
  9. parse call cleanly invalidates and re-downloads.
  10. Read path: :func:`load_manifest` returns ``None`` if absent, malformed, or
  11. recorded under a different engine — either way the bundle is treated as
  12. stale.
  13. """
  14. from __future__ import annotations
  15. import json
  16. import os
  17. from dataclasses import asdict, dataclass, field
  18. from pathlib import Path
  19. MANIFEST_FILENAME = "_manifest.json"
  20. MANIFEST_VERSION = "1.0"
  21. @dataclass
  22. class ManifestFile:
  23. """One file entry inside the bundle. Size always; sha256 only for files
  24. where silent corruption would break the adapter (the "critical" file).
  25. """
  26. path: str # relative to the raw dir
  27. size: int
  28. sha256: str | None = None # ``"sha256:<hex>"`` or ``None``
  29. @dataclass
  30. class Manifest:
  31. """Generic manifest schema. ``engine`` is filled by the caller (docling /
  32. mineru / etc.); ``options_signature`` lets per-engine cache layers detect
  33. when env-driven request parameters changed without bumping the version.
  34. """
  35. engine: str
  36. source_content_hash: str
  37. source_size_bytes: int
  38. source_filename_at_parse: str
  39. critical_file: ManifestFile
  40. files: list[ManifestFile]
  41. total_size_bytes: int
  42. task_id: str = ""
  43. api_mode: str = ""
  44. engine_version: str = ""
  45. endpoint_signature: str = ""
  46. options_signature: str = ""
  47. downloaded_at: str = ""
  48. extras: dict = field(default_factory=dict)
  49. version: str = MANIFEST_VERSION
  50. def to_dict(self) -> dict:
  51. return {
  52. "version": self.version,
  53. "engine": self.engine,
  54. "api_mode": self.api_mode,
  55. "engine_version": self.engine_version,
  56. "endpoint_signature": self.endpoint_signature,
  57. "options_signature": self.options_signature,
  58. "source_content_hash": self.source_content_hash,
  59. "source_size_bytes": int(self.source_size_bytes),
  60. "source_filename_at_parse": self.source_filename_at_parse,
  61. "task_id": self.task_id,
  62. "downloaded_at": self.downloaded_at,
  63. "critical_file": asdict(self.critical_file),
  64. "files": [asdict(f) for f in self.files],
  65. "total_size_bytes": int(self.total_size_bytes),
  66. "extras": dict(self.extras or {}),
  67. }
  68. @classmethod
  69. def from_dict(cls, payload: dict) -> "Manifest":
  70. critical_raw = payload.get("critical_file") or {}
  71. files_raw = payload.get("files") or []
  72. return cls(
  73. version=str(payload.get("version") or MANIFEST_VERSION),
  74. engine=str(payload.get("engine") or ""),
  75. api_mode=str(payload.get("api_mode") or ""),
  76. engine_version=str(payload.get("engine_version") or ""),
  77. endpoint_signature=str(payload.get("endpoint_signature") or ""),
  78. options_signature=str(payload.get("options_signature") or ""),
  79. source_content_hash=str(payload.get("source_content_hash") or ""),
  80. source_size_bytes=int(payload.get("source_size_bytes") or 0),
  81. source_filename_at_parse=str(payload.get("source_filename_at_parse") or ""),
  82. task_id=str(payload.get("task_id") or ""),
  83. downloaded_at=str(payload.get("downloaded_at") or ""),
  84. critical_file=ManifestFile(
  85. path=str(critical_raw.get("path") or ""),
  86. size=int(critical_raw.get("size") or 0),
  87. sha256=(
  88. str(critical_raw["sha256"]) if critical_raw.get("sha256") else None
  89. ),
  90. ),
  91. files=[
  92. ManifestFile(
  93. path=str(f.get("path") or ""),
  94. size=int(f.get("size") or 0),
  95. sha256=(str(f["sha256"]) if f.get("sha256") else None),
  96. )
  97. for f in files_raw
  98. if isinstance(f, dict)
  99. ],
  100. total_size_bytes=int(payload.get("total_size_bytes") or 0),
  101. extras=dict(payload.get("extras") or {}),
  102. )
  103. def manifest_path(raw_dir: Path) -> Path:
  104. return raw_dir / MANIFEST_FILENAME
  105. def load_manifest(raw_dir: Path, *, expected_engine: str) -> Manifest | None:
  106. """Return the parsed manifest or ``None`` if absent / malformed / for a
  107. different engine. ``expected_engine`` is required so a future shared raw
  108. dir cannot serve a bundle that belongs to another engine.
  109. """
  110. p = manifest_path(raw_dir)
  111. if not p.is_file():
  112. return None
  113. try:
  114. payload = json.loads(p.read_text(encoding="utf-8"))
  115. except (OSError, json.JSONDecodeError):
  116. return None
  117. if not isinstance(payload, dict):
  118. return None
  119. if payload.get("version") != MANIFEST_VERSION:
  120. return None
  121. if payload.get("engine") != expected_engine:
  122. return None
  123. try:
  124. return Manifest.from_dict(payload)
  125. except (TypeError, ValueError):
  126. return None
  127. def write_manifest(raw_dir: Path, manifest: Manifest) -> None:
  128. """Atomically write the manifest using temp-file + rename."""
  129. raw_dir.mkdir(parents=True, exist_ok=True)
  130. final = manifest_path(raw_dir)
  131. tmp = final.with_suffix(".json.tmp")
  132. tmp.write_text(
  133. json.dumps(manifest.to_dict(), ensure_ascii=False, indent=2),
  134. encoding="utf-8",
  135. )
  136. os.replace(tmp, final)
# Names exported by ``from <this module> import *``.
__all__ = [
    "MANIFEST_FILENAME",
    "MANIFEST_VERSION",
    "Manifest",
    "ManifestFile",
    "load_manifest",
    "manifest_path",
    "write_manifest",
]