manifest.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
"""``_manifest.json`` schema for ``*.mineru_raw/`` bundles.

The manifest is the *atomic success marker* for a raw bundle. Its presence
implies "all files in this directory finished downloading"; its content is
the cache key for "is this bundle for the same source file, the same MinerU
parser options, engine version, and endpoint we are using right now?".

Write path: ``write_manifest(raw_dir, manifest)`` writes a temp file then
atomically renames it to ``_manifest.json``. A crash mid-download leaves no
manifest, so the next ``parse_mineru`` call cleanly invalidates and
re-downloads.

Read path: ``load_manifest(raw_dir)`` returns ``None`` if the manifest is
absent or malformed — either way the bundle is treated as stale.
"""
  13. from __future__ import annotations
  14. import json
  15. import os
  16. from dataclasses import asdict, dataclass
  17. from pathlib import Path
# File name of the manifest inside a raw bundle directory (see
# ``manifest_path``).
MANIFEST_FILENAME = "_manifest.json"
# Default ``version`` of a ``Manifest``; ``load_manifest`` returns ``None``
# for any stored manifest whose ``version`` differs from this value.
MANIFEST_VERSION = "1.0"
# Default ``engine`` of a ``Manifest``; ``load_manifest`` returns ``None``
# for any stored manifest whose ``engine`` differs from this value.
MANIFEST_ENGINE = "mineru"
  21. @dataclass
  22. class ManifestFile:
  23. """One file entry inside the bundle. Size always; sha256 only for the
  24. critical file (content_list.json) — see :class:`Manifest.critical_file`.
  25. """
  26. path: str # relative to the raw dir
  27. size: int
  28. sha256: str | None = None # ``"sha256:<hex>"`` form or ``None``
  29. @dataclass
  30. class Manifest:
  31. """Schema for ``_manifest.json``. Backward-compat policy: new optional
  32. fields can be added without bumping version; **any** mismatch on existing
  33. field semantics requires a version bump.
  34. """
  35. source_content_hash: str # ``"sha256:<hex>"`` of source file
  36. source_size_bytes: int
  37. source_filename_at_parse: str
  38. critical_file: ManifestFile # content_list.json; size + sha256
  39. files: list[ManifestFile] # other files; size only
  40. total_size_bytes: int
  41. task_id: str = ""
  42. api_mode: str = ""
  43. engine_version: str = ""
  44. endpoint_signature: str = ""
  45. options_signature: str = ""
  46. downloaded_at: str = ""
  47. version: str = MANIFEST_VERSION
  48. engine: str = MANIFEST_ENGINE
  49. def to_dict(self) -> dict:
  50. return {
  51. "version": self.version,
  52. "engine": self.engine,
  53. "api_mode": self.api_mode,
  54. "engine_version": self.engine_version,
  55. "endpoint_signature": self.endpoint_signature,
  56. "options_signature": self.options_signature,
  57. "source_content_hash": self.source_content_hash,
  58. "source_size_bytes": int(self.source_size_bytes),
  59. "source_filename_at_parse": self.source_filename_at_parse,
  60. "task_id": self.task_id,
  61. "downloaded_at": self.downloaded_at,
  62. "critical_file": asdict(self.critical_file),
  63. "files": [asdict(f) for f in self.files],
  64. "total_size_bytes": int(self.total_size_bytes),
  65. }
  66. @classmethod
  67. def from_dict(cls, payload: dict) -> "Manifest":
  68. critical_raw = payload.get("critical_file") or {}
  69. files_raw = payload.get("files") or []
  70. return cls(
  71. version=str(payload.get("version") or MANIFEST_VERSION),
  72. engine=str(payload.get("engine") or MANIFEST_ENGINE),
  73. api_mode=str(payload.get("api_mode") or ""),
  74. engine_version=str(payload.get("engine_version") or ""),
  75. endpoint_signature=str(payload.get("endpoint_signature") or ""),
  76. options_signature=str(payload.get("options_signature") or ""),
  77. source_content_hash=str(payload.get("source_content_hash") or ""),
  78. source_size_bytes=int(payload.get("source_size_bytes") or 0),
  79. source_filename_at_parse=str(payload.get("source_filename_at_parse") or ""),
  80. task_id=str(payload.get("task_id") or ""),
  81. downloaded_at=str(payload.get("downloaded_at") or ""),
  82. critical_file=ManifestFile(
  83. path=str(critical_raw.get("path") or ""),
  84. size=int(critical_raw.get("size") or 0),
  85. sha256=(
  86. str(critical_raw["sha256"]) if critical_raw.get("sha256") else None
  87. ),
  88. ),
  89. files=[
  90. ManifestFile(
  91. path=str(f.get("path") or ""),
  92. size=int(f.get("size") or 0),
  93. sha256=(str(f["sha256"]) if f.get("sha256") else None),
  94. )
  95. for f in files_raw
  96. if isinstance(f, dict)
  97. ],
  98. total_size_bytes=int(payload.get("total_size_bytes") or 0),
  99. )
  100. def manifest_path(raw_dir: Path) -> Path:
  101. return raw_dir / MANIFEST_FILENAME
  102. def load_manifest(raw_dir: Path) -> Manifest | None:
  103. """Return the parsed manifest or ``None`` if absent / malformed."""
  104. p = manifest_path(raw_dir)
  105. if not p.is_file():
  106. return None
  107. try:
  108. payload = json.loads(p.read_text(encoding="utf-8"))
  109. except (OSError, json.JSONDecodeError):
  110. return None
  111. if not isinstance(payload, dict):
  112. return None
  113. if payload.get("version") != MANIFEST_VERSION:
  114. return None
  115. if payload.get("engine") != MANIFEST_ENGINE:
  116. return None
  117. try:
  118. return Manifest.from_dict(payload)
  119. except (TypeError, ValueError):
  120. return None
  121. def write_manifest(raw_dir: Path, manifest: Manifest) -> None:
  122. """Atomically write the manifest. The temp-file + rename pattern
  123. guarantees the manifest never appears in a partially-written state."""
  124. raw_dir.mkdir(parents=True, exist_ok=True)
  125. final = manifest_path(raw_dir)
  126. tmp = final.with_suffix(".json.tmp")
  127. tmp.write_text(
  128. json.dumps(manifest.to_dict(), ensure_ascii=False, indent=2),
  129. encoding="utf-8",
  130. )
  131. os.replace(tmp, final)
# Names exported by a star-import of this module; all are defined in this file.
__all__ = [
    "MANIFEST_FILENAME",
    "MANIFEST_VERSION",
    "MANIFEST_ENGINE",
    "Manifest",
    "ManifestFile",
    "load_manifest",
    "manifest_path",
    "write_manifest",
]