cache.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. """Cache validation for ``*.docling_raw/`` bundles.
  2. Validation policy (settled in
  3. ``docs/DoclingSidecarRefactorPlan-zh.md`` §4.1):
  4. 1. ``_manifest.json`` exists, parses, ``engine="docling"`` ∧ schema version
  5. matches.
  6. 2. **Source size fast-path**: ``source_file.stat().st_size`` matches the
  7. manifest; mismatch → miss without hashing.
  8. 3. **Source content_hash**: full sha256 of the current source file matches
  9. the manifest.
  10. 4. **Engine version**: if ``DOCLING_ENGINE_VERSION`` is set in env and the
  11. manifest recorded a non-empty value, they must match.
  12. 5. **Endpoint signature**: if the active ``DOCLING_ENDPOINT`` differs from
  13. what was recorded at parse time, miss (avoids re-using a bundle produced
  14. by a different docling-serve instance).
  15. 6. **Options signature**: covers every env or fixed constant that changes
  16. the produced bundle (OCR flags, language list, formula enrichment,
  17. target format and pipeline). Any change → miss.
  18. 7. **Critical file**: the main JSON must exist with matching size **and**
  19. sha256 — final tie-breaker against silent corruption affecting the file
  20. the adapter depends on.
  21. 8. **Other files**: size-only verification (cheap; covers most corruption
  22. modes for markdown / artifacts).
  23. Any failed step ⇒ cache miss; the caller wipes the directory contents and
  24. re-runs the download.
  25. """
  26. from __future__ import annotations
  27. import hashlib
  28. import json
  29. import os
  30. from pathlib import Path
  31. from lightrag.parser.external._common import compute_size_and_hash, env_bool
  32. from lightrag.parser.external._manifest import load_manifest
  33. from lightrag.parser.external.docling import MANIFEST_ENGINE
  34. from lightrag.utils import logger
# Legacy upload-path suffix. ``env.example`` historically documented
# ``DOCLING_ENDPOINT=http://host:5001/v1/convert/file/async`` (the full
# upload URL); the current client expects a base URL and appends the path
# itself. Strip the suffix so an unmodified pre-refactor ``.env`` keeps
# working instead of producing
# ``/v1/convert/file/async/v1/convert/file/async`` requests.
_LEGACY_UPLOAD_PATH_SUFFIX = "/v1/convert/file/async"
# Module-level latch flipped to True by ``current_endpoint_signature`` the
# first time it strips the legacy suffix, so the deprecation warning is
# logged once instead of on every call.
_legacy_endpoint_warned = False
# Envs that change the bytes docling-serve produces. Any change here must
# invalidate the bundle cache. ``DOCLING_BBOX_ATTRIBUTES`` is intentionally
# NOT in this list: it only affects how the adapter writes IR meta, not the
# docling bundle, so flipping it should re-emit the sidecar (which we always
# do) without forcing a re-download.
#
# ``snapshot_tunable_env`` reads exactly these names (each with its own
# default); keep the two in sync when adding or removing a tunable.
DOCLING_TUNABLE_ENVS: tuple[str, ...] = (
    "DOCLING_DO_OCR",
    "DOCLING_FORCE_OCR",
    "DOCLING_OCR_ENGINE",
    "DOCLING_OCR_PRESET",
    "DOCLING_OCR_LANG",
    "DOCLING_DO_FORMULA_ENRICHMENT",
)
  56. def current_endpoint_signature() -> str:
  57. """The active docling endpoint, normalized to a base URL.
  58. Normalization:
  59. - Trims surrounding whitespace and strips trailing slashes.
  60. - Strips the legacy ``/v1/convert/file/async`` upload suffix if present,
  61. preserving backwards compatibility with the pre-refactor ``env.example``
  62. that documented the full upload URL.
  63. Returns ``""`` if ``DOCLING_ENDPOINT`` is unset — callers that need a
  64. real endpoint (``DoclingRawClient``) raise on empty; callers that only
  65. compare against a recorded manifest field (``is_bundle_valid``) silently
  66. skip the check when either side is empty.
  67. """
  68. global _legacy_endpoint_warned
  69. endpoint = os.getenv("DOCLING_ENDPOINT", "").strip().rstrip("/")
  70. if endpoint.endswith(_LEGACY_UPLOAD_PATH_SUFFIX):
  71. endpoint = endpoint[: -len(_LEGACY_UPLOAD_PATH_SUFFIX)]
  72. if not _legacy_endpoint_warned:
  73. _legacy_endpoint_warned = True
  74. logger.warning(
  75. "DOCLING_ENDPOINT still includes the legacy %r upload suffix; "
  76. "stripping it. Update your .env to a base URL "
  77. "(e.g. http://host:5001).",
  78. _LEGACY_UPLOAD_PATH_SUFFIX,
  79. )
  80. return endpoint
  81. def compute_options_signature(
  82. *,
  83. tunable_env: dict[str, str],
  84. fixed_constants: dict[str, object],
  85. ) -> str:
  86. """Stable signature over user-tunable env values and fixed pipeline
  87. constants.
  88. Storing the constants in the signature means a future code change that
  89. flips e.g. ``image_export_mode`` from ``referenced`` to ``embedded``
  90. invalidates every existing cache without anyone having to remember to
  91. bump a version.
  92. """
  93. payload = json.dumps(
  94. {"env": tunable_env, "fixed": fixed_constants},
  95. ensure_ascii=False,
  96. sort_keys=True,
  97. separators=(",", ":"),
  98. )
  99. return "sha256:" + hashlib.sha256(payload.encode("utf-8")).hexdigest()
  100. def snapshot_tunable_env() -> dict[str, str]:
  101. """Read effective docling tunables so equivalent requests share a signature."""
  102. return {
  103. "DOCLING_DO_OCR": str(env_bool("DOCLING_DO_OCR", True)).lower(),
  104. "DOCLING_FORCE_OCR": str(env_bool("DOCLING_FORCE_OCR", True)).lower(),
  105. "DOCLING_OCR_ENGINE": os.getenv("DOCLING_OCR_ENGINE", "auto").strip() or "auto",
  106. "DOCLING_OCR_PRESET": os.getenv("DOCLING_OCR_PRESET", "auto").strip() or "auto",
  107. "DOCLING_OCR_LANG": os.getenv("DOCLING_OCR_LANG", "").strip(),
  108. "DOCLING_DO_FORMULA_ENRICHMENT": str(
  109. env_bool("DOCLING_DO_FORMULA_ENRICHMENT", False)
  110. ).lower(),
  111. }
  112. def is_bundle_valid(raw_dir: Path, source_file: Path) -> bool:
  113. """Return True iff the bundle matches the current source + env state."""
  114. if not raw_dir.is_dir():
  115. return False
  116. manifest = load_manifest(raw_dir, expected_engine=MANIFEST_ENGINE)
  117. if manifest is None:
  118. return False
  119. # 1. Source size fast-path
  120. try:
  121. cur_size = source_file.stat().st_size
  122. except OSError:
  123. return False
  124. if cur_size != int(manifest.source_size_bytes):
  125. return False
  126. # 2. Source content_hash
  127. _, cur_hash = compute_size_and_hash(source_file)
  128. if cur_hash != manifest.source_content_hash:
  129. return False
  130. # 3. Engine version. Skip the comparison when either side is empty so
  131. # operators can opt out by unsetting the env, and so bundles from
  132. # earlier code that never recorded the field aren't force-invalidated.
  133. cur_engine_version = os.getenv("DOCLING_ENGINE_VERSION", "").strip()
  134. if (
  135. cur_engine_version
  136. and manifest.engine_version
  137. and cur_engine_version != manifest.engine_version
  138. ):
  139. return False
  140. # 4. Endpoint signature. Same "both non-empty to compare" rule: a bundle
  141. # parsed against a different docling-serve URL must not be reused, but
  142. # we don't reject the cache just because the env happens to be unset
  143. # at validation time (e.g. CLI tooling that only reads the cache).
  144. cur_endpoint = current_endpoint_signature()
  145. if (
  146. cur_endpoint
  147. and manifest.endpoint_signature
  148. and cur_endpoint != manifest.endpoint_signature
  149. ):
  150. return False
  151. # 5. Options signature: only enforced if the manifest recorded one
  152. # (manifests written before this commit have it empty — they are
  153. # treated as stale and re-downloaded the next time the env changes).
  154. #
  155. # Compare against the *current* fixed constants from client.py, not
  156. # the copy stashed in the manifest — using the manifest's copy would
  157. # always reproduce the recorded signature and silently swallow
  158. # code-only changes (e.g. flipping image_export_mode or to_formats),
  159. # defeating the invalidation this step is supposed to provide.
  160. # Lazy import: client.py imports from cache.py.
  161. if manifest.options_signature:
  162. from lightrag.parser.external.docling.client import FIXED_CONSTANTS
  163. cur_options = compute_options_signature(
  164. tunable_env=snapshot_tunable_env(),
  165. fixed_constants=FIXED_CONSTANTS,
  166. )
  167. if cur_options != manifest.options_signature:
  168. return False
  169. # 6. Critical file: size + sha256
  170. crit = manifest.critical_file
  171. crit_path = raw_dir / crit.path
  172. try:
  173. if crit_path.stat().st_size != int(crit.size):
  174. return False
  175. except OSError:
  176. return False
  177. if crit.sha256:
  178. _, crit_actual = compute_size_and_hash(crit_path)
  179. if crit_actual != crit.sha256:
  180. return False
  181. # 7. Other files: size only
  182. for entry in manifest.files:
  183. ep = raw_dir / entry.path
  184. try:
  185. if ep.stat().st_size != int(entry.size):
  186. return False
  187. except OSError:
  188. return False
  189. return True
# Public API of this module: the names exported by a star-import.
# Underscore-prefixed names defined in this file are intentionally omitted.
__all__ = [
    "DOCLING_TUNABLE_ENVS",
    "compute_options_signature",
    "current_endpoint_signature",
    "is_bundle_valid",
    "snapshot_tunable_env",
]