test_client.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. """Tests for :class:`DoclingRawClient`.
  2. Cover the contract guarantees that protect the sidecar pipeline:
  3. - the fixed pipeline constants (``pipeline=standard`` / ``target_type=zip``
  4. / ``to_formats=[json,md]`` / ``image_export_mode=referenced``) are sent
  5. on every upload, regardless of env;
  6. - terminal non-success states (``failure`` / ``partial_success`` /
  7. ``skipped``) abort the run **before** any result download;
  8. - ``DOCLING_OCR_LANG`` is omitted when empty so docling-serve falls back
  9. to its own default.
  10. Uses an in-process fake httpx client mirroring ``tests/parser/external/mineru/test_client.py``
  11. so we don't trip httpx's sync/async stream guard on multipart uploads.
  12. """
  13. from __future__ import annotations
  14. import io
  15. import json
  16. import zipfile
  17. from pathlib import Path
  18. from typing import Any
  19. import pytest
  20. from lightrag.parser.external.docling.client import (
  21. CONVERT_PATH,
  22. POLL_PATH,
  23. RESULT_PATH,
  24. DoclingRawClient,
  25. )
  26. # ---------------------------------------------------------------------------
  27. # Minimal httpx fake (no MockTransport — avoids the multipart encode path)
  28. # ---------------------------------------------------------------------------
  29. class _FakeResponse:
  30. def __init__(
  31. self,
  32. *,
  33. status_code: int = 200,
  34. text: str = "",
  35. content: bytes = b"",
  36. headers: dict[str, str] | None = None,
  37. ) -> None:
  38. self.status_code = status_code
  39. self.text = text
  40. self.content = content or text.encode("utf-8")
  41. self.headers = headers or {}
  42. def json(self) -> Any:
  43. return json.loads(self.text) if self.text else {}
  44. def raise_for_status(self) -> None:
  45. if self.status_code >= 400:
  46. raise RuntimeError(f"HTTP {self.status_code}")
  47. class _Recorder:
  48. def __init__(
  49. self,
  50. *,
  51. terminal_status: str,
  52. zip_bytes: bytes,
  53. task_id: str = "task-abc",
  54. submit_status_code: int = 200,
  55. submit_text: str | None = None,
  56. poll_status_code: int = 200,
  57. poll_text: str | None = None,
  58. result_status_code: int = 200,
  59. result_text: str | None = None,
  60. ) -> None:
  61. self.terminal_status = terminal_status
  62. self.zip_bytes = zip_bytes
  63. self.task_id = task_id
  64. self.submit_status_code = submit_status_code
  65. self.submit_text = submit_text
  66. self.poll_status_code = poll_status_code
  67. self.poll_text = poll_text
  68. self.result_status_code = result_status_code
  69. self.result_text = result_text
  70. self.post_calls: list[dict] = []
  71. self.get_calls: list[dict] = []
  72. self.result_calls = 0
# Module-level hand-off: each test stores its ``_Recorder`` under the
# ``"recorder"`` key and ``_FakeAsyncClient`` reads it back on every request.
_CURRENT: dict[str, _Recorder] = {}
  74. class _FakeAsyncClient:
  75. def __init__(self, *_: Any, **__: Any) -> None:
  76. pass
  77. async def __aenter__(self) -> "_FakeAsyncClient":
  78. return self
  79. async def __aexit__(self, *_: Any) -> None:
  80. pass
  81. async def post(
  82. self,
  83. url: str,
  84. files: Any = None,
  85. data: Any = None,
  86. json: Any = None,
  87. headers: Any = None,
  88. ) -> _FakeResponse:
  89. recorder = _CURRENT["recorder"]
  90. # Production passes a file handle inside a `with` block — by the time
  91. # tests inspect `post_calls` it's already closed. Drain the stream
  92. # here so assertions can keep reading the payload as bytes.
  93. snapshot_files = files
  94. if files and "files" in files:
  95. name, payload, ctype = files["files"]
  96. if hasattr(payload, "read"):
  97. payload = payload.read()
  98. snapshot_files = {"files": (name, payload, ctype)}
  99. recorder.post_calls.append(
  100. {"url": url, "files": snapshot_files, "data": data, "json": json}
  101. )
  102. if CONVERT_PATH in url:
  103. if recorder.submit_status_code != 200:
  104. return _FakeResponse(
  105. status_code=recorder.submit_status_code,
  106. text=recorder.submit_text or "",
  107. )
  108. return _FakeResponse(
  109. status_code=200,
  110. text=json_dump({"task_id": recorder.task_id}),
  111. )
  112. raise AssertionError(f"unexpected POST {url}")
  113. async def get(
  114. self, url: str, params: Any = None, headers: Any = None
  115. ) -> _FakeResponse:
  116. recorder = _CURRENT["recorder"]
  117. recorder.get_calls.append({"url": url, "params": params})
  118. if POLL_PATH.format(task_id=recorder.task_id) in url:
  119. if recorder.poll_status_code != 200:
  120. return _FakeResponse(
  121. status_code=recorder.poll_status_code,
  122. text=recorder.poll_text or "",
  123. )
  124. payload: dict[str, Any] = {
  125. "task_id": recorder.task_id,
  126. "task_status": recorder.terminal_status,
  127. }
  128. if recorder.terminal_status != "success":
  129. payload["error_message"] = "synthetic-failure"
  130. return _FakeResponse(status_code=200, text=json_dump(payload))
  131. if RESULT_PATH.format(task_id=recorder.task_id) in url:
  132. recorder.result_calls += 1
  133. if recorder.result_status_code != 200:
  134. return _FakeResponse(
  135. status_code=recorder.result_status_code,
  136. text=recorder.result_text or "",
  137. )
  138. return _FakeResponse(
  139. status_code=200,
  140. content=recorder.zip_bytes,
  141. headers={"content-type": "application/zip"},
  142. )
  143. raise AssertionError(f"unexpected GET {url}")
  144. def json_dump(payload: Any) -> str:
  145. return json.dumps(payload)
  146. def _form_pairs(data: Any) -> list[tuple[str, str]]:
  147. """Normalize httpx form data into repeated ``(name, value)`` pairs.
  148. Production passes a mapping so httpx 0.28 keeps multipart ``files=`` on
  149. the async path. List values in that mapping represent repeated form keys.
  150. Older tests used tuple lists directly; accepting both keeps assertions
  151. focused on the wire contract instead of the container type.
  152. """
  153. if isinstance(data, dict):
  154. pairs: list[tuple[str, str]] = []
  155. for name, value in data.items():
  156. values = value if isinstance(value, list) else [value]
  157. pairs.extend((str(name), str(v)) for v in values)
  158. return pairs
  159. return [(str(name), str(value)) for name, value in data]
  160. def _fake_zip_with_main_json(stem: str) -> bytes:
  161. buf = io.BytesIO()
  162. with zipfile.ZipFile(buf, "w") as zf:
  163. zf.writestr(f"{stem}.json", b'{"schema_name": "DoclingDocument"}')
  164. zf.writestr(f"{stem}.md", b"# hello")
  165. return buf.getvalue()
  166. def _install_fake_httpx(monkeypatch: pytest.MonkeyPatch) -> None:
  167. """Replace ``httpx.AsyncClient`` and ``httpx.Timeout`` references in
  168. the docling client module with no-arg fakes."""
  169. monkeypatch.setattr(
  170. "lightrag.parser.external.docling.client.httpx.AsyncClient",
  171. _FakeAsyncClient,
  172. )
  173. monkeypatch.setattr(
  174. "lightrag.parser.external.docling.client.httpx.Timeout",
  175. lambda *a, **kw: None,
  176. )
  177. # ---------------------------------------------------------------------------
  178. # Fixtures
  179. # ---------------------------------------------------------------------------
  180. @pytest.fixture
  181. def source_pdf(tmp_path: Path) -> Path:
  182. p = tmp_path / "demo.pdf"
  183. p.write_bytes(b"%PDF-1.4 fake")
  184. return p
  185. @pytest.fixture(autouse=True)
  186. def docling_endpoint(monkeypatch: pytest.MonkeyPatch) -> None:
  187. monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
  188. for name in (
  189. "DOCLING_DO_OCR",
  190. "DOCLING_FORCE_OCR",
  191. "DOCLING_OCR_ENGINE",
  192. "DOCLING_OCR_PRESET",
  193. "DOCLING_OCR_LANG",
  194. "DOCLING_DO_FORMULA_ENRICHMENT",
  195. "DOCLING_ENGINE_VERSION",
  196. ):
  197. monkeypatch.delenv(name, raising=False)
  198. # ---------------------------------------------------------------------------
  199. # Tests
  200. # ---------------------------------------------------------------------------
  201. async def test_docling_client_sends_fixed_constants(
  202. monkeypatch: pytest.MonkeyPatch,
  203. tmp_path: Path,
  204. source_pdf: Path,
  205. ) -> None:
  206. recorder = _Recorder(
  207. terminal_status="success",
  208. zip_bytes=_fake_zip_with_main_json("demo"),
  209. )
  210. _CURRENT["recorder"] = recorder
  211. _install_fake_httpx(monkeypatch)
  212. raw_dir = tmp_path / "demo.docling_raw"
  213. manifest = await DoclingRawClient().download_into(raw_dir, source_pdf)
  214. assert len(recorder.post_calls) == 1
  215. data = recorder.post_calls[0]["data"]
  216. field_map: dict[str, list[str]] = {}
  217. for name, value in _form_pairs(data):
  218. field_map.setdefault(name, []).append(value)
  219. assert field_map["pipeline"] == ["standard"]
  220. assert field_map["target_type"] == ["zip"]
  221. assert field_map["image_export_mode"] == ["referenced"]
  222. assert sorted(field_map["to_formats"]) == ["json", "md"]
  223. files = recorder.post_calls[0]["files"]
  224. assert "files" in files
  225. name, blob, ctype = files["files"]
  226. assert name == "demo.pdf"
  227. assert blob.startswith(b"%PDF-1.4")
  228. assert ctype == "application/octet-stream"
  229. assert manifest.task_id == recorder.task_id
  230. assert manifest.engine == "docling"
  231. assert manifest.extras["fixed_constants"]["pipeline"] == "standard"
  232. assert manifest.endpoint_signature == "http://docling.test"
  233. async def test_docling_client_partial_success_aborts(
  234. monkeypatch: pytest.MonkeyPatch,
  235. tmp_path: Path,
  236. source_pdf: Path,
  237. ) -> None:
  238. recorder = _Recorder(
  239. terminal_status="partial_success",
  240. zip_bytes=_fake_zip_with_main_json("demo"),
  241. )
  242. _CURRENT["recorder"] = recorder
  243. _install_fake_httpx(monkeypatch)
  244. with pytest.raises(RuntimeError) as excinfo:
  245. await DoclingRawClient().download_into(
  246. tmp_path / "demo.docling_raw", source_pdf
  247. )
  248. msg = str(excinfo.value)
  249. assert recorder.task_id in msg
  250. assert "partial_success" in msg
  251. assert "synthetic-failure" in msg
  252. assert recorder.result_calls == 0
  253. async def test_docling_client_failure_aborts(
  254. monkeypatch: pytest.MonkeyPatch,
  255. tmp_path: Path,
  256. source_pdf: Path,
  257. ) -> None:
  258. recorder = _Recorder(
  259. terminal_status="failure",
  260. zip_bytes=_fake_zip_with_main_json("demo"),
  261. )
  262. _CURRENT["recorder"] = recorder
  263. _install_fake_httpx(monkeypatch)
  264. with pytest.raises(RuntimeError):
  265. await DoclingRawClient().download_into(
  266. tmp_path / "demo.docling_raw", source_pdf
  267. )
  268. assert recorder.result_calls == 0
  269. async def test_docling_client_skipped_aborts(
  270. monkeypatch: pytest.MonkeyPatch,
  271. tmp_path: Path,
  272. source_pdf: Path,
  273. ) -> None:
  274. recorder = _Recorder(
  275. terminal_status="skipped",
  276. zip_bytes=_fake_zip_with_main_json("demo"),
  277. )
  278. _CURRENT["recorder"] = recorder
  279. _install_fake_httpx(monkeypatch)
  280. with pytest.raises(RuntimeError):
  281. await DoclingRawClient().download_into(
  282. tmp_path / "demo.docling_raw", source_pdf
  283. )
  284. assert recorder.result_calls == 0
  285. async def test_docling_client_upload_http_error_preserves_response_body(
  286. monkeypatch: pytest.MonkeyPatch,
  287. tmp_path: Path,
  288. source_pdf: Path,
  289. ) -> None:
  290. recorder = _Recorder(
  291. terminal_status="success",
  292. zip_bytes=_fake_zip_with_main_json("demo"),
  293. submit_status_code=400,
  294. submit_text=json_dump({"detail": "unsupported file type"}),
  295. )
  296. _CURRENT["recorder"] = recorder
  297. _install_fake_httpx(monkeypatch)
  298. with pytest.raises(RuntimeError) as excinfo:
  299. await DoclingRawClient().download_into(
  300. tmp_path / "demo.docling_raw", source_pdf
  301. )
  302. message = str(excinfo.value)
  303. assert "Docling upload for 'demo.pdf'" in message
  304. assert "HTTP 400" in message
  305. assert "unsupported file type" in message
  306. async def test_docling_client_poll_http_error_preserves_response_body(
  307. monkeypatch: pytest.MonkeyPatch,
  308. tmp_path: Path,
  309. source_pdf: Path,
  310. ) -> None:
  311. recorder = _Recorder(
  312. terminal_status="success",
  313. zip_bytes=_fake_zip_with_main_json("demo"),
  314. poll_status_code=503,
  315. poll_text=json_dump({"message": "queue unavailable"}),
  316. )
  317. _CURRENT["recorder"] = recorder
  318. _install_fake_httpx(monkeypatch)
  319. with pytest.raises(RuntimeError) as excinfo:
  320. await DoclingRawClient().download_into(
  321. tmp_path / "demo.docling_raw", source_pdf
  322. )
  323. message = str(excinfo.value)
  324. assert "Docling task task-abc poll" in message
  325. assert "HTTP 503" in message
  326. assert "queue unavailable" in message
  327. async def test_docling_client_result_redirect_treated_as_error(
  328. monkeypatch: pytest.MonkeyPatch,
  329. tmp_path: Path,
  330. source_pdf: Path,
  331. ) -> None:
  332. # docling-serve fronted by a misconfigured proxy could emit a 302 to a
  333. # CDN that httpx (default ``follow_redirects=False``) won't follow.
  334. # Without the explicit non-2xx guard the redirect body would fall into
  335. # the zip-decoder and surface as a cryptic "bad zip" error.
  336. recorder = _Recorder(
  337. terminal_status="success",
  338. zip_bytes=_fake_zip_with_main_json("demo"),
  339. result_status_code=302,
  340. result_text="",
  341. )
  342. _CURRENT["recorder"] = recorder
  343. _install_fake_httpx(monkeypatch)
  344. with pytest.raises(RuntimeError) as excinfo:
  345. await DoclingRawClient().download_into(
  346. tmp_path / "demo.docling_raw", source_pdf
  347. )
  348. message = str(excinfo.value)
  349. assert "Docling result task-abc download" in message
  350. assert "HTTP 302" in message
  351. async def test_docling_client_result_http_error_preserves_response_body(
  352. monkeypatch: pytest.MonkeyPatch,
  353. tmp_path: Path,
  354. source_pdf: Path,
  355. ) -> None:
  356. recorder = _Recorder(
  357. terminal_status="success",
  358. zip_bytes=_fake_zip_with_main_json("demo"),
  359. result_status_code=500,
  360. result_text="zip artifact missing",
  361. )
  362. _CURRENT["recorder"] = recorder
  363. _install_fake_httpx(monkeypatch)
  364. with pytest.raises(RuntimeError) as excinfo:
  365. await DoclingRawClient().download_into(
  366. tmp_path / "demo.docling_raw", source_pdf
  367. )
  368. message = str(excinfo.value)
  369. assert "Docling result task-abc download" in message
  370. assert "HTTP 500" in message
  371. assert "zip artifact missing" in message
  372. async def test_docling_client_ocr_lang_omitted_when_empty(
  373. monkeypatch: pytest.MonkeyPatch,
  374. tmp_path: Path,
  375. source_pdf: Path,
  376. ) -> None:
  377. recorder = _Recorder(
  378. terminal_status="success",
  379. zip_bytes=_fake_zip_with_main_json("demo"),
  380. )
  381. _CURRENT["recorder"] = recorder
  382. _install_fake_httpx(monkeypatch)
  383. await DoclingRawClient().download_into(tmp_path / "demo.docling_raw", source_pdf)
  384. data = recorder.post_calls[0]["data"]
  385. names = [name for name, _ in _form_pairs(data)]
  386. assert "ocr_lang" not in names
  387. async def test_docling_client_ocr_lang_sent_when_set(
  388. monkeypatch: pytest.MonkeyPatch,
  389. tmp_path: Path,
  390. source_pdf: Path,
  391. ) -> None:
  392. monkeypatch.setenv("DOCLING_OCR_LANG", '["en","zh"]')
  393. recorder = _Recorder(
  394. terminal_status="success",
  395. zip_bytes=_fake_zip_with_main_json("demo"),
  396. )
  397. _CURRENT["recorder"] = recorder
  398. _install_fake_httpx(monkeypatch)
  399. await DoclingRawClient().download_into(tmp_path / "demo.docling_raw", source_pdf)
  400. data = recorder.post_calls[0]["data"]
  401. langs = [v for name, v in _form_pairs(data) if name == "ocr_lang"]
  402. assert langs == ["en", "zh"]
  403. async def test_docling_client_ocr_lang_csv_form(
  404. monkeypatch: pytest.MonkeyPatch,
  405. tmp_path: Path,
  406. source_pdf: Path,
  407. ) -> None:
  408. """CSV fallback when value isn't valid JSON."""
  409. monkeypatch.setenv("DOCLING_OCR_LANG", "en, fr")
  410. recorder = _Recorder(
  411. terminal_status="success",
  412. zip_bytes=_fake_zip_with_main_json("demo"),
  413. )
  414. _CURRENT["recorder"] = recorder
  415. _install_fake_httpx(monkeypatch)
  416. await DoclingRawClient().download_into(tmp_path / "demo.docling_raw", source_pdf)
  417. data = recorder.post_calls[0]["data"]
  418. langs = [v for name, v in _form_pairs(data) if name == "ocr_lang"]
  419. assert langs == ["en", "fr"]
  420. async def test_docling_client_rejects_missing_endpoint(
  421. monkeypatch: pytest.MonkeyPatch,
  422. ) -> None:
  423. monkeypatch.setenv("DOCLING_ENDPOINT", "")
  424. with pytest.raises(ValueError, match="DOCLING_ENDPOINT"):
  425. DoclingRawClient()
  426. async def test_docling_client_strips_parser_hint_from_upload_filename(
  427. monkeypatch: pytest.MonkeyPatch, tmp_path: Path
  428. ) -> None:
  429. # Regression: a hinted source (``report.[docling].pdf``) used to cause
  430. # docling-serve to name its bundle JSON ``report.[docling].json``, which
  431. # the adapter (looking for ``report.json``) could not locate. The
  432. # pipeline now passes the canonical name as ``upload_filename`` so the
  433. # bundle is canonical-stem from the start.
  434. hinted = tmp_path / "report.[docling].pdf"
  435. hinted.write_bytes(b"%PDF-1.4 fake")
  436. # The fake zip mimics docling-serve responding with the *canonical* stem,
  437. # which is what would happen once we send the canonical filename.
  438. recorder = _Recorder(
  439. terminal_status="success",
  440. zip_bytes=_fake_zip_with_main_json("report"),
  441. )
  442. _CURRENT["recorder"] = recorder
  443. _install_fake_httpx(monkeypatch)
  444. raw_dir = tmp_path / "report.docling_raw"
  445. manifest = await DoclingRawClient().download_into(
  446. raw_dir, hinted, upload_filename="report.pdf"
  447. )
  448. name, _blob, _ctype = recorder.post_calls[0]["files"]["files"]
  449. assert name == "report.pdf"
  450. assert manifest.source_filename_at_parse == "report.pdf"
  451. assert manifest.critical_file.path == "report.json"
  452. assert (raw_dir / "report.json").is_file()
  453. async def test_docling_client_default_upload_filename_falls_back_to_source_name(
  454. monkeypatch: pytest.MonkeyPatch, tmp_path: Path, source_pdf: Path
  455. ) -> None:
  456. # Back-compat guard: callers that don't pass ``upload_filename`` (any
  457. # path other than the production pipeline) keep the legacy behavior of
  458. # using the on-disk source filename.
  459. recorder = _Recorder(
  460. terminal_status="success",
  461. zip_bytes=_fake_zip_with_main_json("demo"),
  462. )
  463. _CURRENT["recorder"] = recorder
  464. _install_fake_httpx(monkeypatch)
  465. await DoclingRawClient().download_into(tmp_path / "demo.docling_raw", source_pdf)
  466. name, _blob, _ctype = recorder.post_calls[0]["files"]["files"]
  467. assert name == "demo.pdf"