test_client.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851
  1. """``MinerURawClient.download_into`` integration tests.
  2. Uses an in-process fake httpx client so the upload / poll / result fetch
  3. choreography is exercised end-to-end without a live MinerU server. After
  4. the call, the raw dir contains:
  5. - ``content_list.json``
  6. - ``images/`` for any ``img_path`` references
  7. - ``_manifest.json`` whose hashes match the on-disk bytes
  8. """
  9. from __future__ import annotations
  10. import io
  11. import json
  12. import zipfile
  13. from pathlib import Path
  14. from typing import Any
  15. import pytest
  16. from lightrag.parser.external.mineru import is_bundle_valid
  17. from lightrag.parser.external.mineru.client import MinerURawClient
  18. # ---------------------------------------------------------------------------
  19. # Minimal httpx mock framework
  20. # ---------------------------------------------------------------------------
  21. class _FakeResponse:
  22. def __init__(
  23. self,
  24. *,
  25. status_code: int = 200,
  26. text: str = "",
  27. content: bytes = b"",
  28. headers: dict[str, str] | None = None,
  29. ) -> None:
  30. self.status_code = status_code
  31. self.text = text
  32. self.content = content or text.encode("utf-8")
  33. self.headers = headers or {}
  34. def json(self) -> Any:
  35. return json.loads(self.text) if self.text else {}
  36. def raise_for_status(self) -> None:
  37. if self.status_code >= 400:
  38. raise RuntimeError(f"HTTP {self.status_code}")
  39. class _FakeAsyncClient:
  40. """Routes calls through a per-test dispatcher."""
  41. def __init__(self, *_: Any, **__: Any) -> None:
  42. self.posts: list[dict] = []
  43. self.gets: list[str] = []
  44. async def __aenter__(self) -> "_FakeAsyncClient":
  45. return self
  46. async def __aexit__(self, *_: Any) -> None:
  47. pass
  48. async def post(
  49. self,
  50. url: str,
  51. content: Any = None,
  52. files: Any = None,
  53. json: Any = None,
  54. data: Any = None,
  55. headers: Any = None,
  56. ) -> _FakeResponse:
  57. self.posts.append(
  58. {
  59. "url": url,
  60. "content": content,
  61. "files": files,
  62. "json": json,
  63. "data": data,
  64. "headers": headers,
  65. }
  66. )
  67. return _CURRENT.dispatcher.post(
  68. url, content=content, files=files, json=json, data=data, headers=headers
  69. )
  70. async def put(
  71. self,
  72. url: str,
  73. data: Any = None,
  74. content: Any = None,
  75. headers: Any = None,
  76. ) -> _FakeResponse:
  77. return _CURRENT.dispatcher.put(url, data=data, content=content, headers=headers)
  78. async def get(
  79. self, url: str, params: Any = None, headers: Any = None
  80. ) -> _FakeResponse:
  81. self.gets.append(url)
  82. return _CURRENT.dispatcher.get(url, params=params, headers=headers)
  83. class _Dispatcher:
  84. def post(self, url: str, **_: Any) -> _FakeResponse: # pragma: no cover
  85. raise NotImplementedError
  86. def get(self, url: str, **_: Any) -> _FakeResponse: # pragma: no cover
  87. raise NotImplementedError
  88. def put(self, url: str, **_: Any) -> _FakeResponse: # pragma: no cover
  89. raise NotImplementedError
  90. class _CURRENT: # set per-test via monkeypatch
  91. dispatcher: _Dispatcher | None = None
  92. async def _collect_async_bytes(stream: Any) -> bytes:
  93. chunks = []
  94. async for chunk in stream:
  95. chunks.append(chunk)
  96. return b"".join(chunks)
  97. # ---------------------------------------------------------------------------
  98. # Common monkeypatch helpers
  99. # ---------------------------------------------------------------------------
  100. @pytest.fixture
  101. def fake_httpx(monkeypatch: pytest.MonkeyPatch) -> type:
  102. import lightrag.parser.external.mineru.client as mod
  103. fake = type(
  104. "FakeHttpx",
  105. (),
  106. {
  107. "AsyncClient": _FakeAsyncClient,
  108. "Timeout": lambda *a, **k: None,
  109. },
  110. )
  111. monkeypatch.setattr(mod, "httpx", fake)
  112. async def _instant_sleep(_t: float) -> None:
  113. return None
  114. # MinerURawClient uses asyncio.sleep directly; patch via module ref.
  115. import asyncio
  116. monkeypatch.setattr(asyncio, "sleep", _instant_sleep)
  117. return fake
  118. def _nested_mineru_zip() -> bytes:
  119. buf = io.BytesIO()
  120. with zipfile.ZipFile(buf, "w") as zf:
  121. zf.writestr(
  122. "demo/auto/demo_content_list.json",
  123. json.dumps(
  124. [
  125. {"type": "text", "text": "nested"},
  126. {"type": "image", "img_path": "images/img_001.png"},
  127. ],
  128. ensure_ascii=False,
  129. ),
  130. )
  131. zf.writestr("demo/auto/images/img_001.png", b"\x89PNGnested")
  132. zf.writestr("demo/auto/demo.md", "# Nested\n")
  133. return buf.getvalue()
  134. def _flat_mineru_zip() -> bytes:
  135. """Zip whose root already contains the canonical layout — normalization
  136. should be a no-op."""
  137. buf = io.BytesIO()
  138. with zipfile.ZipFile(buf, "w") as zf:
  139. zf.writestr(
  140. "content_list.json",
  141. json.dumps(
  142. [
  143. {"type": "text", "text": "flat"},
  144. {"type": "image", "img_path": "images/img_002.png"},
  145. ],
  146. ensure_ascii=False,
  147. ),
  148. )
  149. zf.writestr("images/img_002.png", b"\x89PNGflat")
  150. return buf.getvalue()
  151. def _multi_doc_mineru_zip() -> bytes:
  152. """Zip carrying two parse subtrees; only the entry matching the source
  153. stem should be picked as the canonical content_list."""
  154. buf = io.BytesIO()
  155. with zipfile.ZipFile(buf, "w") as zf:
  156. zf.writestr(
  157. "other/auto/other_content_list.json",
  158. json.dumps([{"type": "text", "text": "other"}], ensure_ascii=False),
  159. )
  160. zf.writestr(
  161. "demo/auto/demo_content_list.json",
  162. json.dumps(
  163. [
  164. {"type": "text", "text": "the right one"},
  165. {"type": "image", "img_path": "images/img_001.png"},
  166. ],
  167. ensure_ascii=False,
  168. ),
  169. )
  170. zf.writestr("demo/auto/images/img_001.png", b"\x89PNGmulti")
  171. return buf.getvalue()
  172. # ---------------------------------------------------------------------------
  173. # official mode: signed upload + batch poll + full_zip_url
  174. # ---------------------------------------------------------------------------
  175. class _OfficialDispatcher(_Dispatcher):
  176. def __init__(self) -> None:
  177. self.polls = 0
  178. self.uploaded = False
  179. self.apply_payload: dict[str, Any] | None = None
  180. self.upload_content: Any = None
  181. self.upload_headers: dict[str, str] | None = None
  182. def post(self, url: str, **kwargs: Any) -> _FakeResponse:
  183. if url == "https://mineru.net/api/v4/file-urls/batch":
  184. headers = kwargs.get("headers") or {}
  185. assert headers["Authorization"] == "Bearer token-123"
  186. self.apply_payload = kwargs.get("json")
  187. return _FakeResponse(
  188. text=json.dumps(
  189. {
  190. "code": 0,
  191. "msg": "ok",
  192. "data": {
  193. "batch_id": "B-1",
  194. "file_urls": ["https://upload.example/demo.pdf"],
  195. },
  196. }
  197. )
  198. )
  199. raise AssertionError(f"unexpected POST {url}")
  200. def put(self, url: str, **kwargs: Any) -> _FakeResponse:
  201. if url == "https://upload.example/demo.pdf":
  202. self.upload_content = kwargs.get("content")
  203. self.upload_headers = kwargs.get("headers")
  204. assert not isinstance(self.upload_content, bytes)
  205. assert hasattr(self.upload_content, "__aiter__")
  206. self.uploaded = True
  207. return _FakeResponse(status_code=200)
  208. raise AssertionError(f"unexpected PUT {url}")
  209. def get(self, url: str, **kwargs: Any) -> _FakeResponse:
  210. if url == "https://mineru.net/api/v4/extract-results/batch/B-1":
  211. headers = kwargs.get("headers") or {}
  212. assert headers["Authorization"] == "Bearer token-123"
  213. self.polls += 1
  214. state = "running" if self.polls == 1 else "done"
  215. result = {
  216. "file_name": "demo.pdf",
  217. "state": state,
  218. }
  219. if state == "done":
  220. result["full_zip_url"] = "https://download.example/full.zip"
  221. return _FakeResponse(
  222. text=json.dumps({"code": 0, "data": {"extract_result": [result]}})
  223. )
  224. if url == "https://download.example/full.zip":
  225. return _FakeResponse(
  226. content=_nested_mineru_zip(),
  227. headers={"Content-Type": "application/zip"},
  228. )
  229. raise AssertionError(f"unexpected GET {url}")
  230. @pytest.mark.offline
  231. async def test_client_official_mode_round_trip(
  232. tmp_path: Path,
  233. fake_httpx: type,
  234. monkeypatch: pytest.MonkeyPatch,
  235. ) -> None:
  236. monkeypatch.setenv("MINERU_API_MODE", "official")
  237. monkeypatch.setenv("MINERU_API_TOKEN", "token-123")
  238. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  239. monkeypatch.setenv("MINERU_MAX_POLLS", "5")
  240. src = tmp_path / "demo.pdf"
  241. src.write_bytes(b"PDFBYTES" * 200)
  242. raw = tmp_path / "demo.mineru_raw"
  243. raw.mkdir()
  244. dispatcher = _OfficialDispatcher()
  245. _CURRENT.dispatcher = dispatcher
  246. manifest = await MinerURawClient().download_into(raw, src)
  247. assert dispatcher.uploaded is True
  248. assert dispatcher.upload_headers == {"Content-Length": str(src.stat().st_size)}
  249. assert await _collect_async_bytes(dispatcher.upload_content) == src.read_bytes()
  250. assert dispatcher.apply_payload
  251. assert dispatcher.apply_payload["files"][0]["name"] == "demo.pdf"
  252. assert dispatcher.apply_payload["model_version"] == "vlm"
  253. assert manifest.task_id == "B-1"
  254. assert manifest.api_mode == "official"
  255. assert manifest.endpoint_signature == "https://mineru.net"
  256. assert (raw / "content_list.json").is_file()
  257. assert (raw / "images" / "img_001.png").read_bytes() == b"\x89PNGnested"
  258. assert is_bundle_valid(raw, src) is True
  259. @pytest.mark.offline
  260. async def test_client_official_upload_name_overrides_source_basename(
  261. tmp_path: Path,
  262. fake_httpx: type,
  263. monkeypatch: pytest.MonkeyPatch,
  264. ) -> None:
  265. monkeypatch.setenv("MINERU_API_MODE", "official")
  266. monkeypatch.setenv("MINERU_API_TOKEN", "token-123")
  267. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  268. src = tmp_path / "demo.[mineru-iet].pdf"
  269. src.write_bytes(b"PDFBYTES" * 200)
  270. raw = tmp_path / "demo.mineru_raw"
  271. raw.mkdir()
  272. dispatcher = _OfficialDispatcher()
  273. _CURRENT.dispatcher = dispatcher
  274. manifest = await MinerURawClient().download_into(
  275. raw,
  276. src,
  277. upload_name="demo.pdf",
  278. )
  279. assert dispatcher.apply_payload
  280. assert dispatcher.apply_payload["files"][0]["name"] == "demo.pdf"
  281. assert manifest.source_filename_at_parse == "demo.pdf"
  282. # ---------------------------------------------------------------------------
  283. # local mode: /tasks + /tasks/{id} + /tasks/{id}/result
  284. # ---------------------------------------------------------------------------
  285. class _LocalDispatcher(_Dispatcher):
  286. def __init__(self) -> None:
  287. self.content: Any = None
  288. self.form_data: dict[str, Any] | None = None
  289. self.files: Any = None
  290. self.headers: dict[str, str] | None = None
  291. self.upload_filename: str | None = None
  292. self.upload_payload: bytes | None = None
  293. self.upload_content_type: str | None = None
  294. def post(self, url: str, **kwargs: Any) -> _FakeResponse:
  295. if url == "http://127.0.0.1:8000/tasks":
  296. self.content = kwargs.get("content")
  297. self.form_data = kwargs.get("data")
  298. self.files = kwargs.get("files")
  299. self.headers = kwargs.get("headers")
  300. assert self.content is None
  301. assert self.files and "files" in self.files
  302. name, payload, ctype = self.files["files"]
  303. assert hasattr(payload, "read")
  304. assert not isinstance(payload, bytes)
  305. self.upload_filename = name
  306. self.upload_payload = payload.read()
  307. self.upload_content_type = ctype
  308. return _FakeResponse(text=json.dumps({"task_id": "L-1"}))
  309. raise AssertionError(f"unexpected POST {url}")
  310. def get(self, url: str, **_: Any) -> _FakeResponse:
  311. if url == "http://127.0.0.1:8000/tasks/L-1":
  312. return _FakeResponse(
  313. text=json.dumps({"task_id": "L-1", "status": "completed"})
  314. )
  315. if url == "http://127.0.0.1:8000/tasks/L-1/result":
  316. return _FakeResponse(
  317. content=_nested_mineru_zip(),
  318. headers={"Content-Type": "application/zip"},
  319. )
  320. raise AssertionError(f"unexpected GET {url}")
  321. @pytest.mark.offline
  322. async def test_client_local_mode_round_trip(
  323. tmp_path: Path,
  324. fake_httpx: type,
  325. monkeypatch: pytest.MonkeyPatch,
  326. ) -> None:
  327. monkeypatch.setenv("MINERU_API_MODE", "local")
  328. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  329. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  330. src = tmp_path / "demo.pdf"
  331. src.write_bytes(b"PDFBYTES" * 200)
  332. raw = tmp_path / "demo.mineru_raw"
  333. raw.mkdir()
  334. dispatcher = _LocalDispatcher()
  335. _CURRENT.dispatcher = dispatcher
  336. manifest = await MinerURawClient().download_into(raw, src)
  337. assert dispatcher.headers is None
  338. assert dispatcher.form_data
  339. assert dispatcher.form_data["backend"] == "hybrid-auto-engine"
  340. assert dispatcher.form_data["parse_method"] == "auto"
  341. assert dispatcher.form_data["image_analysis"] == "true"
  342. assert dispatcher.form_data["response_format_zip"] == "true"
  343. assert dispatcher.form_data["return_content_list"] == "true"
  344. assert dispatcher.form_data["return_images"] == "true"
  345. assert dispatcher.upload_filename == "demo.pdf"
  346. assert dispatcher.upload_content_type == "application/octet-stream"
  347. assert dispatcher.upload_payload == src.read_bytes()
  348. assert manifest.task_id == "L-1"
  349. assert manifest.api_mode == "local"
  350. assert manifest.endpoint_signature == "http://127.0.0.1:8000"
  351. assert manifest.options_signature.startswith("sha256:")
  352. assert (raw / "content_list.json").is_file()
  353. assert (raw / "images" / "img_001.png").read_bytes() == b"\x89PNGnested"
  354. @pytest.mark.offline
  355. async def test_client_local_upload_name_overrides_multipart_filename(
  356. tmp_path: Path,
  357. fake_httpx: type,
  358. monkeypatch: pytest.MonkeyPatch,
  359. ) -> None:
  360. monkeypatch.setenv("MINERU_API_MODE", "local")
  361. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  362. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  363. src = tmp_path / "demo.[mineru-R!].pdf"
  364. src.write_bytes(b"PDFBYTES" * 200)
  365. raw = tmp_path / "demo.mineru_raw"
  366. raw.mkdir()
  367. dispatcher = _LocalDispatcher()
  368. _CURRENT.dispatcher = dispatcher
  369. manifest = await MinerURawClient().download_into(
  370. raw,
  371. src,
  372. upload_name="demo.pdf",
  373. )
  374. assert dispatcher.content is None
  375. assert dispatcher.upload_filename == "demo.pdf"
  376. assert dispatcher.upload_payload == src.read_bytes()
  377. assert manifest.source_filename_at_parse == "demo.pdf"
  378. class _OfficialBadRequestDispatcher(_Dispatcher):
  379. def post(self, url: str, **_: Any) -> _FakeResponse:
  380. if url == "https://mineru.net/api/v4/file-urls/batch":
  381. return _FakeResponse(
  382. status_code=401,
  383. text=json.dumps({"code": 401, "msg": "invalid api token"}),
  384. )
  385. raise AssertionError(f"unexpected POST {url}")
  386. @pytest.mark.offline
  387. async def test_client_official_bad_request_preserves_response_body(
  388. tmp_path: Path,
  389. fake_httpx: type,
  390. monkeypatch: pytest.MonkeyPatch,
  391. ) -> None:
  392. monkeypatch.setenv("MINERU_API_MODE", "official")
  393. monkeypatch.setenv("MINERU_API_TOKEN", "token-123")
  394. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  395. src = tmp_path / "demo.pdf"
  396. src.write_bytes(b"PDFBYTES" * 200)
  397. raw = tmp_path / "demo.mineru_raw"
  398. raw.mkdir()
  399. _CURRENT.dispatcher = _OfficialBadRequestDispatcher()
  400. with pytest.raises(RuntimeError) as exc_info:
  401. await MinerURawClient().download_into(raw, src)
  402. message = str(exc_info.value)
  403. assert "MinerU official upload URL request" in message
  404. assert "HTTP 401" in message
  405. assert "invalid api token" in message
  406. class _OfficialFailedDispatcher(_OfficialDispatcher):
  407. def get(self, url: str, **kwargs: Any) -> _FakeResponse:
  408. if url == "https://mineru.net/api/v4/extract-results/batch/B-1":
  409. headers = kwargs.get("headers") or {}
  410. assert headers["Authorization"] == "Bearer token-123"
  411. return _FakeResponse(
  412. text=json.dumps(
  413. {
  414. "code": 0,
  415. "data": {
  416. "extract_result": [
  417. {
  418. "file_name": "demo.pdf",
  419. "state": "failed",
  420. "err_msg": "bad pdf",
  421. }
  422. ]
  423. },
  424. }
  425. )
  426. )
  427. raise AssertionError(f"unexpected GET {url}")
  428. @pytest.mark.offline
  429. async def test_client_official_failed_state_raises(
  430. tmp_path: Path,
  431. fake_httpx: type,
  432. monkeypatch: pytest.MonkeyPatch,
  433. ) -> None:
  434. monkeypatch.setenv("MINERU_API_MODE", "official")
  435. monkeypatch.setenv("MINERU_API_TOKEN", "token-123")
  436. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  437. src = tmp_path / "demo.pdf"
  438. src.write_bytes(b"PDFBYTES" * 200)
  439. raw = tmp_path / "demo.mineru_raw"
  440. raw.mkdir()
  441. _CURRENT.dispatcher = _OfficialFailedDispatcher()
  442. with pytest.raises(RuntimeError, match="bad pdf"):
  443. await MinerURawClient().download_into(raw, src)
  444. class _LocalFailedDispatcher(_Dispatcher):
  445. def post(self, url: str, **_: Any) -> _FakeResponse:
  446. if url == "http://127.0.0.1:8000/tasks":
  447. return _FakeResponse(text=json.dumps({"task_id": "L-bad"}))
  448. raise AssertionError(f"unexpected POST {url}")
  449. def get(self, url: str, **_: Any) -> _FakeResponse:
  450. if url == "http://127.0.0.1:8000/tasks/L-bad":
  451. return _FakeResponse(
  452. text=json.dumps(
  453. {"task_id": "L-bad", "status": "failed", "error": "bad pdf"}
  454. )
  455. )
  456. raise AssertionError(f"unexpected GET {url}")
  457. @pytest.mark.offline
  458. async def test_client_local_failed_state_raises(
  459. tmp_path: Path,
  460. fake_httpx: type,
  461. monkeypatch: pytest.MonkeyPatch,
  462. ) -> None:
  463. monkeypatch.setenv("MINERU_API_MODE", "local")
  464. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  465. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  466. src = tmp_path / "demo.pdf"
  467. src.write_bytes(b"PDFBYTES" * 200)
  468. raw = tmp_path / "demo.mineru_raw"
  469. raw.mkdir()
  470. _CURRENT.dispatcher = _LocalFailedDispatcher()
  471. with pytest.raises(RuntimeError, match="bad pdf"):
  472. await MinerURawClient().download_into(raw, src)
  473. class _LocalRedirectDispatcher(_Dispatcher):
  474. def post(self, url: str, **_: Any) -> _FakeResponse:
  475. if url == "http://127.0.0.1:8000/tasks":
  476. # Proxy/CDN misconfig: redirect with httpx default
  477. # ``follow_redirects=False`` would otherwise fall through and
  478. # break with a confusing "missing task_id" downstream.
  479. return _FakeResponse(
  480. status_code=302,
  481. headers={"Location": "http://alt.example/tasks"},
  482. )
  483. raise AssertionError(f"unexpected POST {url}")
  484. @pytest.mark.offline
  485. async def test_client_local_redirect_treated_as_error(
  486. tmp_path: Path,
  487. fake_httpx: type,
  488. monkeypatch: pytest.MonkeyPatch,
  489. ) -> None:
  490. monkeypatch.setenv("MINERU_API_MODE", "local")
  491. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  492. src = tmp_path / "demo.pdf"
  493. src.write_bytes(b"PDFBYTES" * 200)
  494. raw = tmp_path / "demo.mineru_raw"
  495. raw.mkdir()
  496. _CURRENT.dispatcher = _LocalRedirectDispatcher()
  497. with pytest.raises(RuntimeError) as exc_info:
  498. await MinerURawClient().download_into(raw, src)
  499. message = str(exc_info.value)
  500. assert "MinerU local task submission" in message
  501. assert "HTTP 302" in message
  502. class _LocalBadRequestDispatcher(_Dispatcher):
  503. def post(self, url: str, **_: Any) -> _FakeResponse:
  504. if url == "http://127.0.0.1:8000/tasks":
  505. return _FakeResponse(
  506. status_code=400,
  507. text=json.dumps(
  508. {
  509. "detail": "unsupported file type: .xlsx extension does not match payload"
  510. }
  511. ),
  512. )
  513. raise AssertionError(f"unexpected POST {url}")
  514. @pytest.mark.offline
  515. async def test_client_local_bad_request_preserves_response_body(
  516. tmp_path: Path,
  517. fake_httpx: type,
  518. monkeypatch: pytest.MonkeyPatch,
  519. ) -> None:
  520. monkeypatch.setenv("MINERU_API_MODE", "local")
  521. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  522. src = tmp_path / "demo.xlsx"
  523. src.write_bytes(b"not-really-xlsx")
  524. raw = tmp_path / "demo.mineru_raw"
  525. raw.mkdir()
  526. _CURRENT.dispatcher = _LocalBadRequestDispatcher()
  527. with pytest.raises(RuntimeError) as exc_info:
  528. await MinerURawClient().download_into(raw, src)
  529. message = str(exc_info.value)
  530. assert "MinerU local task submission" in message
  531. assert "HTTP 400" in message
  532. assert "unsupported file type" in message
  533. assert "demo.xlsx" in message
  534. @pytest.mark.offline
  535. def test_client_mode_specific_endpoint_validation(
  536. monkeypatch: pytest.MonkeyPatch,
  537. ) -> None:
  538. monkeypatch.setenv("MINERU_API_MODE", "official")
  539. monkeypatch.delenv("MINERU_API_TOKEN", raising=False)
  540. with pytest.raises(ValueError, match="MINERU_API_TOKEN"):
  541. MinerURawClient()
  542. monkeypatch.setenv("MINERU_API_TOKEN", "x")
  543. monkeypatch.setenv("MINERU_OFFICIAL_ENDPOINT", "https://mineru.net/api/v4")
  544. with pytest.raises(ValueError, match="MINERU_OFFICIAL_ENDPOINT"):
  545. MinerURawClient()
  546. monkeypatch.setenv("MINERU_API_MODE", "local")
  547. monkeypatch.delenv("MINERU_LOCAL_ENDPOINT", raising=False)
  548. with pytest.raises(ValueError, match="MINERU_LOCAL_ENDPOINT"):
  549. MinerURawClient()
  550. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000/tasks")
  551. with pytest.raises(ValueError, match="MINERU_LOCAL_ENDPOINT"):
  552. MinerURawClient()
  553. monkeypatch.setenv("MINERU_API_MODE", "custom")
  554. with pytest.raises(ValueError, match="MINERU_API_MODE"):
  555. MinerURawClient()
  556. # ---------------------------------------------------------------------------
  557. # Manifest is *atomic*: presence implies fully written
  558. # ---------------------------------------------------------------------------
  559. @pytest.mark.offline
  560. async def test_client_manifest_written_atomically(
  561. tmp_path: Path,
  562. fake_httpx: type,
  563. monkeypatch: pytest.MonkeyPatch,
  564. ) -> None:
  565. monkeypatch.setenv("MINERU_API_MODE", "local")
  566. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  567. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  568. src = tmp_path / "demo.pdf"
  569. src.write_bytes(b"X" * 16)
  570. raw = tmp_path / "demo.mineru_raw"
  571. raw.mkdir()
  572. _CURRENT.dispatcher = _LocalDispatcher()
  573. await MinerURawClient().download_into(raw, src)
  574. # No leftover .tmp marker; only the final _manifest.json should exist.
  575. leftovers = list(raw.glob("_manifest*"))
  576. assert leftovers == [raw / "_manifest.json"]
  577. # ---------------------------------------------------------------------------
  578. # Bundle normalization: flat-zip fast path / multi-doc disambiguation
  579. # ---------------------------------------------------------------------------
  580. class _LocalFlatZipDispatcher(_Dispatcher):
  581. def post(self, url: str, **_: Any) -> _FakeResponse:
  582. if url == "http://127.0.0.1:8000/tasks":
  583. return _FakeResponse(text=json.dumps({"task_id": "L-flat"}))
  584. raise AssertionError(f"unexpected POST {url}")
  585. def get(self, url: str, **_: Any) -> _FakeResponse:
  586. if url == "http://127.0.0.1:8000/tasks/L-flat":
  587. return _FakeResponse(
  588. text=json.dumps({"task_id": "L-flat", "status": "completed"})
  589. )
  590. if url == "http://127.0.0.1:8000/tasks/L-flat/result":
  591. return _FakeResponse(
  592. content=_flat_mineru_zip(),
  593. headers={"Content-Type": "application/zip"},
  594. )
  595. raise AssertionError(f"unexpected GET {url}")
  596. @pytest.mark.offline
  597. async def test_client_flat_zip_normalize_is_noop(
  598. tmp_path: Path,
  599. fake_httpx: type,
  600. monkeypatch: pytest.MonkeyPatch,
  601. ) -> None:
  602. """A zip whose root already has content_list.json + images/ stays flat.
  603. The manifest must only record the two real files, not a duplicate."""
  604. monkeypatch.setenv("MINERU_API_MODE", "local")
  605. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  606. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  607. src = tmp_path / "demo.pdf"
  608. src.write_bytes(b"PDF" * 50)
  609. raw = tmp_path / "demo.mineru_raw"
  610. raw.mkdir()
  611. _CURRENT.dispatcher = _LocalFlatZipDispatcher()
  612. manifest = await MinerURawClient().download_into(raw, src)
  613. assert (raw / "content_list.json").is_file()
  614. assert (raw / "images" / "img_002.png").read_bytes() == b"\x89PNGflat"
  615. # Only one image listed in manifest files; no nested duplicate.
  616. file_paths = sorted(f.path for f in manifest.files)
  617. assert file_paths == ["images/img_002.png"]
  618. class _LocalMultiDocDispatcher(_Dispatcher):
  619. def post(self, url: str, **_: Any) -> _FakeResponse:
  620. if url == "http://127.0.0.1:8000/tasks":
  621. return _FakeResponse(text=json.dumps({"task_id": "L-multi"}))
  622. raise AssertionError(f"unexpected POST {url}")
  623. def get(self, url: str, **_: Any) -> _FakeResponse:
  624. if url == "http://127.0.0.1:8000/tasks/L-multi":
  625. return _FakeResponse(
  626. text=json.dumps({"task_id": "L-multi", "status": "completed"})
  627. )
  628. if url == "http://127.0.0.1:8000/tasks/L-multi/result":
  629. return _FakeResponse(
  630. content=_multi_doc_mineru_zip(),
  631. headers={"Content-Type": "application/zip"},
  632. )
  633. raise AssertionError(f"unexpected GET {url}")
  634. @pytest.mark.offline
  635. async def test_client_multi_doc_zip_picks_source_stem(
  636. tmp_path: Path,
  637. fake_httpx: type,
  638. monkeypatch: pytest.MonkeyPatch,
  639. ) -> None:
  640. """Two parse subtrees in the zip: the one whose stem matches the source
  641. file wins, and the rival's content_list.json must NOT bleed into root."""
  642. monkeypatch.setenv("MINERU_API_MODE", "local")
  643. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://127.0.0.1:8000")
  644. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  645. src = tmp_path / "demo.pdf"
  646. src.write_bytes(b"PDF" * 50)
  647. raw = tmp_path / "demo.mineru_raw"
  648. raw.mkdir()
  649. _CURRENT.dispatcher = _LocalMultiDocDispatcher()
  650. await MinerURawClient().download_into(raw, src)
  651. content_list = json.loads((raw / "content_list.json").read_text())
  652. assert content_list[0]["text"] == "the right one"
  653. # Hoist removes the demo subtree; the unrelated 'other' subtree is left
  654. # untouched (still nested, no false root content_list).
  655. assert (raw / "images" / "img_001.png").read_bytes() == b"\x89PNGmulti"
  656. assert not (raw / "demo").exists()
  657. assert (raw / "other" / "auto" / "other_content_list.json").is_file()
  658. # ---------------------------------------------------------------------------
  659. # Official mode: multiple non-terminal poll rounds before "done"
  660. # ---------------------------------------------------------------------------
  661. class _OfficialSlowDispatcher(_OfficialDispatcher):
  662. """Returns pending → running → done across three polls."""
  663. def get(self, url: str, **_: Any) -> _FakeResponse:
  664. if url == "https://mineru.net/api/v4/extract-results/batch/B-1":
  665. self.polls += 1
  666. if self.polls == 1:
  667. state = "pending"
  668. elif self.polls == 2:
  669. state = "running"
  670. else:
  671. state = "done"
  672. result: dict[str, Any] = {"file_name": "demo.pdf", "state": state}
  673. if state == "done":
  674. result["full_zip_url"] = "https://download.example/full.zip"
  675. return _FakeResponse(
  676. text=json.dumps({"code": 0, "data": {"extract_result": [result]}})
  677. )
  678. if url == "https://download.example/full.zip":
  679. return _FakeResponse(
  680. content=_nested_mineru_zip(),
  681. headers={"Content-Type": "application/zip"},
  682. )
  683. raise AssertionError(f"unexpected GET {url}")
  684. @pytest.mark.offline
  685. async def test_client_official_polls_through_non_terminal_states(
  686. tmp_path: Path,
  687. fake_httpx: type,
  688. monkeypatch: pytest.MonkeyPatch,
  689. ) -> None:
  690. monkeypatch.setenv("MINERU_API_MODE", "official")
  691. monkeypatch.setenv("MINERU_API_TOKEN", "token-123")
  692. monkeypatch.setenv("MINERU_POLL_INTERVAL_SECONDS", "0")
  693. monkeypatch.setenv("MINERU_MAX_POLLS", "5")
  694. src = tmp_path / "demo.pdf"
  695. src.write_bytes(b"PDF" * 50)
  696. raw = tmp_path / "demo.mineru_raw"
  697. raw.mkdir()
  698. dispatcher = _OfficialSlowDispatcher()
  699. _CURRENT.dispatcher = dispatcher
  700. await MinerURawClient().download_into(raw, src)
  701. assert dispatcher.polls == 3
  702. assert (raw / "content_list.json").is_file()