client.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677
  1. """MinerU raw bundle downloader.
  2. Supports MinerU's official cloud and self-hosted API protocols and lands the
  3. final parser bundle on disk under ``raw_dir/``:
  4. - ``official`` — MinerU precision API v4: apply for signed upload URL, PUT the
  5. local file, poll batch results, download ``full_zip_url``.
  6. - ``local`` — self-hosted ``mineru-api`` / ``mineru-router``: submit
  7. ``POST /tasks``, poll ``GET /tasks/{task_id}``, download
  8. ``GET /tasks/{task_id}/result``.
  9. Both protocols request a zip result bundle. Archives are extracted under
  10. ``raw_dir/`` and normalized so the adapter can read a root-level
  11. ``content_list.json``.
  12. """
  13. from __future__ import annotations
  14. import asyncio
  15. import io
  16. import json
  17. import os
  18. import shutil
  19. import zipfile
  20. from collections.abc import AsyncIterator
  21. from datetime import datetime, timezone
  22. from pathlib import Path
  23. from typing import TYPE_CHECKING, Any
  24. from urllib.parse import urlparse
  25. from lightrag.parser.external._common import raise_for_status_with_detail
  26. from lightrag.parser.external.mineru.cache import (
  27. MinerUParserOptions,
  28. compute_size_and_hash,
  29. )
  30. from lightrag.parser.external.mineru.manifest import (
  31. Manifest,
  32. ManifestFile,
  33. write_manifest,
  34. )
  35. from lightrag.utils import logger
  36. if TYPE_CHECKING:
  37. import httpx
  38. else:
  39. try:
  40. import httpx
  41. except ImportError: # pragma: no cover
  42. httpx = None
# File name the bundle must expose at the ``raw_dir`` root after normalization.
CONTENT_LIST_FILENAME = "content_list.json"
# Protocol used when ``MINERU_API_MODE`` is not set.
DEFAULT_MINERU_API_MODE = "local"
# Base URL used when ``MINERU_OFFICIAL_ENDPOINT`` is unset or blank.
DEFAULT_MINERU_OFFICIAL_ENDPOINT = "https://mineru.net"
# Accepted (lower-cased) values of ``MINERU_API_MODE``.
VALID_MINERU_API_MODES = {"official", "local"}
# Lower-cased ``state`` values of an official extract_result entry that end polling.
OFFICIAL_DONE_STATES = {"done"}
OFFICIAL_FAILED_STATES = {"failed"}
# Lower-cased ``status`` values of a local task that end polling.
LOCAL_DONE_STATES = {"completed"}
LOCAL_FAILED_STATES = {"failed"}
# Read size in bytes (1 MiB) for each chunk yielded while streaming an upload.
UPLOAD_CHUNK_SIZE = 1024 * 1024
  52. def _get_by_path(payload: Any, path: str) -> Any:
  53. """Walk a dotted path through a nested dict; returns None if any segment
  54. is missing or non-dict."""
  55. if not path:
  56. return None
  57. cur = payload
  58. for part in path.split("."):
  59. if isinstance(cur, dict) and part in cur:
  60. cur = cur[part]
  61. else:
  62. return None
  63. return cur
  64. def _strip_trailing_slash(url: str) -> str:
  65. return url.rstrip("/")
  66. def _resolve_upload_name(upload_name: str | None, source_file_path: Path) -> str:
  67. candidate = Path(str(upload_name or "")).name
  68. return candidate or source_file_path.name
  69. async def _iter_file_bytes(path: Path) -> AsyncIterator[bytes]:
  70. with path.open("rb") as fh:
  71. while True:
  72. chunk = await asyncio.to_thread(fh.read, UPLOAD_CHUNK_SIZE)
  73. if not chunk:
  74. break
  75. yield chunk
  76. def _validate_base_url(
  77. name: str, endpoint: str, forbidden_segments: tuple[str, ...]
  78. ) -> None:
  79. parsed = urlparse(endpoint)
  80. path = (parsed.path or "").rstrip("/")
  81. for segment in forbidden_segments:
  82. if path.endswith(segment) or f"{segment}/" in path:
  83. raise ValueError(
  84. f"{name} must be a base URL, not an API path: {endpoint!r}"
  85. )
  86. class MinerURawClient:
  87. """Downloads MinerU bundles into ``raw_dir``.
  88. Construct once per call (cheap). Reads ``MINERU_*`` env vars at
  89. construction time. Methods are async and use a single shared httpx
  90. client across all calls in :meth:`download_into`.
  91. Implements the MinerU-specific upload + poll + zip download flow
  92. inline; bundle handling needs the ``result_url`` *and* the
  93. ``Content-Type`` of the response, which a generic protocol helper
  94. cannot expose without leaking abstractions.
  95. """
  96. def __init__(self) -> None:
  97. self.api_mode = (
  98. os.getenv("MINERU_API_MODE", DEFAULT_MINERU_API_MODE).strip().lower()
  99. )
  100. if self.api_mode not in VALID_MINERU_API_MODES:
  101. allowed = ", ".join(sorted(VALID_MINERU_API_MODES))
  102. raise ValueError(
  103. f"MINERU_API_MODE must be one of {allowed}, got {self.api_mode!r}"
  104. )
  105. self.official_endpoint = _strip_trailing_slash(
  106. os.getenv(
  107. "MINERU_OFFICIAL_ENDPOINT", DEFAULT_MINERU_OFFICIAL_ENDPOINT
  108. ).strip()
  109. or DEFAULT_MINERU_OFFICIAL_ENDPOINT
  110. )
  111. self.local_endpoint = _strip_trailing_slash(
  112. os.getenv("MINERU_LOCAL_ENDPOINT", "").strip()
  113. )
  114. self.api_token = os.getenv("MINERU_API_TOKEN", "").strip()
  115. if self.api_mode == "official":
  116. if not self.api_token:
  117. raise ValueError(
  118. "MINERU_API_TOKEN is required when MINERU_API_MODE=official"
  119. )
  120. _validate_base_url(
  121. "MINERU_OFFICIAL_ENDPOINT",
  122. self.official_endpoint,
  123. ("/api/v4", "/api/v4/file-urls/batch", "/api/v4/extract/task"),
  124. )
  125. self.endpoint = self.official_endpoint
  126. elif self.api_mode == "local":
  127. if not self.local_endpoint:
  128. raise ValueError(
  129. "MINERU_LOCAL_ENDPOINT is required when MINERU_API_MODE=local"
  130. )
  131. _validate_base_url(
  132. "MINERU_LOCAL_ENDPOINT",
  133. self.local_endpoint,
  134. ("/tasks", "/file_parse", "/health"),
  135. )
  136. self.endpoint = self.local_endpoint
  137. self.poll_interval = float(os.getenv("MINERU_POLL_INTERVAL_SECONDS", "2"))
  138. self.max_polls = int(os.getenv("MINERU_MAX_POLLS", "180"))
  139. self.engine_version = os.getenv("MINERU_ENGINE_VERSION", "").strip()
  140. options = MinerUParserOptions.from_env(api_mode=self.api_mode)
  141. self._parser_options = options
  142. self.model_version = options.model_version
  143. self.language = options.language
  144. self.enable_table = options.enable_table
  145. self.enable_formula = options.enable_formula
  146. self.is_ocr = options.is_ocr
  147. self.page_ranges = options.page_ranges
  148. self.local_backend = options.local_backend
  149. self.local_parse_method = options.local_parse_method
  150. self.local_image_analysis = options.local_image_analysis
  151. self.local_start_page_id = options.local_start_page_id
  152. self.local_end_page_id = options.local_end_page_id
  153. # ------------------------------------------------------------------
  154. # Public API
  155. # ------------------------------------------------------------------
  156. async def download_into(
  157. self,
  158. raw_dir: Path,
  159. source_file_path: Path,
  160. *,
  161. upload_name: str | None = None,
  162. ) -> Manifest:
  163. """Download a fresh bundle and write the manifest.
  164. Pre-condition: caller cleared ``raw_dir`` contents (recommended via
  165. :func:`clear_dir_contents`). This method does NOT clean the
  166. directory itself — leaving that to the caller keeps cache miss
  167. semantics explicit at the parse_mineru entry point.
  168. Returns the :class:`Manifest` describing the bundle.
  169. """
  170. if httpx is None:
  171. raise RuntimeError("httpx is required for MinerU parsing but not installed")
  172. raw_dir.mkdir(parents=True, exist_ok=True)
  173. resolved_upload_name = _resolve_upload_name(upload_name, source_file_path)
  174. timeout = httpx.Timeout(120.0, connect=30.0)
  175. async with httpx.AsyncClient(timeout=timeout) as client:
  176. if self.api_mode == "official":
  177. task_id = await self._download_official(
  178. client, source_file_path, raw_dir, resolved_upload_name
  179. )
  180. else:
  181. task_id = await self._download_local(
  182. client, source_file_path, raw_dir, resolved_upload_name
  183. )
  184. self._normalize_raw_bundle(raw_dir, source_file_path, resolved_upload_name)
  185. return self._build_and_write_manifest(
  186. raw_dir, source_file_path, task_id, resolved_upload_name
  187. )
  188. # ------------------------------------------------------------------
  189. # Upload + poll
  190. # ------------------------------------------------------------------
  191. def _official_headers(self) -> dict[str, str]:
  192. return {
  193. "Content-Type": "application/json",
  194. "Authorization": f"Bearer {self.api_token}",
  195. }
  196. def _official_payload(self, upload_name: str) -> dict[str, Any]:
  197. file_entry: dict[str, Any] = {"name": upload_name}
  198. if self.is_ocr:
  199. file_entry["is_ocr"] = True
  200. if self.page_ranges:
  201. file_entry["page_ranges"] = self.page_ranges
  202. return {
  203. "files": [file_entry],
  204. "model_version": self.model_version,
  205. "language": self.language,
  206. "enable_table": self.enable_table,
  207. "enable_formula": self.enable_formula,
  208. }
  209. async def _download_official(
  210. self,
  211. client: "httpx.AsyncClient",
  212. source_file_path: Path,
  213. raw_dir: Path,
  214. upload_name: str,
  215. ) -> str:
  216. apply_url = f"{self.official_endpoint}/api/v4/file-urls/batch"
  217. resp = await client.post(
  218. apply_url,
  219. headers=self._official_headers(),
  220. json=self._official_payload(upload_name),
  221. )
  222. raise_for_status_with_detail(resp, "MinerU official upload URL request")
  223. payload = resp.json() if resp.text else {}
  224. self._raise_if_official_error(payload, "MinerU official upload URL request")
  225. data = payload.get("data") if isinstance(payload, dict) else {}
  226. batch_id = str((data or {}).get("batch_id") or "")
  227. file_urls = (data or {}).get("file_urls") or []
  228. if not batch_id or not isinstance(file_urls, list) or not file_urls:
  229. raise RuntimeError(
  230. f"MinerU official upload URL response missing batch_id/file_urls: "
  231. f"{payload}"
  232. )
  233. first_file_url = file_urls[0]
  234. if isinstance(first_file_url, dict):
  235. upload_url = str(
  236. first_file_url.get("url") or first_file_url.get("file_url") or ""
  237. )
  238. else:
  239. upload_url = str(first_file_url)
  240. if not upload_url:
  241. raise RuntimeError(
  242. f"MinerU official upload URL response had an empty upload URL: "
  243. f"{payload}"
  244. )
  245. upload_resp = await client.put(
  246. upload_url,
  247. content=_iter_file_bytes(source_file_path),
  248. headers={"Content-Length": str(source_file_path.stat().st_size)},
  249. )
  250. raise_for_status_with_detail(upload_resp, "MinerU official file upload")
  251. result_url = await self._poll_official_batch(client, batch_id, upload_name)
  252. await self._download_zip(client, result_url, raw_dir)
  253. return batch_id
  254. async def _poll_official_batch(
  255. self,
  256. client: "httpx.AsyncClient",
  257. batch_id: str,
  258. upload_name: str,
  259. ) -> str:
  260. poll_url = f"{self.official_endpoint}/api/v4/extract-results/batch/{batch_id}"
  261. for _ in range(self.max_polls):
  262. await asyncio.sleep(self.poll_interval)
  263. resp = await client.get(poll_url, headers=self._official_headers())
  264. raise_for_status_with_detail(resp, "MinerU official batch poll")
  265. payload = resp.json() if resp.text else {}
  266. self._raise_if_official_error(payload, "MinerU official batch poll")
  267. results = _get_by_path(payload, "data.extract_result")
  268. if isinstance(results, dict):
  269. results = [results]
  270. if not isinstance(results, list):
  271. continue
  272. selected = _select_official_extract_result(results, upload_name)
  273. if selected is None:
  274. continue
  275. state = str(selected.get("state") or "").lower()
  276. if state in OFFICIAL_DONE_STATES:
  277. full_zip_url = str(selected.get("full_zip_url") or "")
  278. if not full_zip_url:
  279. raise RuntimeError(
  280. f"MinerU official batch {batch_id} is done but has no "
  281. f"full_zip_url: {selected}"
  282. )
  283. return full_zip_url
  284. if state in OFFICIAL_FAILED_STATES:
  285. err = selected.get("err_msg") or selected.get("error") or selected
  286. raise RuntimeError(
  287. f"MinerU official parse failed for batch {batch_id}: {err}"
  288. )
  289. raise TimeoutError(f"MinerU official batch polling timeout: {batch_id}")
  290. def _raise_if_official_error(self, payload: Any, operation: str) -> None:
  291. if not isinstance(payload, dict):
  292. raise RuntimeError(f"{operation} returned non-object payload: {payload!r}")
  293. code = payload.get("code", 0)
  294. if code not in (0, "0", None):
  295. raise RuntimeError(
  296. f"{operation} failed: code={code} msg={payload.get('msg')!r}"
  297. )
  298. def _local_form_data(self) -> dict[str, str]:
  299. return {
  300. "lang_list": self.language,
  301. "backend": self.local_backend,
  302. "parse_method": self.local_parse_method,
  303. "formula_enable": _bool_form(self.enable_formula),
  304. "table_enable": _bool_form(self.enable_table),
  305. "image_analysis": _bool_form(self.local_image_analysis),
  306. "return_md": "true",
  307. "return_middle_json": "true",
  308. "return_model_output": "true",
  309. "return_content_list": "true",
  310. "return_images": "true",
  311. "response_format_zip": "true",
  312. "return_original_file": "true",
  313. "start_page_id": str(self.local_start_page_id),
  314. "end_page_id": str(self.local_end_page_id),
  315. }
  316. async def _download_local(
  317. self,
  318. client: "httpx.AsyncClient",
  319. source_file_path: Path,
  320. raw_dir: Path,
  321. upload_name: str,
  322. ) -> str:
  323. submit_url = f"{self.local_endpoint}/tasks"
  324. # Keep data as a Mapping so httpx 0.28 builds an async MultipartStream
  325. # and reads the file handle in chunks instead of buffering the payload.
  326. with source_file_path.open("rb") as fh:
  327. files = {"files": (upload_name, fh, "application/octet-stream")}
  328. resp = await client.post(
  329. submit_url,
  330. data=self._local_form_data(),
  331. files=files,
  332. )
  333. raise_for_status_with_detail(
  334. resp,
  335. f"MinerU local task submission for {upload_name!r}",
  336. )
  337. payload = resp.json() if resp.text else {}
  338. task_id = str(payload.get("task_id") or "")
  339. if not task_id:
  340. raise RuntimeError(
  341. f"MinerU local /tasks response missing task_id: {payload}"
  342. )
  343. await self._poll_local_task(client, task_id)
  344. await self._download_zip(
  345. client,
  346. f"{self.local_endpoint}/tasks/{task_id}/result",
  347. raw_dir,
  348. )
  349. return task_id
  350. async def _poll_local_task(
  351. self,
  352. client: "httpx.AsyncClient",
  353. task_id: str,
  354. ) -> None:
  355. poll_url = f"{self.local_endpoint}/tasks/{task_id}"
  356. for _ in range(self.max_polls):
  357. await asyncio.sleep(self.poll_interval)
  358. resp = await client.get(poll_url)
  359. raise_for_status_with_detail(resp, "MinerU local task poll")
  360. payload = resp.json() if resp.text else {}
  361. status = str(payload.get("status") or "").lower()
  362. if status in LOCAL_DONE_STATES:
  363. return
  364. if status in LOCAL_FAILED_STATES:
  365. err = payload.get("error") or payload.get("message") or payload
  366. raise RuntimeError(
  367. f"MinerU local parse failed for task {task_id}: {err}"
  368. )
  369. raise TimeoutError(f"MinerU local task polling timeout: {task_id}")
  370. async def _download_zip(
  371. self,
  372. client: "httpx.AsyncClient",
  373. result_url: str,
  374. raw_dir: Path,
  375. resp: Any = None,
  376. ) -> None:
  377. """Download (or re-use already-fetched response) and extract."""
  378. if resp is None or not hasattr(resp, "content"):
  379. resp = await client.get(result_url)
  380. raise_for_status_with_detail(resp, "MinerU result bundle download")
  381. buf = io.BytesIO(resp.content)
  382. with zipfile.ZipFile(buf) as zf:
  383. # Safe-extract: refuse absolute paths and ``..`` traversal.
  384. for name in zf.namelist():
  385. norm = os.path.normpath(name)
  386. if norm.startswith("..") or os.path.isabs(norm):
  387. raise RuntimeError(f"Refusing zip entry with unsafe path: {name!r}")
  388. zf.extractall(raw_dir)
  389. # Normalize: if the zip nested everything under a single top-level
  390. # dir, hoist its contents up so content_list.json sits at raw_dir
  391. # root. This matches the common MinerU bundle layout.
  392. self._maybe_hoist_single_subdir(raw_dir)
  393. def _maybe_hoist_single_subdir(self, raw_dir: Path) -> None:
  394. entries = [p for p in raw_dir.iterdir() if p.name != "_manifest.json"]
  395. if len(entries) != 1 or not entries[0].is_dir():
  396. return
  397. sub = entries[0]
  398. for child in list(sub.iterdir()):
  399. child.rename(raw_dir / child.name)
  400. try:
  401. sub.rmdir()
  402. except OSError:
  403. pass
  404. def _normalize_raw_bundle(
  405. self,
  406. raw_dir: Path,
  407. source_file_path: Path,
  408. upload_name: str | None = None,
  409. ) -> None:
  410. """Ensure a downloaded bundle has root-level ``content_list.json``.
  411. Official and local MinerU zip archives commonly place parser outputs at
  412. ``<doc>/<parse_method>/<doc>_content_list.json``. The adapter consumes a
  413. canonical root ``content_list.json`` plus optional root ``images/``.
  414. After hoisting we delete the nested originals so the manifest does not
  415. bookkeep two copies (and disk usage doesn't double for big bundles).
  416. Sibling artifacts of the parse subdir (``*.md``, ``middle.json`` etc.)
  417. are also hoisted to ``raw_dir`` root for easier diagnostics.
  418. """
  419. if (raw_dir / CONTENT_LIST_FILENAME).is_file():
  420. return
  421. candidate = _select_content_list_candidate(
  422. raw_dir, source_file_path, upload_name
  423. )
  424. if candidate is None:
  425. return
  426. source_dir = candidate.parent
  427. target_root = raw_dir.resolve()
  428. # Guard: never hoist from above raw_dir (defensive — candidate already
  429. # comes from rglob inside raw_dir, but cheap to verify).
  430. try:
  431. source_dir.resolve().relative_to(target_root)
  432. except ValueError:
  433. shutil.copy2(candidate, raw_dir / CONTENT_LIST_FILENAME)
  434. return
  435. # Move the critical file first; then hoist sibling files/dirs that
  436. # don't already exist at raw_dir root.
  437. shutil.move(str(candidate), str(raw_dir / CONTENT_LIST_FILENAME))
  438. for entry in list(source_dir.iterdir()):
  439. target = raw_dir / entry.name
  440. if target.exists():
  441. continue
  442. shutil.move(str(entry), str(target))
  443. # Best-effort cleanup of the now-empty parse subtree.
  444. cursor = source_dir
  445. while cursor != raw_dir and cursor.is_dir():
  446. try:
  447. cursor.rmdir()
  448. except OSError:
  449. break
  450. cursor = cursor.parent
  451. # ------------------------------------------------------------------
  452. # Manifest construction
  453. # ------------------------------------------------------------------
  454. def _build_and_write_manifest(
  455. self,
  456. raw_dir: Path,
  457. source_file_path: Path,
  458. task_id: str,
  459. upload_name: str,
  460. ) -> Manifest:
  461. source_size, source_hash = compute_size_and_hash(source_file_path)
  462. # Critical file — required.
  463. crit_path = raw_dir / CONTENT_LIST_FILENAME
  464. if not crit_path.is_file():
  465. raise RuntimeError(
  466. f"MinerU bundle missing required {CONTENT_LIST_FILENAME} "
  467. f"after download (raw_dir={raw_dir})"
  468. )
  469. crit_size, crit_hash = compute_size_and_hash(crit_path)
  470. # Other files.
  471. others: list[ManifestFile] = []
  472. total = crit_size
  473. for p in sorted(raw_dir.rglob("*")):
  474. if not p.is_file():
  475. continue
  476. if p.name == "_manifest.json":
  477. continue
  478. rel = p.relative_to(raw_dir).as_posix()
  479. if rel == CONTENT_LIST_FILENAME:
  480. continue
  481. size = p.stat().st_size
  482. others.append(ManifestFile(path=rel, size=size))
  483. total += size
  484. manifest = Manifest(
  485. source_content_hash=source_hash,
  486. source_size_bytes=source_size,
  487. source_filename_at_parse=upload_name,
  488. critical_file=ManifestFile(
  489. path=CONTENT_LIST_FILENAME,
  490. size=crit_size,
  491. sha256=crit_hash,
  492. ),
  493. files=others,
  494. total_size_bytes=total,
  495. task_id=task_id,
  496. api_mode=self.api_mode,
  497. engine_version=self.engine_version,
  498. endpoint_signature=self.endpoint,
  499. options_signature=self._options_signature(),
  500. downloaded_at=datetime.now(timezone.utc).isoformat(),
  501. )
  502. write_manifest(raw_dir, manifest)
  503. return manifest
  504. def _options_signature(self) -> str:
  505. return self._parser_options.signature()
  506. def _find_content_list(payload: Any, content_field: str) -> list[dict] | None:
  507. """Heuristic content_list extractor.
  508. Tries (in order):
  509. 1. The provided dotted path if it lands on a list of dicts.
  510. 2. Direct ``content_list`` / ``content`` / ``items`` / ``result`` keys.
  511. 3. Recursive descent.
  512. """
  513. if isinstance(payload, list):
  514. if payload and all(isinstance(x, dict) for x in payload):
  515. return payload
  516. return None
  517. if not isinstance(payload, dict):
  518. return None
  519. via_field = _get_by_path(payload, content_field)
  520. candidate = _find_content_list(via_field, content_field)
  521. if candidate is not None:
  522. return candidate
  523. for key in ("content_list", "content", "items", "result"):
  524. value = payload.get(key)
  525. candidate = _find_content_list(value, content_field)
  526. if candidate is not None:
  527. return candidate
  528. for value in payload.values():
  529. candidate = _find_content_list(value, content_field)
  530. if candidate is not None:
  531. return candidate
  532. return None
  533. def _bool_form(value: bool) -> str:
  534. return "true" if value else "false"
  535. def _select_official_extract_result(
  536. results: list[Any],
  537. source_filename: str,
  538. ) -> dict[str, Any] | None:
  539. """Pick the extract_result entry that matches the file we uploaded.
  540. Invariant: :meth:`MinerURawClient._download_official` always submits a
  541. single-file batch, so a non-matching ``file_name`` from the API would
  542. indicate either a server response we don't understand or a future
  543. multi-file extension. We fall back to ``dict_results[0]`` to remain
  544. forward-compatible but log a warning so the mismatch is visible.
  545. """
  546. dict_results = [item for item in results if isinstance(item, dict)]
  547. if not dict_results:
  548. return None
  549. source_name = Path(source_filename).name
  550. source_stem = Path(source_filename).stem
  551. for item in dict_results:
  552. file_name = str(item.get("file_name") or item.get("name") or "")
  553. if Path(file_name).name == source_name or Path(file_name).stem == source_stem:
  554. return item
  555. logger.warning(
  556. "[mineru_raw] official extract_result did not contain a match for "
  557. "%r; falling back to the first entry (%r). This is unexpected for "
  558. "a single-file batch.",
  559. source_name,
  560. str(dict_results[0].get("file_name") or dict_results[0].get("name") or ""),
  561. )
  562. return dict_results[0]
  563. def _select_content_list_candidate(
  564. raw_dir: Path,
  565. source_file_path: Path,
  566. upload_name: str | None = None,
  567. ) -> Path | None:
  568. source_stem = Path(upload_name or source_file_path.name).stem
  569. candidates: list[tuple[int, int, str, Path]] = []
  570. for path in raw_dir.rglob("*.json"):
  571. if not path.is_file():
  572. continue
  573. if path.name != CONTENT_LIST_FILENAME and not path.name.endswith(
  574. "_content_list.json"
  575. ):
  576. continue
  577. try:
  578. payload = json.loads(path.read_text(encoding="utf-8"))
  579. except (OSError, json.JSONDecodeError):
  580. continue
  581. content_list = _find_content_list(payload, "content")
  582. if content_list is None:
  583. continue
  584. score = 10
  585. if path.name == CONTENT_LIST_FILENAME:
  586. score = 0
  587. elif path.name == f"{source_stem}_content_list.json":
  588. score = 1
  589. elif path.stem.endswith("_content_list"):
  590. score = 2
  591. depth = len(path.relative_to(raw_dir).parts)
  592. candidates.append((score, depth, path.as_posix(), path))
  593. if not candidates:
  594. return None
  595. candidates.sort()
  596. return candidates[0][3]
  597. __all__ = ["MinerURawClient", "CONTENT_LIST_FILENAME"]