# Source repository: wxcz_admin/lightrag-cn-git
"""MinerU IR builder: ``content_list.json`` (+ images/) → :class:`IRDoc`.

Input contract: a ``*.mineru_raw/`` directory containing at least
``content_list.json``. Optional sibling resources (``images/``,
``middle.json``, ``full.md``, ``layout.pdf``) are kept as-is; this builder
only reads the content list and image asset bytes.

Conversion rules (informed by spec §3-§6):

- ``text`` items with ``text_level>0`` and ``title`` / ``section_header``
  start a NEW block. The heading text is rendered with a markdown ``#``
  prefix matching the level (``# foo``, ``## bar`` …) as the first line of
  the new block's content.
- All other items (``text``, ``list``, ``code``, ``table``, ``image``,
  ``equation``) are MERGED into the current block — their text / placeholder
  is appended (newline-separated) to the heading's block. This mirrors the
  native docx parser's "split-by-heading, merge-everything-under-heading"
  behavior (see ``parser/docx/parse_document.py``).
- Content emitted before the first heading lands in a synthetic
  ``Preface/Uncategorized`` block at level 0.
- ``list`` items joined with ``\n``; ``code`` body taken from ``code_body``
  if present.
- ``table`` → IRTable + ``{{TBL:k}}`` placeholder. ``table_body`` (HTML) or
  the ``rows`` field (2D array) become ``html`` / ``rows`` on IRTable.
  ``num_rows`` / ``num_cols`` are taken from MinerU if present, otherwise
  inferred. ``header`` populates ``table_header`` (per spec §5).
- ``image`` / ``picture`` / ``drawing`` → IRDrawing + ``{{IMG:k}}`` placeholder.
  Asset bytes are referenced via ``img_path`` relative to the raw dir.
- ``equation`` → IREquation + ``{{EQ:k}}`` (block) / ``{{EQI:k}}`` (inline)
  placeholder. ``is_block`` is False only when MinerU's ``text_format`` is
  ``inline`` / ``inline_equation``; any other value, including a missing
  flag, is treated as a block equation. Items with empty latex are dropped.
  Apart from surrounding whitespace, the latex string is preserved verbatim
  (including any ``$$``/``$`` wrappers) so ``blocks.jsonl``'s ``<equation>``
  body matches MinerU's raw output; the writer strips the wrappers when
  persisting ``equations.json`` content.
- ``page_idx`` + ``bbox`` → ``IRPosition(type="bbox", anchor=page, range=[x0,y0,x1,y1])``.
  Empty/missing bbox is acceptable; positions accumulate on the merged block.
- ``IRDoc.split_option`` records the MinerU engine version when available.
- ``IRDoc.bbox_attributes`` defaults to ``{"origin":"LEFTTOP","max":1000}``
  reflecting MinerU's PDF coordinate convention. Operators may override
  via ``MINERU_BBOX_ATTRIBUTES`` (JSON string).
"""

from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

from lightrag.sidecar.ir import (
    AssetSpec,
    IRBlock,
    IRDoc,
    IRDrawing,
    IREquation,
    IRPosition,
    IRTable,
)
from lightrag.utils import logger


# Heading given to the synthetic level-0 block that collects any content
# appearing before the first real heading.
PREFACE_HEADING = "Preface/Uncategorized"
# File name used as the prefix of ``self_ref`` pointers into the content list.
CONTENT_LIST_FILENAME = "content_list.json"


class MinerUIRBuilder:
    """Stateless except for env-driven config. Reusable across calls."""

    def __init__(self) -> None:
        self.engine_version = os.getenv("MINERU_ENGINE_VERSION", "").strip()
        # Mirror MinerURawClient.__init__: when this is set, the downloader
        # stores ALL referenced images (including relative ones) under
        # ``images/<basename>``. The builder has to look in the same place.
        self.image_url_template = os.getenv("MINERU_IMAGE_URL_TEMPLATE", "").strip()
        self.bbox_attributes = self._load_bbox_attributes_env()

    def _load_bbox_attributes_env(self) -> dict[str, Any]:
        default = {"origin": "LEFTTOP", "max": 1000}
        raw = os.getenv("MINERU_BBOX_ATTRIBUTES", "").strip()
        if not raw:
            return default
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as exc:
            logger.warning(
                "[mineru_ir_builder] MINERU_BBOX_ATTRIBUTES is not valid JSON "
                "(%s); falling back to default %s",
                exc,
                default,
            )
            return default
        if not isinstance(parsed, dict):
            logger.warning(
                "[mineru_ir_builder] MINERU_BBOX_ATTRIBUTES must decode to a JSON "
                "object, got %s; falling back to default %s",
                type(parsed).__name__,
                default,
            )
            return default
        return parsed

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def normalize_from_workdir(
        self,
        raw_dir: Path,
        *,
        document_name: str,
    ) -> IRDoc:
        """Read ``raw_dir/content_list.json`` and emit an IRDoc.

        ``document_name`` is the canonical filename (e.g. ``foo.pdf``) used
        for ``meta.document_name``; resolved by the caller from the parser
        hint chain.
        """
        content_list_path = raw_dir / "content_list.json"
        if not content_list_path.is_file():
            raise FileNotFoundError(
                f"MinerU raw bundle missing content_list.json at {raw_dir}"
            )
        content_list = json.loads(content_list_path.read_text(encoding="utf-8"))
        if not isinstance(content_list, list):
            raise ValueError(
                f"MinerU content_list.json malformed (not a JSON array) at {raw_dir}"
            )
        return self._normalize_content_list(
            content_list, raw_dir, document_name=document_name
        )

    # ------------------------------------------------------------------
    # Core
    # ------------------------------------------------------------------

    def _normalize_content_list(
        self,
        content_list: list[Any],
        raw_dir: Path,
        *,
        document_name: str,
    ) -> IRDoc:
        """Fold a MinerU content list into heading-delimited IR blocks.

        Items are visited in source order. A heading normally flushes the
        in-flight block and opens a new one; every other item is merged into
        the block that is currently open. Content seen before the first
        heading is collected under ``PREFACE_HEADING`` at level 0.

        Args:
            content_list: Decoded ``content_list.json``; entries that are not
                dicts are skipped.
            raw_dir: MinerU raw directory, used to resolve image assets.
            document_name: Canonical filename; supplies ``document_format``
                (lower-cased suffix) and the fallback ``doc_title`` (stem).

        Returns:
            The assembled :class:`IRDoc`.
        """
        document_format = Path(document_name).suffix.lower().lstrip(".")

        blocks: list[IRBlock] = []
        assets: list[AssetSpec] = []
        # Refs already present in ``assets``; kept as a set so the per-image
        # duplicate check stays O(1) instead of rebuilding a set every time.
        declared_asset_refs: set[str] = set()
        seen_assets: dict[str, str] = {}  # ref → suggested_name
        doc_title = ""
        placeholder_counter = 0

        def _next_key(prefix: str) -> str:
            """Return the next placeholder key; one counter spans all prefixes."""
            nonlocal placeholder_counter
            placeholder_counter += 1
            return f"{prefix}{placeholder_counter}"

        # Heading hierarchy stack — index = level-1 (level 1 lives at [0]).
        # Skipped levels are stored as "" so the index invariant holds.
        heading_stack: list[str] = []

        # Current-block accumulator. The block is materialized when the next
        # heading arrives (or at end-of-document). The initial block is the
        # synthetic "Preface/Uncategorized" container at level 0.
        cb_lines: list[str] = []
        cb_tables: list[IRTable] = []
        cb_drawings: list[IRDrawing] = []
        cb_equations: list[IREquation] = []
        # Positions are split into two channels:
        # - ``cb_page_set`` collects ``page_idx`` of bbox-less items; at flush
        #   each unique page becomes one anchor-only summary ``IRPosition``.
        # - ``cb_bbox_positions`` keeps one fine-grained position per item that
        #   carried a parseable bbox (anchor + range), in source order, with
        #   no deduplication.
        cb_page_set: set[str] = set()
        cb_bbox_positions: list[IRPosition] = []
        cb_heading = PREFACE_HEADING
        cb_level = 0
        cb_parents: list[str] = []
        # ``cb_has_body`` flips True the moment we accumulate any non-heading
        # payload into the current block. While it stays False, an adjacent
        # deeper heading is folded into this block as a body line (aligning
        # with the native docx parser's behaviour for back-to-back headings).
        cb_has_body = False

        def _record_position(item: dict) -> None:
            """Route an item's positional info into the right channel.

            Items with a parseable ``bbox`` produce one fine-grained
            IRPosition appended to ``cb_bbox_positions`` (no dedupe).
            Otherwise, ``page_idx`` (if any) is added to ``cb_page_set``
            and emitted as a single anchor-only summary entry at flush.
            """
            bbox_pos = _extract_bbox_position(item)
            if bbox_pos is not None:
                cb_bbox_positions.append(bbox_pos)
                return
            page = _extract_page_anchor(item)
            if page is not None:
                cb_page_set.add(page)

        def _flush_block() -> None:
            """Emit the in-flight block if it carries any content."""
            nonlocal cb_lines, cb_tables, cb_drawings, cb_equations
            nonlocal cb_page_set, cb_bbox_positions, cb_has_body
            has_payload = bool(cb_lines or cb_tables or cb_drawings or cb_equations)
            if not has_payload:
                return
            content = "\n".join(line for line in cb_lines if line)
            if not content.strip() and not (cb_tables or cb_drawings or cb_equations):
                # Reset and skip — nothing meaningful to emit.
                cb_lines = []
                cb_page_set = set()
                cb_bbox_positions = []
                cb_has_body = False
                return
            # Page-only summary anchors come first, then per-item bboxes.
            positions = [
                IRPosition(type="bbox", anchor=p)
                for p in _sort_page_anchors(cb_page_set)
            ] + list(cb_bbox_positions)
            blocks.append(
                IRBlock(
                    content_template=content,
                    heading=cb_heading,
                    level=cb_level,
                    parent_headings=list(cb_parents),
                    positions=positions,
                    tables=list(cb_tables),
                    drawings=list(cb_drawings),
                    equations=list(cb_equations),
                )
            )
            cb_lines = []
            cb_tables = []
            cb_drawings = []
            cb_equations = []
            cb_page_set = set()
            cb_bbox_positions = []
            cb_has_body = False

        def _open_block(heading: str, level: int, parents: list[str]) -> None:
            """Start a new block whose first body line is the heading itself."""
            nonlocal cb_heading, cb_level, cb_parents
            cb_heading = heading
            cb_level = level
            cb_parents = parents
            # Render the heading line into the block body so the merged
            # text reads like markdown (``# Foo`` / ``## Bar`` / …).
            md_prefix = "#" * max(level, 1)
            cb_lines.append(f"{md_prefix} {heading}")

        def _append_text(text: str) -> bool:
            """Append ``text`` to the current block body and return whether
            anything was actually written. Callers use the return value to
            decide whether to also record the item's source position — an
            empty text item must NOT leak its ``page_idx`` to the block.
            """
            nonlocal cb_has_body
            if not text:
                return False
            cb_lines.append(text)
            cb_has_body = True
            return True

        def _merge_heading_as_body(heading: str, level: int) -> None:
            """Fold an adjacent deeper heading into the current block.

            The line keeps its markdown ``#`` prefix so the rendered block
            still reads as ``# Section\n## Subsection``. Does NOT flip
            ``cb_has_body`` — successive headings can keep folding until a
            real body item lands.
            """
            md_prefix = "#" * max(level, 1)
            cb_lines.append(f"{md_prefix} {heading}")

        for item_index, item in enumerate(content_list):
            if not isinstance(item, dict):
                continue
            item_type = str(item.get("type") or item.get("label") or "").lower()

            heading_text, heading_level = _detect_heading(item, item_type)
            if heading_text:
                # Heading hierarchy is updated unconditionally so deeper
                # parents resolve correctly once the next real body item
                # opens a fresh block.
                depth = max(heading_level - 1, 0)
                heading_stack = heading_stack[:depth]
                # Pad skipped levels (e.g. H1 → H3) with "" so this heading
                # lands at index ``level - 1``. Without the padding a later
                # heading of the same level would inherit this one as parent.
                heading_stack.extend([""] * (depth - len(heading_stack)))
                parents = [h for h in heading_stack if h]
                heading_stack.append(heading_text)

                # The first level-1 heading doubles as the document title.
                if not doc_title and heading_level == 1:
                    doc_title = heading_text

                # Adjacency merge: previous block is a real heading with no
                # body yet AND the new heading is strictly deeper — append
                # this heading as body to the existing block instead of
                # flushing. (Preface, level=0, is never merged into.)
                if cb_level > 0 and not cb_has_body and heading_level > cb_level:
                    _merge_heading_as_body(heading_text, heading_level)
                else:
                    _flush_block()
                    _open_block(heading_text, heading_level, parents)
                _record_position(item)
                continue

            if item_type == "text":
                if _append_text(_coerce_text(item)):
                    _record_position(item)
                continue

            if item_type == "list":
                items = item.get("list_items")
                if isinstance(items, list):
                    text = "\n".join(str(x) for x in items if str(x).strip())
                else:
                    text = _coerce_text(item)
                if _append_text(text):
                    _record_position(item)
                continue

            if item_type == "code":
                code_body = item.get("code_body")
                # Only a string ``code_body`` may enter the line buffer; any
                # other payload would break the ``"\n".join`` at flush time.
                if isinstance(code_body, str) and code_body:
                    code_text = code_body
                else:
                    code_text = _coerce_text(item)
                if _append_text(code_text):
                    _record_position(item)
                continue

            if item_type == "equation":
                latex_raw = _coerce_text(item)
                if not latex_raw:
                    # Spec compliance fix: empty equation must not enter sidecar.
                    continue
                # Preserve MinerU's raw latex (including any ``$$``/``$``
                # wrappers); the writer strips them when emitting
                # equations.json so blocks.jsonl shows the raw form while
                # the per-equation sidecar holds clean latex.
                latex = latex_raw.strip()
                is_block = _is_block_equation(item)
                caption = str(item.get("caption") or "")
                placeholder = _next_key("eq")
                token = "EQ" if is_block else "EQI"
                cb_equations.append(
                    IREquation(
                        placeholder_key=placeholder,
                        latex=latex,
                        is_block=is_block,
                        caption=caption,
                        footnotes=_as_str_list(item.get("footnotes")),
                        self_ref=_content_list_self_ref(item_index) if is_block else "",
                    )
                )
                cb_lines.append(f"{{{{{token}:{placeholder}}}}}")
                cb_has_body = True
                _record_position(item)
                continue

            if item_type == "table":
                table = self._build_ir_table(item)
                if table is None:
                    # Empty body — _build_ir_table already logged the drop.
                    # Skip placeholder allocation and position recording so
                    # the misidentified item leaves no trace in the IR.
                    continue
                placeholder = _next_key("tb")
                table.placeholder_key = placeholder
                table.self_ref = _content_list_self_ref(item_index)
                cb_tables.append(table)
                cb_lines.append(f"{{{{TBL:{placeholder}}}}}")
                cb_has_body = True
                _record_position(item)
                continue

            if item_type in {"image", "picture", "drawing"}:
                drawing, asset = self._build_ir_drawing(item, raw_dir, seen_assets)
                placeholder = _next_key("im")
                drawing.placeholder_key = placeholder
                drawing.self_ref = _content_list_self_ref(item_index)
                if asset is not None and asset.ref not in declared_asset_refs:
                    declared_asset_refs.add(asset.ref)
                    assets.append(asset)
                cb_drawings.append(drawing)
                cb_lines.append(f"{{{{IMG:{placeholder}}}}}")
                cb_has_body = True
                _record_position(item)
                continue

            # Fallback: serialize unknown items as plain text so we don't
            # silently drop information. Position only recorded when the
            # fallback actually contributed text — empty unknown items must
            # not leak their page_idx into the current block.
            if _append_text(_coerce_text(item)):
                _record_position(item)

        _flush_block()

        if not doc_title:
            doc_title = Path(document_name).stem or document_name

        split_option: dict[str, Any] = {}
        if self.engine_version:
            split_option["engine_version"] = self.engine_version
        # Reserved hook for later: detect OCR flag from middle.json / config.

        return IRDoc(
            document_name=document_name,
            document_format=document_format,
            doc_title=doc_title,
            split_option=split_option,
            blocks=blocks,
            assets=assets,
            bbox_attributes=dict(self.bbox_attributes),
        )

    # ------------------------------------------------------------------
    # Tables / drawings
    # ------------------------------------------------------------------

    def _build_ir_table(self, item: dict) -> IRTable | None:
        """Convert one MinerU ``table`` item into an :class:`IRTable`.

        The body is looked up under ``rows`` first and ``table_body`` second;
        the first field that yields visible content wins, so an empty
        ``rows`` no longer hides a populated ``table_body``. A body may be a
        2D list, a JSON-encoded list, an HTML string, or a dict carrying
        ``grid`` / ``rows``.

        Returns:
            The table with ``placeholder_key`` left empty for the caller to
            fill, or ``None`` when no body field carries any content.
        """

        def _parse_body(body: Any) -> tuple[list[list[str]] | None, str | None]:
            """Return ``(rows, html)`` for one candidate body value."""
            if isinstance(body, list):
                return _normalize_grid(body), None
            if isinstance(body, str):
                stripped = body.strip()
                if stripped.startswith("[") and stripped.endswith("]"):
                    # Looks like a JSON array; fall back to HTML when it
                    # does not actually decode to a list.
                    try:
                        decoded = json.loads(stripped)
                    except json.JSONDecodeError:
                        decoded = None
                    if isinstance(decoded, list):
                        return _normalize_grid(decoded), None
                return None, stripped or None
            if isinstance(body, dict):
                grid = body.get("grid") or body.get("rows")
                if isinstance(grid, list):
                    return _normalize_grid(grid), None
                return None, json.dumps(body, ensure_ascii=False)
            return None, None

        def _to_count(value: Any, inferred: int) -> int:
            """Use MinerU's count when it parses to non-zero, else ``inferred``."""
            try:
                count = int(value or 0)
            except (TypeError, ValueError, OverflowError):
                count = 0
            return count or inferred

        rows: list[list[str]] | None = None
        html: str | None = None
        body: Any = None
        for field in ("rows", "table_body"):
            candidate = item.get(field)
            if candidate is None:
                continue
            body = candidate
            rows, html = _parse_body(candidate)
            if _ir_table_body_has_content(rows, html):
                break

        # MinerU occasionally emits table items with no usable body (e.g. when
        # a page number or blank region is misidentified as a table). Dropping
        # them here keeps the sidecar free of items that would later trip the
        # analyze worker's "missing table content" hard-failure path.
        if not _ir_table_body_has_content(rows, html):
            logger.debug(
                "[mineru_ir_builder] dropping empty table item "
                "(body type=%s, num_rows=%s, num_cols=%s)",
                type(body).__name__,
                item.get("num_rows"),
                item.get("num_cols"),
            )
            return None

        inferred_rows = len(rows) if rows else 0
        inferred_cols = max((len(r) for r in rows), default=0) if rows else 0
        num_rows = _to_count(item.get("num_rows"), inferred_rows)
        num_cols = _to_count(item.get("num_cols"), inferred_cols)

        # An explicit ``caption`` wins; otherwise take the first entry of
        # the ``table_caption`` list.
        captions = item.get("table_caption")
        caption = str(item.get("caption") or "")
        if not caption and isinstance(captions, list) and captions:
            caption = str(captions[0])

        table_header_raw = item.get("header")
        table_header: list[list[str]] | None = None
        if isinstance(table_header_raw, list) and table_header_raw:
            table_header = _normalize_grid(table_header_raw)

        return IRTable(
            placeholder_key="",  # filled by caller
            rows=rows,
            html=html,
            num_rows=num_rows,
            num_cols=num_cols,
            caption=caption,
            footnotes=_as_str_list(item.get("table_footnote") or item.get("footnotes")),
            table_header=table_header,
        )

    def _build_ir_drawing(
        self,
        item: dict,
        raw_dir: Path,
        seen: dict[str, str],
    ) -> tuple[IRDrawing, AssetSpec | None]:
        """Convert one image-like item into an IRDrawing plus optional asset.

        Args:
            item: MinerU content-list entry (``image`` / ``picture`` /
                ``drawing``).
            raw_dir: MinerU raw directory that image refs are resolved in.
            seen: Mutable ``ref → suggested_name`` registry shared across
                calls; updated in place when a new ref is declared.

        Returns:
            ``(drawing, asset)``. ``asset`` is ``None`` when the item has no
            image path or its ref was already declared through ``seen``. The
            drawing's ``placeholder_key`` is left empty for the caller.
        """
        img_path = str(item.get("img_path") or item.get("path") or "")
        src_val = str(item.get("src") or "")
        captions = item.get("image_caption") or item.get("captions")
        caption = str(item.get("caption") or "")
        if not caption and isinstance(captions, list) and captions:
            caption = str(captions[0])

        # Derive the format from the URL *path* for http(s) refs so a query
        # string or fragment (``a.png?sig=…``) does not leak into ``fmt``.
        if img_path.startswith(("http://", "https://")):
            suffix_source = urlparse(img_path).path
        else:
            suffix_source = img_path
        fmt = Path(suffix_source).suffix.lower().lstrip(".") if suffix_source else ""
        if not fmt:
            fmt = str(item.get("format") or "")

        asset: AssetSpec | None = None
        ref = img_path
        # A ref that is already in ``seen`` was declared by an earlier item;
        # only first-time refs produce an AssetSpec.
        if img_path and img_path not in seen:
            # Asset source: file on disk inside raw_dir. ``img_path`` is
            # untrusted (it comes from MinerU's content_list.json or a
            # downloaded zip), so we go through a safe resolver that
            # refuses to escape ``raw_dir`` and mirrors the downloader's
            # storage layout for absolute-URL / templated references.
            local_path = _safe_local_asset_path(
                raw_dir,
                img_path,
                image_url_template=self.image_url_template,
            )
            suggested_name = _suggested_asset_name(img_path, fmt, len(seen))
            # ``source`` stays None when the file is missing or unsafe.
            has_file = local_path is not None and local_path.is_file()
            asset = AssetSpec(
                ref=ref,
                suggested_name=suggested_name,
                source=local_path if has_file else None,
            )
            seen[ref] = suggested_name

        drawing = IRDrawing(
            placeholder_key="",  # filled by caller
            asset_ref=ref,
            fmt=fmt,
            caption=caption,
            footnotes=_as_str_list(item.get("image_footnote") or item.get("footnotes")),
            src=src_val,
        )
        return drawing, asset


# ----------------------------------------------------------------------
# helpers
# ----------------------------------------------------------------------


def _detect_heading(item: dict, item_type: str) -> tuple[str, int]:
    """Return ``(heading_text, level)`` if ``item`` is a heading, else ``("", 0)``.

    A heading is either an explicit ``title``/``section_header`` block, or a
    ``text`` block whose ``text_level`` is positive (MinerU's convention).
    """
    if item_type in {"title", "section_header"}:
        text = _coerce_text(item).strip()
        level = max(int(item.get("text_level") or item.get("level") or 1), 1)
        return text, level
    if item_type == "text":
        try:
            tl = int(item.get("text_level") or 0)
        except (TypeError, ValueError):
            tl = 0
        if tl > 0:
            return _coerce_text(item).strip(), tl
    return "", 0


def _coerce_text(item: dict) -> str:
    for key in ("text", "content", "body", "code_body"):
        val = item.get(key)
        if isinstance(val, str) and val.strip():
            return val
    return ""


def _as_str_list(value: Any) -> list[str]:
    if value is None:
        return []
    if isinstance(value, list):
        return [str(x) for x in value if str(x).strip()]
    s = str(value).strip()
    return [s] if s else []


def _content_list_self_ref(index: int) -> str:
    """Build the ``<content list file>#/<index>`` pointer for one item."""
    return "{}#/{}".format(CONTENT_LIST_FILENAME, index)


def _normalize_grid(grid: Any) -> list[list[str]]:
    out: list[list[str]] = []
    if not isinstance(grid, list):
        return out
    for row in grid:
        if not isinstance(row, list):
            continue
        out_row: list[str] = []
        for cell in row:
            if isinstance(cell, dict):
                out_row.append(str(cell.get("text", "")).strip())
            else:
                out_row.append(str(cell).strip())
        out.append(out_row)
    return out


def _ir_table_body_has_content(rows: list[list[str]] | None, html: str | None) -> bool:
    """True iff the parsed table body carries any visible cell text or HTML."""
    if html and html.strip():
        return True
    if rows:
        for row in rows:
            for cell in row:
                if isinstance(cell, str) and cell.strip():
                    return True
    return False


def _is_block_equation(item: dict) -> bool:
    """Heuristic: MinerU's ``text_format`` distinguishes block vs inline.

    Fallback when absent: treat as block (most MinerU equation items in
    PDF context represent display equations); inline equations are usually
    embedded inside ``text`` items rather than first-class ``equation``
    items.
    """
    fmt = str(item.get("text_format") or "").lower()
    if fmt in {"inline", "inline_equation"}:
        return False
    if fmt in {"block", "block_equation", "display"}:
        return True
    return True


def _extract_page_anchor(item: dict) -> str | None:
    """Return a 1-based page anchor from MinerU's ``page_idx`` / ``page``.

    Always returns a string so ``blocks.jsonl`` carries a uniform anchor
    type across Roman / letter / numeric page labels. Integers are bumped
    to 1-based (``page_idx=0`` → ``"1"``); strings are stripped and passed
    through verbatim. Returns ``None`` when no usable page info is present.
    """
    page_raw = item.get("page_idx")
    if page_raw is None:
        page_raw = item.get("page")
    if isinstance(page_raw, bool):
        # bool is a subclass of int — guard so True/False don't sneak in.
        return None
    if isinstance(page_raw, int):
        return str(page_raw + 1 if page_raw >= 0 else page_raw)
    if isinstance(page_raw, str) and page_raw.strip():
        return page_raw.strip()
    return None


def _sort_page_anchors(pages: set[str]) -> list[str]:
    """Order page anchors using book pagination convention.

    Non-numeric labels (Roman preface pages ``i``/``ii``/``iv``…, letter
    pages like ``A``, ``B-1``) come first in lexical order; numeric labels
    follow, sorted by their integer value so ``"2"`` precedes ``"10"``.
    Mixing both kinds is safe — the bucketed key avoids the ``TypeError``
    that ``sorted({"ii", "1"})`` raises when ints and strings mix.
    """
    non_numeric = sorted(p for p in pages if not p.isdigit())
    numeric = sorted((p for p in pages if p.isdigit()), key=int)
    return non_numeric + numeric


def _extract_bbox_position(item: dict) -> IRPosition | None:
    """Build a fine-grained ``IRPosition`` when ``bbox`` is parseable.

    Returns ``None`` when ``bbox`` is missing or malformed; the caller then
    falls back to page-only tracking via :func:`_extract_page_anchor`.
    """
    bbox = item.get("bbox")
    if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
        return None
    try:
        coords = [float(x) for x in bbox[:4]]
    except (TypeError, ValueError):
        return None
    return IRPosition(type="bbox", anchor=_extract_page_anchor(item), range=coords)


def _safe_local_asset_path(
    raw_dir: Path,
    img_path: str,
    *,
    image_url_template: str = "",
) -> Path | None:
    """Resolve ``img_path`` to a concrete file location inside ``raw_dir``.

    ``img_path`` comes from MinerU's ``content_list.json`` and is therefore
    untrusted. This resolver mirrors :meth:`MinerURawClient._fetch_one_image`
    storage rules so the builder always looks where the downloader wrote
    the file:

    - absolute http(s) URLs and absolute filesystem paths
      → ``raw_dir/images/<basename>``;
    - any ref when ``MINERU_IMAGE_URL_TEMPLATE`` is configured (the
      downloader routes ALL refs — including relative ones — through
      :meth:`_image_dest_rel`) → ``raw_dir/images/<basename>``;
    - otherwise relative paths resolve under ``raw_dir`` with ``..``
      traversal refused and a final ``Path.relative_to`` check.

    Returns ``None`` when the candidate is unsafe or cannot be expressed
    inside ``raw_dir``. The caller treats ``None`` the same as "file missing"
    — the drawing tag still gets written, but no bytes are copied.
    """
    if not img_path:
        return None

    if img_path.startswith(("http://", "https://")):
        name = Path(urlparse(img_path).path).name
        return raw_dir / "images" / name if name else None

    if os.path.isabs(img_path):
        # Absolute filesystem path in img_path is never trusted to point
        # outside raw_dir; mirror the downloader's basename rule.
        name = Path(img_path).name
        return raw_dir / "images" / name if name else None

    if image_url_template:
        # Templated mode: downloader stored every ref (incl. relative) at
        # images/<basename>, so we must look there too.
        name = Path(img_path).name
        return raw_dir / "images" / name if name else None

    normalized = os.path.normpath(img_path)
    if normalized.startswith("..") or os.path.isabs(normalized):
        return None
    candidate = (raw_dir / normalized).resolve()
    try:
        candidate.relative_to(raw_dir.resolve())
    except ValueError:
        return None
    return candidate


def _suggested_asset_name(img_path: str, fmt: str, seen_count: int) -> str:
    """Pick an in-assets-dir filename for an asset.

    For URL refs, use the URL path's basename so we get a useful filename
    (``foo.png`` rather than the whole URL). For local refs, the regular
    basename. Falls back to ``image-<n>[.fmt]`` when nothing usable.
    """
    if img_path.startswith(("http://", "https://")):
        name = Path(urlparse(img_path).path).name
    else:
        name = Path(img_path).name
    if name:
        return name
    return f"image-{seen_count + 1}{('.' + fmt) if fmt else ''}"


# Public API: only the builder class is exported by ``from … import *``.
__all__ = ["MinerUIRBuilder"]