writer.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627
  1. """Spec-compliant sidecar writer.
  2. This module is the *single executable specification* of the LightRAG sidecar
  3. format (``docs/LightRAGSidecarFormat-zh.md``). Engine adapters hand it an
  4. :class:`IRDoc`; it emits the ``*.parsed/`` directory.
  5. Responsibilities (none of these belong in adapters):
  6. - id allocation: ``tb-/im-/eq-<doc_hash>-NNNN`` (4-digit zero-padded,
  7. global per-doc sequence)
  8. - placeholder rendering: ``{{TBL:k}}`` / ``{{IMG:k}}`` / ``{{EQ:k}}`` /
  9. ``{{EQI:k}}`` → spec-shaped XML-style tags
  10. - blockid computation: ``md5(doc_id:block_index:heading:content)``
  11. - assets dir creation and file copying; ``asset_dir`` flag in meta is
  12. derived from "directory exists and is non-empty"
  13. - merged_text + document_hash
  14. - meta line shape (spec §3.1)
  15. - conditional writes: ``tables.json`` / ``drawings.json`` / ``equations.json``
  16. appear only when their dict is non-empty
  17. """
  18. from __future__ import annotations
  19. import hashlib
  20. import json
  21. import re
  22. import shutil
  23. from datetime import datetime, timezone
  24. from pathlib import Path
  25. from typing import Any
  26. from lightrag.constants import FULL_DOCS_FORMAT_LIGHTRAG
  27. from lightrag.sidecar.ir import (
  28. AssetSpec,
  29. IRBlock,
  30. IRDoc,
  31. IRDrawing,
  32. IREquation,
  33. IRTable,
  34. )
  35. from lightrag.sidecar.placeholders import (
  36. render_drawing_tag,
  37. render_equation_tag,
  38. render_table_tag,
  39. render_template,
  40. table_body_for_rows,
  41. )
  42. from lightrag.utils import logger
  43. # ---------------------------------------------------------------------------
  44. # Public entry point
  45. # ---------------------------------------------------------------------------
  _VALID_BLOCK_DRAWING_PATH_STYLES = {"with_prefix", "basename_only"}


  def _write_or_remove_sidecar_json(
      path: Path,
      root_key: str,
      items: dict[str, dict[str, Any]],
  ) -> None:
      """Write one per-modality sidecar JSON, or remove a leftover copy.

      A non-empty ``items`` dict is written as
      ``{"version": "1.0", <root_key>: items}`` followed by a trailing
      newline (POSIX text-file convention; matches ``blocks.jsonl``).

      An empty ``items`` dict means the file must not exist. A copy left
      behind by an earlier run into the same directory (possible when the
      caller passes ``clean_parsed_dir=False``) is unlinked so it cannot
      contradict the ``*_file`` flags written to the meta line.

      Args:
          path: Target ``<base>.<modality>.json`` path.
          root_key: Top-level key that holds ``items`` in the JSON document.
          items: Sidecar entries keyed by their allocated id.
      """
      if not items:
          try:
              path.unlink(missing_ok=True)
          except OSError as exc:
              # Best effort: failing to remove a leftover must not abort a
              # run whose blocks.jsonl has already been written.
              logger.warning(
                  "[sidecar] could not remove stale sidecar file %s: %s",
                  path,
                  exc,
              )
          return
      path.write_text(
          json.dumps(
              {"version": "1.0", root_key: items},
              ensure_ascii=False,
              indent=2,
          )
          + "\n",
          encoding="utf-8",
      )


  def write_sidecar(
      ir: IRDoc,
      *,
      parsed_dir: Path,
      doc_id: str,
      engine: str,
      clean_parsed_dir: bool = True,
      block_drawing_path_style: str = "with_prefix",
  ) -> dict[str, Any]:
      """Emit a spec-compliant ``*.parsed/`` directory from an IR.

      Args:
          ir: Document IR produced by an engine adapter.
          parsed_dir: Output directory. By default cleared and recreated; the
              caller is responsible for placing it under
              ``__parsed__/<base>.parsed/``.
          doc_id: ``doc-<md5>``; ``doc_hash`` for sidecar ids is the 32-char
              tail after stripping the ``doc-`` prefix.
          engine: One of ``native`` / ``mineru`` / ``docling`` / ``legacy``;
              written verbatim to ``meta.parse_engine``.
          clean_parsed_dir: When True (default) the writer ``rmtree``s
              ``parsed_dir`` before writing. Set to False when the caller has
              already pre-populated the directory with side artifacts that
              must survive — e.g. the native docx adapter pre-extracts image
              bytes into ``<base>.blocks.assets/`` before the writer runs,
              and passing ``AssetSpec.source=None`` lets the writer record
              them without copying. Per-modality JSON files for which this
              run produced no entries are removed even in this mode, so the
              directory never contradicts the meta flags.
          block_drawing_path_style: How ``<drawing path="...">`` in
              ``blocks.jsonl`` resolves the asset path. ``"with_prefix"``
              (default) renders ``<base>.blocks.assets/<filename>`` — matches
              the path stored in ``drawings.json``. ``"basename_only"``
              renders just ``<filename>``; legacy native docx convention
              (downstream consumers read the file path from ``drawings.json``,
              not from this attribute, so the basename-only form is purely
              cosmetic but kept for byte-equivalence with the original
              adapter).

      Returns:
          Dict shaped like the pipeline's existing ``parsed_data`` payload:
          ``{doc_id, file_path, parse_format, content, blocks_path}``.
          ``file_path`` is ``ir.document_name``; the caller resolves it to the
          actual on-disk path it wants persisted.

      Raises:
          ValueError: If ``block_drawing_path_style`` is not one of the
              supported styles.
      """
      if block_drawing_path_style not in _VALID_BLOCK_DRAWING_PATH_STYLES:
          allowed = ", ".join(sorted(_VALID_BLOCK_DRAWING_PATH_STYLES))
          raise ValueError(
              f"block_drawing_path_style must be one of {allowed}, "
              f"got {block_drawing_path_style!r}"
          )

      if clean_parsed_dir and parsed_dir.exists():
          shutil.rmtree(parsed_dir)
      parsed_dir.mkdir(parents=True, exist_ok=True)

      base_name = Path(ir.document_name).stem or ir.document_name
      blocks_path = parsed_dir / f"{base_name}.blocks.jsonl"
      tables_path = parsed_dir / f"{base_name}.tables.json"
      drawings_path = parsed_dir / f"{base_name}.drawings.json"
      equations_path = parsed_dir / f"{base_name}.equations.json"
      assets_dir = parsed_dir / f"{base_name}.blocks.assets"

      # ``clean_parsed_dir=False`` is reserved for callers that pre-populate
      # the directory with artifacts that must survive (e.g. the native docx
      # adapter pre-extracts assets). If a stale ``blocks.jsonl`` is sitting
      # there, the caller forgot to pre-clean — warn so the leftover doesn't
      # get silently overwritten with partially-stale neighbors.
      if not clean_parsed_dir and blocks_path.exists():
          logger.warning(
              "[sidecar] clean_parsed_dir=False but %s already exists; "
              "caller is expected to pre-clean before invoking write_sidecar",
              blocks_path,
          )

      # Stage 1: realize assets first so drawings can carry resolved paths.
      asset_paths = _materialize_assets(ir.assets, assets_dir)

      # Stage 2: walk blocks, allocate ids, render templates, accumulate
      # sidecar item dicts and blocks.jsonl lines.
      doc_hash = doc_id.removeprefix("doc-")
      tables: dict[str, dict[str, Any]] = {}
      drawings: dict[str, dict[str, Any]] = {}
      equations: dict[str, dict[str, Any]] = {}
      blocks_lines: list[str] = []
      merged_parts: list[str] = []
      table_seq = 0
      drawing_seq = 0
      equation_seq = 0
      asset_prefix = f"{assets_dir.name}/"

      # ``block_index`` in the blockid hash refers to the position in the
      # SOURCE block list (``enumerate`` over ``ir.blocks``), not the emitted
      # position. Otherwise an editor turning a previously-non-empty block
      # into an empty one — which then gets dropped — would shift the
      # blockids of every block after it; we want stable ids across edits.
      for block_index, block in enumerate(ir.blocks):
          # Allocate ids for items declared on this block. Order: tables ->
          # drawings -> equations (per-block deterministic; the global
          # sequence advances across blocks).
          table_id_by_key: dict[str, str] = {}
          for table in block.tables:
              table_seq += 1
              tb_id = f"tb-{doc_hash}-{table_seq:04d}"
              table_id_by_key[table.placeholder_key] = tb_id

          drawing_id_by_key: dict[str, str] = {}
          for drawing in block.drawings:
              drawing_seq += 1
              im_id = f"im-{doc_hash}-{drawing_seq:04d}"
              drawing_id_by_key[drawing.placeholder_key] = im_id

          equation_id_by_key: dict[str, str] = {}
          for equation in block.equations:
              # Inline equations never receive an id or a sidecar entry.
              if not equation.is_block:
                  continue
              equation_seq += 1
              eq_id = f"eq-{doc_hash}-{equation_seq:04d}"
              equation_id_by_key[equation.placeholder_key] = eq_id

          # Render placeholder template.
          rendered = _render_block_content(
              block,
              table_id_by_key=table_id_by_key,
              drawing_id_by_key=drawing_id_by_key,
              equation_id_by_key=equation_id_by_key,
              asset_paths=asset_paths,
              asset_prefix=asset_prefix,
              block_drawing_path_style=block_drawing_path_style,
          )
          rendered = rendered.strip()
          if not rendered:
              # Drop empty blocks entirely — neither blocks.jsonl entry nor
              # sidecar items (the items were tied to the placeholder; if it
              # vanished, the items are orphans). This mirrors the existing
              # native_docx behaviour and ensures merged_text is contiguous.
              continue

          blockid = hashlib.md5(
              f"{doc_id}:{block_index}:{block.heading}:{rendered}".encode("utf-8")
          ).hexdigest()

          # Realize per-block sidecar item dicts now that blockid is known.
          # Defensive: an adapter that declares an item on block.tables /
          # drawings / equations but omits the matching ``{{TBL/IMG/EQ:k}}``
          # token from ``content_template`` would leave the rendered text
          # without the corresponding tag. We detect that by checking whether
          # the allocated id (which is doc-unique) appears in the rendered
          # output, warn, and skip the sidecar entry — otherwise the per-
          # modality JSON would reference a blockid whose body never names it.
          for table in block.tables:
              tb_id = table_id_by_key[table.placeholder_key]
              if tb_id not in rendered:
                  logger.warning(
                      "[sidecar] orphan table id=%s on block %d "
                      "(placeholder %r not referenced in content_template); "
                      "skipping sidecar entry",
                      tb_id,
                      block_index,
                      table.placeholder_key,
                  )
                  continue
              tables[tb_id] = _table_item_dict(tb_id, blockid, block.heading, table)

          for drawing in block.drawings:
              im_id = drawing_id_by_key[drawing.placeholder_key]
              if im_id not in rendered:
                  logger.warning(
                      "[sidecar] orphan drawing id=%s on block %d "
                      "(placeholder %r not referenced in content_template); "
                      "skipping sidecar entry",
                      im_id,
                      block_index,
                      drawing.placeholder_key,
                  )
                  continue
              drawings[im_id] = _drawing_item_dict(
                  im_id, blockid, block.heading, drawing, asset_paths, asset_prefix
              )

          for equation in block.equations:
              if not equation.is_block:
                  continue
              eq_id = equation_id_by_key[equation.placeholder_key]
              if eq_id not in rendered:
                  logger.warning(
                      "[sidecar] orphan equation id=%s on block %d "
                      "(placeholder %r not referenced in content_template); "
                      "skipping sidecar entry",
                      eq_id,
                      block_index,
                      equation.placeholder_key,
                  )
                  continue
              equations[eq_id] = _equation_item_dict(
                  eq_id, blockid, block.heading, equation
              )

          row: dict[str, Any] = {
              "type": "content",
              "blockid": blockid,
              "format": "plain_text",
              "content": rendered,
              "heading": block.heading,
              "parent_headings": list(block.parent_headings),
              "level": int(block.level),
              "session_type": block.session_type or "body",
              "table_slice": block.table_slice or "none",
              "positions": [p.to_jsonable() for p in block.positions],
          }
          if block.table_header:
              row["table_header"] = block.table_header
          blocks_lines.append(json.dumps(row, ensure_ascii=False))
          merged_parts.append(rendered)

      # Stage 3: doc-level metadata.
      merged_text = "\n\n".join(p for p in merged_parts if p.strip())
      document_hash = hashlib.sha256(merged_text.encode("utf-8")).hexdigest()
      parse_time = datetime.now(timezone.utc).isoformat()

      # ``asset_dir`` reflects "directory exists and is non-empty"; an empty
      # directory is removed (best effort) so the flag and the disk agree.
      asset_dir_present = assets_dir.exists() and any(assets_dir.iterdir())
      if not asset_dir_present and assets_dir.exists():
          try:
              assets_dir.rmdir()
          except OSError:
              pass

      meta: dict[str, Any] = {
          "type": "meta",
          "format": "lightrag",
          "version": "1.0",
          "document_name": ir.document_name,
          "document_format": ir.document_format,
          "document_hash": f"sha256:{document_hash}",
          "table_file": bool(tables),
          "equation_file": bool(equations),
          "drawing_file": bool(drawings),
          "asset_dir": asset_dir_present,
          "split_option": dict(ir.split_option or {}),
          "blocks": len(blocks_lines),
          "doc_id": doc_id,
          "parse_engine": engine,
          "parse_time": parse_time,
          "doc_title": ir.doc_title,
      }
      if ir.bbox_attributes is not None:
          meta["bbox_attributes"] = dict(ir.bbox_attributes)

      # Meta line first, then one JSON line per emitted block.
      blocks_path.write_text(
          "\n".join([json.dumps(meta, ensure_ascii=False)] + blocks_lines) + "\n",
          encoding="utf-8",
      )

      # Per-modality JSONs exist only when their dict is non-empty; a stale
      # file from a previous run is removed rather than left in place.
      _write_or_remove_sidecar_json(tables_path, "tables", tables)
      _write_or_remove_sidecar_json(drawings_path, "drawings", drawings)
      _write_or_remove_sidecar_json(equations_path, "equations", equations)

      logger.info(
          "[sidecar] wrote %d blocks for doc_id=%s "
          "(%d tables, %d drawings, %d equations, assets=%s, engine=%s)",
          len(blocks_lines),
          doc_id,
          len(tables),
          len(drawings),
          len(equations),
          asset_dir_present,
          engine,
      )

      return {
          "doc_id": doc_id,
          "file_path": ir.document_name,
          "parse_format": FULL_DOCS_FORMAT_LIGHTRAG,
          "content": merged_text,
          "blocks_path": str(blocks_path),
      }
  328. # ---------------------------------------------------------------------------
  329. # Helpers
  330. # ---------------------------------------------------------------------------
  def _materialize_assets(
      assets: list[AssetSpec],
      assets_dir: Path,
  ) -> dict[str, str]:
      """Materialize :class:`AssetSpec` objects into ``assets_dir``.

      Each spec is handled according to the type of ``spec.source``:

      - ``str`` / ``Path``: copied into ``assets_dir`` (skipped with a warning
        when the source does not exist; no copy when source and target are
        already the same file).
      - ``bytes``: written to the target file.
      - ``None``: assumed to be on disk already at the target location; only
        its presence is verified.
      - anything else: warned about and skipped.

      Collision policy: if two specs map to the same target name, the second
      gets a ``-2``, ``-3``, ... suffix on the stem. We never overwrite a file
      we've already produced.

      Containment: a target name that does not resolve to a file strictly
      inside ``assets_dir`` (``..`` components, an absolute path, an empty
      name) is warned about and skipped, so nothing is written outside the
      assets directory.

      Args:
          assets: Asset specs declared by the adapter.
          assets_dir: Directory that receives the files; created when
              ``assets`` is non-empty.

      Returns:
          ``{ref: filename_inside_assets_dir}`` for every spec that was
          materialized or verified; skipped specs are omitted.
      """
      if not assets:
          return {}
      assets_dir.mkdir(parents=True, exist_ok=True)
      assets_root = assets_dir.resolve()
      out: dict[str, str] = {}
      used_names: set[str] = set()
      for spec in assets:
          target_name = _allocate_unique_name(spec.suggested_name, used_names)
          target_path = assets_dir / target_name

          # ``resolve()`` collapses ``..`` and follows symlinks, so this also
          # rejects a pre-populated symlink that points outside the directory.
          resolved_target = target_path.resolve()
          if resolved_target == assets_root or not resolved_target.is_relative_to(
              assets_root
          ):
              logger.warning(
                  "[sidecar] asset ref=%s has unsafe target name %r "
                  "(does not resolve to a file inside %s); skipping",
                  spec.ref,
                  target_name,
                  assets_dir,
              )
              continue

          if isinstance(spec.source, (str, Path)):
              src_path = Path(spec.source)
              if not src_path.exists():
                  logger.warning(
                      "[sidecar] asset source missing for ref=%s (%s); skipping copy",
                      spec.ref,
                      src_path,
                  )
                  continue
              # Copying a file onto itself would fail; skip when already in place.
              if src_path.resolve() != resolved_target:
                  shutil.copyfile(src_path, target_path)
          elif isinstance(spec.source, bytes):
              target_path.write_bytes(spec.source)
          elif spec.source is None:
              # Assumed already on disk at the target location (native_docx
              # writes assets during extraction). Verify presence; warn if
              # missing.
              if not target_path.exists():
                  logger.warning(
                      "[sidecar] asset ref=%s declared in place but %s is absent",
                      spec.ref,
                      target_path,
                  )
                  continue
          else:
              logger.warning(
                  "[sidecar] unsupported AssetSpec.source type for ref=%s: %s",
                  spec.ref,
                  type(spec.source).__name__,
              )
              continue

          # Reserve the name only after success so a skipped spec does not
          # force a suffix onto a later spec with the same suggested name.
          used_names.add(target_name)
          out[spec.ref] = target_name
      return out
  383. def _allocate_unique_name(suggested: str, used: set[str]) -> str:
  384. """Make ``suggested`` unique within ``used``: ``foo.png`` → ``foo-2.png``."""
  385. if suggested not in used:
  386. return suggested
  387. stem = Path(suggested).stem
  388. suffix = Path(suggested).suffix
  389. n = 2
  390. while True:
  391. cand = f"{stem}-{n}{suffix}"
  392. if cand not in used:
  393. return cand
  394. n += 1
  def _render_block_content(
      block: IRBlock,
      *,
      table_id_by_key: dict[str, str],
      drawing_id_by_key: dict[str, str],
      equation_id_by_key: dict[str, str],
      asset_paths: dict[str, str],
      asset_prefix: str,
      block_drawing_path_style: str = "with_prefix",
  ) -> str:
      """Expand the placeholder tokens of ``block.content_template``.

      Builds one renderer per token kind and hands them to
      ``render_template``. A token whose key is not declared on the block
      renders as an empty string. Ids are looked up in the ``*_id_by_key``
      maps and fall back to ``""`` when absent.
      """
      table_lookup = {}
      for declared_table in block.tables:
          table_lookup[declared_table.placeholder_key] = declared_table
      drawing_lookup = {}
      for declared_drawing in block.drawings:
          drawing_lookup[declared_drawing.placeholder_key] = declared_drawing
      equation_lookup = {}
      for declared_equation in block.equations:
          equation_lookup[declared_equation.placeholder_key] = declared_equation

      def render_table(key: str) -> str:
          entry = table_lookup.get(key)
          if entry is None:
              return ""
          allocated_id = table_id_by_key.get(key, "")
          if entry.body_override is not None:
              # Verbatim block-text body — lets an adapter keep the parser's
              # original whitespace/escaping (native docx). The format label
              # still follows whether structured rows are present.
              label = "html" if entry.rows is None else "json"
              return render_table_tag(allocated_id, label, entry.body_override)
          if entry.rows is None:
              return render_table_tag(allocated_id, "html", entry.html or "")
          return render_table_tag(
              allocated_id, "json", table_body_for_rows(entry.rows)
          )

      def render_drawing(key: str) -> str:
          entry = drawing_lookup.get(key)
          if entry is None:
              return ""
          allocated_id = drawing_id_by_key.get(key, "")
          if entry.path_override is not None:
              # Verbatim external/linked reference — pass through unchanged.
              resolved = entry.path_override
          else:
              stored_name = asset_paths.get(entry.asset_ref, "")
              if not stored_name:
                  resolved = ""
              else:
                  bare = block_drawing_path_style == "basename_only"
                  resolved = stored_name if bare else f"{asset_prefix}{stored_name}"
          return render_drawing_tag(
              allocated_id,
              entry.fmt,
              entry.caption,
              resolved,
              entry.src,
          )

      def render_block_equation(key: str) -> str:
          entry = equation_lookup.get(key)
          if entry is None:
              return ""
          # Adapter mistake guard: an EQ token that points at a non-block
          # equation is rendered like an inline one (no id) rather than
          # leaving a dangling token.
          allocated_id = equation_id_by_key.get(key, "") if entry.is_block else None
          return render_equation_tag(allocated_id, entry.latex, entry.caption)

      def render_inline_equation(key: str) -> str:
          entry = equation_lookup.get(key)
          if entry is None:
              return ""
          return render_equation_tag(None, entry.latex, entry.caption)

      return render_template(
          block.content_template,
          table_renderer=render_table,
          drawing_renderer=render_drawing,
          equation_renderer=render_block_equation,
          inline_equation_renderer=render_inline_equation,
      )
  470. def _table_item_dict(
  471. table_id: str,
  472. blockid: str,
  473. heading: str,
  474. table: IRTable,
  475. ) -> dict[str, Any]:
  476. if table.rows is not None:
  477. fmt = "json"
  478. content = table_body_for_rows(table.rows)
  479. else:
  480. fmt = "html"
  481. content = table.html or ""
  482. item: dict[str, Any] = {
  483. "id": table_id,
  484. "blockid": blockid,
  485. "heading": heading,
  486. "dimension": [int(table.num_rows), int(table.num_cols)],
  487. "format": fmt,
  488. "content": content,
  489. "caption": table.caption,
  490. "footnotes": list(table.footnotes),
  491. }
  492. if table.table_header is not None:
  493. # Spec §5: stored as JSON string.
  494. item["table_header"] = json.dumps(table.table_header, ensure_ascii=False)
  495. if table.self_ref:
  496. item["self_ref"] = table.self_ref
  497. if table.extras:
  498. item["extras"] = dict(table.extras)
  499. return item
  500. def _drawing_item_dict(
  501. drawing_id: str,
  502. blockid: str,
  503. heading: str,
  504. drawing: IRDrawing,
  505. asset_paths: dict[str, str],
  506. asset_prefix: str,
  507. ) -> dict[str, Any]:
  508. if drawing.path_override is not None:
  509. path = drawing.path_override
  510. else:
  511. filename = asset_paths.get(drawing.asset_ref, "")
  512. path = f"{asset_prefix}{filename}" if filename else ""
  513. item: dict[str, Any] = {
  514. "id": drawing_id,
  515. "blockid": blockid,
  516. "heading": heading,
  517. "format": drawing.fmt,
  518. "path": path,
  519. "src": drawing.src,
  520. "caption": drawing.caption,
  521. "footnotes": list(drawing.footnotes),
  522. }
  523. if drawing.self_ref:
  524. item["self_ref"] = drawing.self_ref
  525. if drawing.extras:
  526. item["extras"] = dict(drawing.extras)
  527. return item
  528. _LATEX_DOLLAR_RE = re.compile(r"^\s*\$\$?(.+?)\$\$?\s*$", re.DOTALL)
  529. def _strip_latex_dollar_wrappers(latex: str) -> str:
  530. """Strip leading/trailing ``$``/``$$`` wrappers from a latex string.
  531. ``equations.json`` stores clean latex (per the MinerU adapter contract:
  532. ``blocks.jsonl`` keeps the parser's raw form so the rendered
  533. ``<equation>`` body is byte-identical to the source, while the
  534. per-equation sidecar carries delimiter-free latex). Leaves strings
  535. without wrappers untouched.
  536. """
  537. if not latex:
  538. return latex
  539. m = _LATEX_DOLLAR_RE.match(latex)
  540. return m.group(1).strip() if m else latex.strip()
  def _equation_item_dict(
      eq_id: str,
      blockid: str,
      heading: str,
      equation: IREquation,
  ) -> dict[str, Any]:
      """Build the ``equations.json`` entry for one block equation.

      ``content`` is the equation's latex passed through
      ``_strip_latex_dollar_wrappers``; ``format`` is always ``"latex"``.
      ``self_ref`` and ``extras`` are added only when set.
      """
      clean_latex = _strip_latex_dollar_wrappers(equation.latex)
      record: dict[str, Any] = {
          "id": eq_id,
          "blockid": blockid,
          "heading": heading,
          "format": "latex",
          "content": clean_latex,
          "caption": equation.caption,
          "footnotes": list(equation.footnotes),
      }
      reference = equation.self_ref
      if reference:
          record["self_ref"] = reference
      extra_fields = equation.extras
      if extra_fields:
          record["extras"] = dict(extra_fields)
      return record