placeholders.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. """Placeholder token rendering for spec-shaped multimodal tags.
  2. Adapters populate :attr:`IRBlock.content_template` with ``{{TBL:k}}``,
  3. ``{{IMG:k}}``, ``{{EQ:k}}`` and ``{{EQI:k}}`` tokens. The writer assigns
  4. ``tb-`` / ``im-`` / ``eq-`` ids, then calls :func:`render_template` to
  5. substitute the spec-shaped XML-style tags described in
  6. ``LightRAGSidecarFormat-zh.md`` §3.3.
  7. """
  8. from __future__ import annotations
  9. import json
  10. import re
  11. from typing import Callable
# Matches a single placeholder token such as ``{{TBL:k}}``. Group 1 captures
# the kind (TBL, IMG, EQ or EQI); group 2 captures the key, which may contain
# ASCII letters, digits, underscores and hyphens.
_TOKEN_RE = re.compile(r"\{\{(TBL|IMG|EQ|EQI):([A-Za-z0-9_\-]+)\}\}")
  13. def xml_attr_escape(value: str) -> str:
  14. """Escape an attribute value for an XML-style tag attribute."""
  15. return (
  16. str(value)
  17. .replace("&", "&")
  18. .replace("<", "&lt;")
  19. .replace(">", "&gt;")
  20. .replace('"', "&quot;")
  21. )
  22. def caption_attr(caption: str) -> str:
  23. """Render a leading-space ``caption="..."`` attribute; empty when absent.
  24. Matches the existing native_docx adapter convention exactly so consumers
  25. that grep for ``caption="``-prefixed substrings keep working.
  26. """
  27. return f' caption="{xml_attr_escape(caption)}"' if caption else ""
  28. def render_table_tag(table_id: str, fmt: str, body: str) -> str:
  29. """``<table id="tb-..." format="json|html">body</table>`` per spec §3.3.
  30. ``body`` is the table content; for ``json`` it is the JSON array, for
  31. ``html`` it is the raw ``<table>...</table>`` HTML inside (the outer
  32. wrapper is added here).
  33. """
  34. return (
  35. f'<table id="{xml_attr_escape(table_id)}" '
  36. f'format="{xml_attr_escape(fmt)}">{body}</table>'
  37. )
  38. def render_drawing_tag(
  39. drawing_id: str,
  40. fmt: str,
  41. caption: str,
  42. path: str,
  43. src: str,
  44. ) -> str:
  45. """``<drawing id="im-..." format="..." caption="..." path="..." src="..." />``."""
  46. return (
  47. f'<drawing id="{xml_attr_escape(drawing_id)}" '
  48. f'format="{xml_attr_escape(fmt)}"'
  49. f"{caption_attr(caption)} "
  50. f'path="{xml_attr_escape(path)}" '
  51. f'src="{xml_attr_escape(src)}" />'
  52. )
  53. def render_equation_tag(
  54. eq_id: str | None,
  55. latex: str,
  56. caption: str = "",
  57. ) -> str:
  58. """Block equation: ``<equation id="eq-..." format="latex" caption="...">latex</equation>``.
  59. Inline equation (``eq_id is None``): ``<equation format="latex">latex</equation>``
  60. — no id, never written to ``equations.json``. Caption is preserved for
  61. both forms (spec §3.3 allows ``caption`` on ``<equation>``).
  62. """
  63. if eq_id is None:
  64. return f'<equation format="latex"{caption_attr(caption)}>{latex}</equation>'
  65. return (
  66. f'<equation id="{xml_attr_escape(eq_id)}" '
  67. f'format="latex"{caption_attr(caption)}>{latex}</equation>'
  68. )
  69. def render_template(
  70. template: str,
  71. *,
  72. table_renderer: Callable[[str], str],
  73. drawing_renderer: Callable[[str], str],
  74. equation_renderer: Callable[[str], str],
  75. inline_equation_renderer: Callable[[str], str],
  76. ) -> str:
  77. """Replace ``{{TBL:k}}`` / ``{{IMG:k}}`` / ``{{EQ:k}}`` / ``{{EQI:k}}``.
  78. Each renderer takes the placeholder *key* (the ``k`` portion) and returns
  79. the rendered XML-style tag.
  80. """
  81. def _replace(match: "re.Match[str]") -> str:
  82. kind, key = match.group(1), match.group(2)
  83. if kind == "TBL":
  84. return table_renderer(key)
  85. if kind == "IMG":
  86. return drawing_renderer(key)
  87. if kind == "EQ":
  88. return equation_renderer(key)
  89. return inline_equation_renderer(key)
  90. return _TOKEN_RE.sub(_replace, template)
  91. def table_body_for_rows(rows: list[list[str]]) -> str:
  92. """Encode rows as the JSON body that lives inside ``<table format="json">``."""
  93. return json.dumps(rows, ensure_ascii=False)