# Source file: _native_docx_fixtures.py (11 KB as captured)
"""Scenario fixtures for the native docx → SidecarWriter migration tests.

Each scenario describes:

- ``blocks`` — what ``extract_docx_blocks`` would return (the synthetic
  block dicts that the adapter consumes).
- ``parse_metadata`` — the dict the upstream parser fills in (only
  ``first_heading`` is currently consumed by the adapter).
- ``assets`` — files the upstream extractor would have written into
  ``<base>.blocks.assets/`` before the IR builder runs. Maps relative names
  inside the asset dir → byte content.
- ``doc_id`` — fixed so blockid + sidecar ids are deterministic.
- ``file_path`` — used for canonical basename / doc_title fallback.

The captured outputs (``blocks.jsonl`` + per-modality JSONs + assets) live
under ``tests/parser/docx/golden/native_docx/<scenario>/``. The
production path (``LightRAG.parse_native``) must produce output that is
byte-identical to those fixtures; the regen script under ``scripts/``
rewrites them when the format intentionally changes.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


  21. def _block(
  22. content: str,
  23. *,
  24. heading: str = "",
  25. level: int = 0,
  26. parent: list[str] | None = None,
  27. uuid: str = "p1",
  28. uuid_end: str | None = None,
  29. table_headers: list[Any] | None = None,
  30. table_chunk_role: str = "none",
  31. ) -> dict[str, Any]:
  32. """Build a synthetic block matching ``extract_docx_blocks`` output."""
  33. out: dict[str, Any] = {
  34. "uuid": uuid,
  35. "uuid_end": uuid_end if uuid_end is not None else uuid,
  36. "heading": heading,
  37. "content": content,
  38. "type": "text",
  39. "parent_headings": list(parent or []),
  40. "level": level,
  41. "table_chunk_role": table_chunk_role,
  42. }
  43. if table_headers is not None:
  44. out["table_headers"] = table_headers
  45. return out
  46. @dataclass
  47. class Scenario:
  48. name: str
  49. doc_id: str
  50. file_path: str # canonical-ish; what the pipeline would pass
  51. blocks: list[dict[str, Any]]
  52. parse_metadata: dict[str, Any] = field(default_factory=dict)
  53. assets: dict[str, bytes] = field(default_factory=dict)
  54. SCENARIOS: list[Scenario] = [
  55. # --- 1: text-only, multi-heading -----------------------------------
  56. Scenario(
  57. name="text_only_hierarchy",
  58. doc_id="doc-aaaa111122223333aaaa111122223333",
  59. file_path="paper.docx",
  60. parse_metadata={"first_heading": "Introduction"},
  61. blocks=[
  62. _block(
  63. "Introduction",
  64. heading="Introduction",
  65. level=1,
  66. uuid="h1",
  67. ),
  68. _block(
  69. "Body paragraph one.",
  70. heading="Introduction",
  71. level=1,
  72. uuid="p1",
  73. uuid_end="p2",
  74. ),
  75. _block(
  76. "Background",
  77. heading="Background",
  78. level=2,
  79. parent=["Introduction"],
  80. uuid="h2",
  81. ),
  82. _block(
  83. "Sub body.",
  84. heading="Background",
  85. level=2,
  86. parent=["Introduction"],
  87. uuid="p3",
  88. ),
  89. ],
  90. ),
  91. # --- 2: block + inline equations -----------------------------------
  92. Scenario(
  93. name="equations_block_and_inline",
  94. doc_id="doc-bbbb222233334444bbbb222233334444",
  95. file_path="formulas.docx",
  96. parse_metadata={"first_heading": "Equations"},
  97. blocks=[
  98. _block(
  99. "Equations",
  100. heading="Equations",
  101. level=1,
  102. uuid="h1",
  103. ),
  104. _block(
  105. # Inline equation (no surrounding \n on either side)
  106. "Energy is <equation>E=mc^2</equation> per Einstein.",
  107. heading="Equations",
  108. level=1,
  109. uuid="p1",
  110. ),
  111. _block(
  112. # Block equation (wedged between newlines)
  113. "Consider:\n<equation>x^2 + y^2 = r^2</equation>\nThe circle equation.",
  114. heading="Equations",
  115. level=1,
  116. uuid="p2",
  117. ),
  118. _block(
  119. # Block at content edge (start == 0)
  120. "<equation>a + b = c</equation>\ntext after",
  121. heading="Equations",
  122. level=1,
  123. uuid="p3",
  124. ),
  125. ],
  126. ),
  127. # --- 3: tables with and without table_headers ----------------------
  128. Scenario(
  129. name="tables_mixed",
  130. doc_id="doc-cccc333344445555cccc333344445555",
  131. file_path="report.docx",
  132. parse_metadata={"first_heading": "Report"},
  133. blocks=[
  134. _block(
  135. "Report",
  136. heading="Report",
  137. level=1,
  138. uuid="h1",
  139. ),
  140. _block(
  141. # Table with table_headers (cross-page repeating)
  142. 'See table:\n<table>[["X","Y"],["1","2"],["3","4"]]</table>',
  143. heading="Report",
  144. level=1,
  145. uuid="t1",
  146. table_headers=[[["X", "Y"]]], # one table, one header row
  147. ),
  148. _block(
  149. # Table without table_headers
  150. 'Plain table:\n<table>[["a","b"]]</table>',
  151. heading="Report",
  152. level=1,
  153. uuid="t2",
  154. ),
  155. _block(
  156. # Two tables in one block
  157. '<table>[["p"]]</table>\nthen\n<table>[["q","r"],["s","t"]]</table>',
  158. heading="Report",
  159. level=1,
  160. uuid="t3",
  161. table_headers=[None, [["q", "r"]]],
  162. ),
  163. ],
  164. ),
  165. # --- 4: drawings + assets ------------------------------------------
  166. Scenario(
  167. name="drawings_with_assets",
  168. doc_id="doc-dddd444455556666dddd444455556666",
  169. file_path="diagrams.docx",
  170. parse_metadata={"first_heading": "Diagrams"},
  171. assets={
  172. "fig1.png": b"\x89PNG\r\n\x1a\n-fig1-fake",
  173. "fig2.jpg": b"\xff\xd8\xff\xe0-fig2-fake",
  174. },
  175. blocks=[
  176. _block(
  177. "Diagrams",
  178. heading="Diagrams",
  179. level=1,
  180. uuid="h1",
  181. ),
  182. _block(
  183. "Figure one:\n"
  184. '<drawing id="x" format="png" '
  185. 'path="diagrams.blocks.assets/fig1.png" '
  186. 'src="docx://image1" />\n'
  187. "Figure two:\n"
  188. '<drawing id="y" format="jpg" '
  189. 'path="diagrams.blocks.assets/fig2.jpg" '
  190. 'src="docx://image2" />',
  191. heading="Diagrams",
  192. level=1,
  193. uuid="p1",
  194. ),
  195. ],
  196. ),
  197. # --- 5: all modalities mixed ---------------------------------------
  198. Scenario(
  199. name="all_modalities",
  200. doc_id="doc-eeee555566667777eeee555566667777",
  201. file_path="combo.docx",
  202. parse_metadata={"first_heading": "Combined"},
  203. assets={"pic.png": b"PNG-combo"},
  204. blocks=[
  205. _block(
  206. "Combined",
  207. heading="Combined",
  208. level=1,
  209. uuid="h1",
  210. ),
  211. _block(
  212. "Look at this figure:\n"
  213. '<drawing id="z" format="png" '
  214. 'path="combo.blocks.assets/pic.png" '
  215. 'src="docx://img" />\n'
  216. "Plus a table:\n"
  217. '<table>[["α","β"],["γ","δ"]]</table>\n'
  218. "And a block equation:\n"
  219. "<equation>F = ma</equation>\n"
  220. "And an inline <equation>v=d/t</equation> here.",
  221. heading="Combined",
  222. level=1,
  223. uuid="p1",
  224. ),
  225. ],
  226. ),
  227. # --- 6: empty block dropped ----------------------------------------
  228. Scenario(
  229. name="empty_block_dropped",
  230. doc_id="doc-ffff666677778888ffff666677778888",
  231. file_path="sparse.docx",
  232. parse_metadata={"first_heading": "Sparse"},
  233. blocks=[
  234. _block(
  235. "Sparse",
  236. heading="Sparse",
  237. level=1,
  238. uuid="h1",
  239. ),
  240. _block(
  241. " \n ", # strips to empty — must be dropped
  242. heading="Sparse",
  243. level=1,
  244. uuid="p_empty",
  245. ),
  246. _block(
  247. "Real content after empty.",
  248. heading="Sparse",
  249. level=1,
  250. uuid="p_real",
  251. ),
  252. ],
  253. ),
  254. # --- 7: external / linked image references ------------------------
  255. # DOCX can carry ``<a:blip r:link="rId…"/>`` references to image
  256. # targets that live outside the package — the upstream extractor
  257. # then emits ``<drawing path="<external URL or unresolved path>" />``
  258. # WITHOUT writing bytes into ``<base>.blocks.assets/``. The adapter
  259. # must pass those paths through verbatim (both in ``blocks.jsonl``
  260. # and ``drawings.json``); turning them into AssetSpecs with
  261. # ``source=None`` would make the writer warn-and-skip → ``path=""``,
  262. # losing the only reference downstream consumers have.
  263. Scenario(
  264. name="external_image_link",
  265. doc_id="doc-1111aaaa2222bbbb1111aaaa2222bbbb",
  266. file_path="linked.docx",
  267. parse_metadata={"first_heading": "Linked"},
  268. # No on-disk assets — the path points elsewhere.
  269. assets={},
  270. blocks=[
  271. _block(
  272. "Linked",
  273. heading="Linked",
  274. level=1,
  275. uuid="h1",
  276. ),
  277. _block(
  278. "See the diagram online:\n"
  279. '<drawing id="z" format="png" '
  280. 'path="https://example.com/diagrams/architecture.png" '
  281. 'src="docx://external" />\n'
  282. "And a relative-but-not-asset path:\n"
  283. '<drawing id="z2" format="gif" '
  284. 'path="../images/legacy.gif" '
  285. 'src="docx://legacy" />',
  286. heading="Linked",
  287. level=1,
  288. uuid="p1",
  289. ),
  290. ],
  291. ),
  292. # --- 8: missing paraid ---------------------------------------------
  293. Scenario(
  294. name="missing_paraid",
  295. doc_id="doc-99990000111122229999000011112222",
  296. file_path="legacy.docx",
  297. parse_metadata={"first_heading": ""}, # no headings at all
  298. blocks=[
  299. _block(
  300. "Just plain text without a heading.",
  301. heading="",
  302. level=0,
  303. uuid="", # missing
  304. uuid_end="",
  305. ),
  306. _block(
  307. "Another paragraph with no paraId.",
  308. heading="",
  309. level=0,
  310. uuid="",
  311. uuid_end="",
  312. ),
  313. ],
  314. ),
  315. ]
  316. __all__ = ["Scenario", "SCENARIOS", "_block"]