test_native_docx_extract.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. """Unit tests for native docx tracked-change / comment / empty-table handling.
  2. Locks in the contract that:
  3. - ``w:ins`` content survives (final revised text),
  4. - ``w:del`` / ``w:moveFrom`` content is dropped,
  5. - ``w:commentRangeStart`` / ``w:commentRangeEnd`` / ``w:commentReference`` /
  6. ``w:annotationRef`` markers are dropped,
  7. - tables whose every cell is whitespace-only are omitted from the parser
  8. output (no ``<table>`` placeholder, so no IRTable downstream).
  9. These were previously emergent properties (skip lists in two places + a
  10. run-level white-list); regressions now fail loudly.
  11. """
  12. from __future__ import annotations
  13. from io import BytesIO
  14. import pytest
  15. from docx import Document
  16. from docx.oxml.ns import qn
  17. from lxml import etree
  18. from lightrag.parser.docx.parse_document import (
  19. extract_docx_blocks,
  20. extract_paragraph_content,
  21. )
  22. from lightrag.parser.docx.table_extractor import extract_paragraph_content_table
  23. W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
  24. PARAGRAPH_NS = {
  25. "w": W_NS,
  26. "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
  27. "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
  28. }
  29. def _p(inner_xml: str):
  30. """Build a ``<w:p>`` lxml element from inner OOXML."""
  31. return etree.fromstring(f'<w:p xmlns:w="{W_NS}">{inner_xml}</w:p>'.encode("utf-8"))
  32. # --- paragraph walker (parse_document.extract_paragraph_content) -----------
  33. @pytest.mark.offline
  34. def test_paragraph_keeps_w_ins_content() -> None:
  35. para = _p("<w:ins><w:r><w:t>kept</w:t></w:r></w:ins>")
  36. assert extract_paragraph_content(para, PARAGRAPH_NS) == "kept"
  37. @pytest.mark.offline
  38. def test_paragraph_drops_w_del_content() -> None:
  39. para = _p("<w:del><w:r><w:t>removed</w:t></w:r></w:del>")
  40. assert extract_paragraph_content(para, PARAGRAPH_NS) == ""
  41. @pytest.mark.offline
  42. def test_paragraph_drops_w_movefrom_content() -> None:
  43. para = _p("<w:moveFrom><w:r><w:t>moved away</w:t></w:r></w:moveFrom>")
  44. assert extract_paragraph_content(para, PARAGRAPH_NS) == ""
  45. @pytest.mark.offline
  46. def test_paragraph_drops_comment_markers_but_keeps_surrounding_text() -> None:
  47. para = _p(
  48. '<w:commentRangeStart w:id="0"/>'
  49. "<w:r><w:t>visible</w:t></w:r>"
  50. '<w:commentRangeEnd w:id="0"/>'
  51. "<w:r>"
  52. ' <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>'
  53. ' <w:commentReference w:id="0"/>'
  54. "</w:r>"
  55. )
  56. assert extract_paragraph_content(para, PARAGRAPH_NS) == "visible"
  57. # --- table-cell walker (table_extractor.extract_paragraph_content_table) ---
  58. @pytest.mark.offline
  59. def test_table_cell_keeps_w_ins_content() -> None:
  60. para = _p("<w:ins><w:r><w:t>kept</w:t></w:r></w:ins>")
  61. assert extract_paragraph_content_table(para, qn) == "kept"
  62. @pytest.mark.offline
  63. def test_table_cell_drops_w_del_content() -> None:
  64. para = _p("<w:del><w:r><w:t>removed</w:t></w:r></w:del>")
  65. assert extract_paragraph_content_table(para, qn) == ""
  66. @pytest.mark.offline
  67. def test_table_cell_drops_comment_markers() -> None:
  68. para = _p(
  69. '<w:commentRangeStart w:id="0"/>'
  70. "<w:r><w:t>cell-text</w:t></w:r>"
  71. '<w:commentRangeEnd w:id="0"/>'
  72. )
  73. assert extract_paragraph_content_table(para, qn) == "cell-text"
  74. # --- end-to-end: empty tables vanish from blocks output --------------------
  75. def _populate_cell(cell, text: str) -> None:
  76. cell.paragraphs[0].text = text
  77. def _build_docx_with_three_tables() -> BytesIO:
  78. """One real table, one all-empty table, one whitespace-only table.
  79. Returns a BytesIO containing the rendered .docx so it can be fed to
  80. ``extract_docx_blocks`` via a tempfile-backed path or directly.
  81. """
  82. doc = Document()
  83. doc.add_paragraph("intro")
  84. real = doc.add_table(rows=1, cols=2)
  85. _populate_cell(real.rows[0].cells[0], "A")
  86. _populate_cell(real.rows[0].cells[1], "B")
  87. empty = doc.add_table(rows=2, cols=2)
  88. # leave every cell at python-docx default (empty paragraph)
  89. assert all(c.text == "" for row in empty.rows for c in row.cells)
  90. whitespace = doc.add_table(rows=1, cols=2)
  91. _populate_cell(whitespace.rows[0].cells[0], " ")
  92. _populate_cell(whitespace.rows[0].cells[1], "\t")
  93. buf = BytesIO()
  94. doc.save(buf)
  95. buf.seek(0)
  96. return buf
  97. @pytest.mark.offline
  98. def test_empty_tables_are_skipped(tmp_path) -> None:
  99. docx_bytes = _build_docx_with_three_tables()
  100. docx_path = tmp_path / "three_tables.docx"
  101. docx_path.write_bytes(docx_bytes.getvalue())
  102. blocks = extract_docx_blocks(str(docx_path))
  103. table_placeholders = sum(b["content"].count("<table>") for b in blocks)
  104. assert table_placeholders == 1, (
  105. "exactly one real table should survive; empty + whitespace tables "
  106. "must be dropped before the placeholder is emitted"
  107. )
  108. assert any('"A"' in b["content"] and '"B"' in b["content"] for b in blocks)