"""Unit tests for native docx tracked-change / comment / empty-table handling.
Locks in the contract that:
- ``w:ins`` content survives (final revised text),
- ``w:del`` / ``w:moveFrom`` content is dropped,
- ``w:commentRangeStart`` / ``w:commentRangeEnd`` / ``w:commentReference`` /
``w:annotationRef`` markers are dropped,
- tables whose every cell is whitespace-only are omitted from the parser
output (no ``
`` placeholder, so no IRTable downstream).
These were previously emergent properties (skip lists in two places + a
run-level white-list); regressions now fail loudly.
"""
from __future__ import annotations
from io import BytesIO
import pytest
from docx import Document
from docx.oxml.ns import qn
from lxml import etree
from lightrag.parser.docx.parse_document import (
extract_docx_blocks,
extract_paragraph_content,
)
from lightrag.parser.docx.table_extractor import extract_paragraph_content_table
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
PARAGRAPH_NS = {
"w": W_NS,
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
}
def _p(inner_xml: str):
"""Build a ```` lxml element from inner OOXML."""
return etree.fromstring(f'{inner_xml}'.encode("utf-8"))
# --- paragraph walker (parse_document.extract_paragraph_content) -----------
@pytest.mark.offline
def test_paragraph_keeps_w_ins_content() -> None:
para = _p("kept")
assert extract_paragraph_content(para, PARAGRAPH_NS) == "kept"
@pytest.mark.offline
def test_paragraph_drops_w_del_content() -> None:
para = _p("removed")
assert extract_paragraph_content(para, PARAGRAPH_NS) == ""
@pytest.mark.offline
def test_paragraph_drops_w_movefrom_content() -> None:
para = _p("moved away")
assert extract_paragraph_content(para, PARAGRAPH_NS) == ""
@pytest.mark.offline
def test_paragraph_drops_comment_markers_but_keeps_surrounding_text() -> None:
para = _p(
''
"visible"
''
""
' '
' '
""
)
assert extract_paragraph_content(para, PARAGRAPH_NS) == "visible"
# --- table-cell walker (table_extractor.extract_paragraph_content_table) ---
@pytest.mark.offline
def test_table_cell_keeps_w_ins_content() -> None:
para = _p("kept")
assert extract_paragraph_content_table(para, qn) == "kept"
@pytest.mark.offline
def test_table_cell_drops_w_del_content() -> None:
para = _p("removed")
assert extract_paragraph_content_table(para, qn) == ""
@pytest.mark.offline
def test_table_cell_drops_comment_markers() -> None:
para = _p(
''
"cell-text"
''
)
assert extract_paragraph_content_table(para, qn) == "cell-text"
# --- end-to-end: empty tables vanish from blocks output --------------------
def _populate_cell(cell, text: str) -> None:
cell.paragraphs[0].text = text
def _build_docx_with_three_tables() -> BytesIO:
"""One real table, one all-empty table, one whitespace-only table.
Returns a BytesIO containing the rendered .docx so it can be fed to
``extract_docx_blocks`` via a tempfile-backed path or directly.
"""
doc = Document()
doc.add_paragraph("intro")
real = doc.add_table(rows=1, cols=2)
_populate_cell(real.rows[0].cells[0], "A")
_populate_cell(real.rows[0].cells[1], "B")
empty = doc.add_table(rows=2, cols=2)
# leave every cell at python-docx default (empty paragraph)
assert all(c.text == "" for row in empty.rows for c in row.cells)
whitespace = doc.add_table(rows=1, cols=2)
_populate_cell(whitespace.rows[0].cells[0], " ")
_populate_cell(whitespace.rows[0].cells[1], "\t")
buf = BytesIO()
doc.save(buf)
buf.seek(0)
return buf
@pytest.mark.offline
def test_empty_tables_are_skipped(tmp_path) -> None:
docx_bytes = _build_docx_with_three_tables()
docx_path = tmp_path / "three_tables.docx"
docx_path.write_bytes(docx_bytes.getvalue())
blocks = extract_docx_blocks(str(docx_path))
table_placeholders = sum(b["content"].count("") for b in blocks)
assert table_placeholders == 1, (
"exactly one real table should survive; empty + whitespace tables "
"must be dropped before the placeholder is emitted"
)
assert any('"A"' in b["content"] and '"B"' in b["content"] for b in blocks)