wxcz_admin
/
lightrag-cn-git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
							"""Unit tests for native docx tracked-change / comment / empty-table handling.

Locks in the contract that:
- ``w:ins`` content survives (final revised text),
- ``w:del`` / ``w:moveFrom`` content is dropped,
- ``w:commentRangeStart`` / ``w:commentRangeEnd`` / ``w:commentReference`` /
  ``w:annotationRef`` markers are dropped,
- tables whose every cell is whitespace-only are omitted from the parser
  output (no ``<table>`` placeholder, so no IRTable downstream).

These were previously emergent properties (skip lists in two places + a
run-level white-list); regressions now fail loudly.
"""

from __future__ import annotations

from io import BytesIO

import pytest
from docx import Document
from docx.oxml.ns import qn
from lxml import etree

from lightrag.parser.docx.parse_document import (
    extract_docx_blocks,
    extract_paragraph_content,
)
from lightrag.parser.docx.table_extractor import extract_paragraph_content_table

W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
PARAGRAPH_NS = {
    "w": W_NS,
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
}


def _p(inner_xml: str):
    """Build a ``<w:p>`` lxml element from inner OOXML."""
    return etree.fromstring(f'<w:p xmlns:w="{W_NS}">{inner_xml}</w:p>'.encode("utf-8"))


# --- paragraph walker (parse_document.extract_paragraph_content) -----------


@pytest.mark.offline
def test_paragraph_keeps_w_ins_content() -> None:
    para = _p("<w:ins><w:r><w:t>kept</w:t></w:r></w:ins>")

    assert extract_paragraph_content(para, PARAGRAPH_NS) == "kept"


@pytest.mark.offline
def test_paragraph_drops_w_del_content() -> None:
    para = _p("<w:del><w:r><w:t>removed</w:t></w:r></w:del>")

    assert extract_paragraph_content(para, PARAGRAPH_NS) == ""


@pytest.mark.offline
def test_paragraph_drops_w_movefrom_content() -> None:
    para = _p("<w:moveFrom><w:r><w:t>moved away</w:t></w:r></w:moveFrom>")

    assert extract_paragraph_content(para, PARAGRAPH_NS) == ""


@pytest.mark.offline
def test_paragraph_drops_comment_markers_but_keeps_surrounding_text() -> None:
    para = _p(
        '<w:commentRangeStart w:id="0"/>'
        "<w:r><w:t>visible</w:t></w:r>"
        '<w:commentRangeEnd w:id="0"/>'
        "<w:r>"
        '  <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>'
        '  <w:commentReference w:id="0"/>'
        "</w:r>"
    )

    assert extract_paragraph_content(para, PARAGRAPH_NS) == "visible"


# --- table-cell walker (table_extractor.extract_paragraph_content_table) ---


@pytest.mark.offline
def test_table_cell_keeps_w_ins_content() -> None:
    para = _p("<w:ins><w:r><w:t>kept</w:t></w:r></w:ins>")

    assert extract_paragraph_content_table(para, qn) == "kept"


@pytest.mark.offline
def test_table_cell_drops_w_del_content() -> None:
    para = _p("<w:del><w:r><w:t>removed</w:t></w:r></w:del>")

    assert extract_paragraph_content_table(para, qn) == ""


@pytest.mark.offline
def test_table_cell_drops_comment_markers() -> None:
    para = _p(
        '<w:commentRangeStart w:id="0"/>'
        "<w:r><w:t>cell-text</w:t></w:r>"
        '<w:commentRangeEnd w:id="0"/>'
    )

    assert extract_paragraph_content_table(para, qn) == "cell-text"


# --- end-to-end: empty tables vanish from blocks output --------------------


def _populate_cell(cell, text: str) -> None:
    cell.paragraphs[0].text = text


def _build_docx_with_three_tables() -> BytesIO:
    """One real table, one all-empty table, one whitespace-only table.

    Returns a BytesIO containing the rendered .docx so it can be fed to
    ``extract_docx_blocks`` via a tempfile-backed path or directly.
    """
    doc = Document()
    doc.add_paragraph("intro")

    real = doc.add_table(rows=1, cols=2)
    _populate_cell(real.rows[0].cells[0], "A")
    _populate_cell(real.rows[0].cells[1], "B")

    empty = doc.add_table(rows=2, cols=2)
    # leave every cell at python-docx default (empty paragraph)
    assert all(c.text == "" for row in empty.rows for c in row.cells)

    whitespace = doc.add_table(rows=1, cols=2)
    _populate_cell(whitespace.rows[0].cells[0], "   ")
    _populate_cell(whitespace.rows[0].cells[1], "\t")

    buf = BytesIO()
    doc.save(buf)
    buf.seek(0)
    return buf


@pytest.mark.offline
def test_empty_tables_are_skipped(tmp_path) -> None:
    docx_bytes = _build_docx_with_three_tables()
    docx_path = tmp_path / "three_tables.docx"
    docx_path.write_bytes(docx_bytes.getvalue())

    blocks = extract_docx_blocks(str(docx_path))

    table_placeholders = sum(b["content"].count("<table>") for b in blocks)
    assert table_placeholders == 1, (
        "exactly one real table should survive; empty + whitespace tables "
        "must be dropped before the placeholder is emitted"
    )
    assert any('"A"' in b["content"] and '"B"' in b["content"] for b in blocks)