wxcz_admin
/
lightrag-cn-git-d5efd3


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
							"""Cross-filename content_hash dedup via merged_text normalization.

Sidecar-rendered bodies embed ``tb-<doc_hash>-NNNN`` / ``im-<doc_hash>-NNNN`` /
``eq-<doc_hash>-NNNN`` ids and ``path="<base>.blocks.assets/..."`` asset
references — both derive from the filename for pending_parse uploads.
``compute_text_content_hash`` normalizes those surfaces before hashing so
the same content under two filenames produces the same ``content_hash``
and post-parse dedup fires.
"""

from __future__ import annotations

from lightrag.utils_pipeline import (
    compute_text_content_hash,
    normalize_merged_text_for_hash,
)


def _render(doc_hash: str, base_name: str) -> str:
    """Approximate the sidecar writer's merged_text for a doc with one table,
    one drawing, and one block equation."""
    return (
        "标题 1\n\n"
        f'正文段落引用表格 <table id="tb-{doc_hash}-0001" format="json">[[]]</table>。\n\n'
        f'<drawing id="im-{doc_hash}-0001" format="png" '
        f'path="{base_name}.blocks.assets/image1.png" src="rId4" />\n\n'
        f'<equation id="eq-{doc_hash}-0001" format="latex">E=mc^2</equation>'
    )


def test_same_content_different_filename_dedupes():
    """Same merged_text with two different doc_hash / base names hashes to
    the same content_hash."""
    text_a = _render("a" * 32, "report-A")
    text_b = _render("b" * 32, "report-B")

    assert text_a != text_b, "sanity: raw bodies must differ"
    assert compute_text_content_hash(text_a) == compute_text_content_hash(text_b)


def test_different_content_still_distinguishes():
    """Distinct bodies (different block text) still produce distinct hashes
    after normalization."""
    text_a = _render("a" * 32, "doc-A") + "\n\n附加段落 X"
    text_b = _render("a" * 32, "doc-A") + "\n\n附加段落 Y"

    assert compute_text_content_hash(text_a) != compute_text_content_hash(text_b)


def test_plain_text_unaffected():
    """RAW text without sidecar markup is passed through unchanged so its
    hash matches the legacy ``MD5(text)`` value."""
    plain = "纯文本上传，没有任何 sidecar 标签。"
    assert normalize_merged_text_for_hash(plain) == plain


def test_asset_filename_still_distinguishes():
    """The asset filename suffix is preserved — only the ``<base>.blocks.assets/``
    prefix is stripped — so two drawings pointing at different images still
    yield different hashes."""
    h = "c" * 32
    text_a = (
        f'<drawing id="im-{h}-0001" format="png" '
        f'path="doc.blocks.assets/image1.png" src="r1" />'
    )
    text_b = (
        f'<drawing id="im-{h}-0001" format="png" '
        f'path="doc.blocks.assets/image2.png" src="r1" />'
    )
    assert compute_text_content_hash(text_a) != compute_text_content_hash(text_b)