# test_content_hash_normalization.py
"""Cross-filename content_hash dedup via merged_text normalization.

Sidecar-rendered bodies embed ``tb-<doc_hash>-NNNN`` / ``im-<doc_hash>-NNNN`` /
``eq-<doc_hash>-NNNN`` ids and ``path="<base>.blocks.assets/..."`` asset
references — both derive from the filename for pending_parse uploads.
``compute_text_content_hash`` normalizes those surfaces before hashing so
the same content under two filenames produces the same ``content_hash``
and post-parse dedup fires.
"""

  9. from __future__ import annotations
  10. from lightrag.utils_pipeline import (
  11. compute_text_content_hash,
  12. normalize_merged_text_for_hash,
  13. )
  14. def _render(doc_hash: str, base_name: str) -> str:
  15. """Approximate the sidecar writer's merged_text for a doc with one table,
  16. one drawing, and one block equation."""
  17. return (
  18. "标题 1\n\n"
  19. f'正文段落引用表格 <table id="tb-{doc_hash}-0001" format="json">[[]]</table>。\n\n'
  20. f'<drawing id="im-{doc_hash}-0001" format="png" '
  21. f'path="{base_name}.blocks.assets/image1.png" src="rId4" />\n\n'
  22. f'<equation id="eq-{doc_hash}-0001" format="latex">E=mc^2</equation>'
  23. )
  24. def test_same_content_different_filename_dedupes():
  25. """Same merged_text with two different doc_hash / base names hashes to
  26. the same content_hash."""
  27. text_a = _render("a" * 32, "report-A")
  28. text_b = _render("b" * 32, "report-B")
  29. assert text_a != text_b, "sanity: raw bodies must differ"
  30. assert compute_text_content_hash(text_a) == compute_text_content_hash(text_b)
  31. def test_different_content_still_distinguishes():
  32. """Distinct bodies (different block text) still produce distinct hashes
  33. after normalization."""
  34. text_a = _render("a" * 32, "doc-A") + "\n\n附加段落 X"
  35. text_b = _render("a" * 32, "doc-A") + "\n\n附加段落 Y"
  36. assert compute_text_content_hash(text_a) != compute_text_content_hash(text_b)
  37. def test_plain_text_unaffected():
  38. """RAW text without sidecar markup is passed through unchanged so its
  39. hash matches the legacy ``MD5(text)`` value."""
  40. plain = "纯文本上传,没有任何 sidecar 标签。"
  41. assert normalize_merged_text_for_hash(plain) == plain
  42. def test_asset_filename_still_distinguishes():
  43. """The asset filename suffix is preserved — only the ``<base>.blocks.assets/``
  44. prefix is stripped — so two drawings pointing at different images still
  45. yield different hashes."""
  46. h = "c" * 32
  47. text_a = (
  48. f'<drawing id="im-{h}-0001" format="png" '
  49. f'path="doc.blocks.assets/image1.png" src="r1" />'
  50. )
  51. text_b = (
  52. f'<drawing id="im-{h}-0001" format="png" '
  53. f'path="doc.blocks.assets/image2.png" src="r1" />'
  54. )
  55. assert compute_text_content_hash(text_a) != compute_text_content_hash(text_b)