test_postgres_doc_status_lookup.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. """Unit tests for PGDocStatusStorage database-native overrides.
  2. Covers the PG-specific implementations of:
  3. * get_doc_by_file_basename
  4. * get_doc_by_content_hash
  5. Both override the base-class full-table scan with indexed SQL queries.
  6. """
  7. from datetime import datetime
  8. import pytest
  9. from unittest.mock import AsyncMock, MagicMock
  10. from lightrag.kg.postgres_impl import PGDocStatusStorage
  11. from lightrag.namespace import NameSpace
  12. def _make_storage():
  13. storage = PGDocStatusStorage.__new__(PGDocStatusStorage)
  14. storage.namespace = NameSpace.DOC_STATUS
  15. storage.workspace = "test_ws"
  16. storage.global_config = {"embedding_batch_num": 10}
  17. storage.db = MagicMock()
  18. storage.db.query = AsyncMock()
  19. return storage
  20. def _row(**overrides):
  21. base = {
  22. "id": "doc-1",
  23. "content_summary": "summary",
  24. "content_length": 12,
  25. "chunks_count": 1,
  26. "status": "processed",
  27. "file_path": "report.pdf",
  28. "chunks_list": "[]",
  29. "metadata": "{}",
  30. "error_msg": None,
  31. "track_id": None,
  32. "content_hash": "abc123",
  33. "created_at": datetime(2024, 1, 1, 0, 0, 0),
  34. "updated_at": datetime(2024, 1, 1, 0, 0, 0),
  35. }
  36. base.update(overrides)
  37. return base
  38. # ---------------------------------------------------------------------------
  39. # get_doc_by_file_basename
  40. # ---------------------------------------------------------------------------
  41. @pytest.mark.asyncio
  42. async def test_get_doc_by_file_basename_empty_returns_none():
  43. storage = _make_storage()
  44. assert await storage.get_doc_by_file_basename("") is None
  45. storage.db.query.assert_not_called()
  46. @pytest.mark.asyncio
  47. async def test_get_doc_by_file_basename_unknown_source_returns_none():
  48. storage = _make_storage()
  49. # normalize_document_file_path returns "unknown_source" for None-ish inputs
  50. assert await storage.get_doc_by_file_basename("unknown_source") is None
  51. storage.db.query.assert_not_called()
  52. @pytest.mark.asyncio
  53. async def test_get_doc_by_file_basename_exact_match():
  54. storage = _make_storage()
  55. storage.db.query.return_value = [_row(file_path="report.pdf")]
  56. result = await storage.get_doc_by_file_basename("report.pdf")
  57. assert result is not None
  58. doc_id, doc = result
  59. assert doc_id == "doc-1"
  60. assert doc["file_path"] == "report.pdf"
  61. assert doc["content_hash"] == "abc123"
  62. call = storage.db.query.call_args
  63. sql = call.args[0]
  64. params = call.args[1]
  65. assert "LIGHTRAG_DOC_STATUS" in sql
  66. assert "workspace=$1" in sql
  67. assert params[0] == "test_ws"
  68. assert params[1] == "report.pdf"
  69. assert params == ["test_ws", "report.pdf"]
  70. assert "LIKE" not in sql
  71. @pytest.mark.asyncio
  72. async def test_get_doc_by_file_basename_orders_stably_for_canonical_rows():
  73. storage = _make_storage()
  74. storage.db.query.return_value = [_row(id="doc-exact", file_path="report.pdf")]
  75. result = await storage.get_doc_by_file_basename("report.pdf")
  76. assert result is not None
  77. assert result[0] == "doc-exact"
  78. sql = storage.db.query.call_args.args[0]
  79. assert "file_path = $2" in sql
  80. assert "created_at ASC" in sql
  81. assert "id ASC" in sql
  82. @pytest.mark.asyncio
  83. async def test_get_doc_by_file_basename_uses_exact_match_for_like_metacharacters():
  84. storage = _make_storage()
  85. storage.db.query.return_value = []
  86. await storage.get_doc_by_file_basename("100%_off.pdf")
  87. sql = storage.db.query.call_args.args[0]
  88. params = storage.db.query.call_args.args[1]
  89. assert "LIKE" not in sql
  90. assert params == ["test_ws", "100%_off.pdf"]
  91. @pytest.mark.asyncio
  92. async def test_get_doc_by_file_basename_no_match_returns_none():
  93. storage = _make_storage()
  94. storage.db.query.return_value = []
  95. assert await storage.get_doc_by_file_basename("missing.pdf") is None
  96. # ---------------------------------------------------------------------------
  97. # get_doc_by_content_hash
  98. # ---------------------------------------------------------------------------
  99. @pytest.mark.asyncio
  100. async def test_get_doc_by_content_hash_empty_returns_none():
  101. storage = _make_storage()
  102. assert await storage.get_doc_by_content_hash("") is None
  103. storage.db.query.assert_not_called()
  104. @pytest.mark.asyncio
  105. async def test_get_doc_by_content_hash_match():
  106. storage = _make_storage()
  107. storage.db.query.return_value = [_row(content_hash="hash-abc")]
  108. result = await storage.get_doc_by_content_hash("hash-abc")
  109. assert result is not None
  110. doc_id, doc = result
  111. assert doc_id == "doc-1"
  112. assert doc["content_hash"] == "hash-abc"
  113. call = storage.db.query.call_args
  114. sql = call.args[0]
  115. params = call.args[1]
  116. assert "content_hash=$2" in sql
  117. # Stable ordering for repeatability across re-runs / replicas
  118. assert "ORDER BY created_at ASC, id ASC" in sql
  119. assert "LIMIT 1" in sql
  120. assert params == ["test_ws", "hash-abc"]
  121. @pytest.mark.asyncio
  122. async def test_get_doc_by_content_hash_no_match_returns_none():
  123. storage = _make_storage()
  124. storage.db.query.return_value = []
  125. assert await storage.get_doc_by_content_hash("nope") is None