# test_postgres_index_name.py (8.1 KB)

"""
Unit tests for PostgreSQL safe index name generation.

This module tests the _safe_index_name helper function, which prevents
PostgreSQL's silent 63-byte identifier truncation from causing index
lookup failures.
"""
  7. import pytest
# Mark every test in this module as offline (no external dependencies).
pytestmark = pytest.mark.offline
  10. class TestSafeIndexName:
  11. """Test suite for _safe_index_name function."""
  12. def test_short_name_unchanged(self):
  13. """Short index names should remain unchanged."""
  14. from lightrag.kg.postgres_impl import _safe_index_name
  15. # Short table name - should return unchanged
  16. result = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
  17. assert result == "idx_lightrag_vdb_entity_hnsw_cosine"
  18. assert len(result.encode("utf-8")) <= 63
  19. def test_long_name_gets_hashed(self):
  20. """Long table names exceeding 63 bytes should get hashed."""
  21. from lightrag.kg.postgres_impl import _safe_index_name
  22. # Long table name that would exceed 63 bytes
  23. long_table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
  24. result = _safe_index_name(long_table_name, "hnsw_cosine")
  25. # Should be within 63 bytes
  26. assert len(result.encode("utf-8")) <= 63
  27. # Should start with idx_ prefix
  28. assert result.startswith("idx_")
  29. # Should contain the suffix
  30. assert result.endswith("_hnsw_cosine")
  31. # Should NOT be the naive concatenation (which would be truncated)
  32. naive_name = f"idx_{long_table_name.lower()}_hnsw_cosine"
  33. assert result != naive_name
  34. def test_deterministic_output(self):
  35. """Same input should always produce same output (deterministic)."""
  36. from lightrag.kg.postgres_impl import _safe_index_name
  37. table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
  38. suffix = "hnsw_cosine"
  39. result1 = _safe_index_name(table_name, suffix)
  40. result2 = _safe_index_name(table_name, suffix)
  41. assert result1 == result2
  42. def test_different_suffixes_different_results(self):
  43. """Different suffixes should produce different index names."""
  44. from lightrag.kg.postgres_impl import _safe_index_name
  45. table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
  46. result1 = _safe_index_name(table_name, "hnsw_cosine")
  47. result2 = _safe_index_name(table_name, "ivfflat_cosine")
  48. assert result1 != result2
  49. def test_case_insensitive(self):
  50. """Table names should be normalized to lowercase."""
  51. from lightrag.kg.postgres_impl import _safe_index_name
  52. result_upper = _safe_index_name("LIGHTRAG_VDB_ENTITY", "hnsw_cosine")
  53. result_lower = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
  54. assert result_upper == result_lower
  55. def test_boundary_case_exactly_63_bytes(self):
  56. """Test boundary case where name is exactly at 63-byte limit."""
  57. from lightrag.kg.postgres_impl import _safe_index_name
  58. # Create a table name that results in exactly 63 bytes
  59. # idx_ (4) + table_name + _ (1) + suffix = 63
  60. # So table_name + suffix = 58
  61. # Test a name that's just under the limit (should remain unchanged)
  62. short_suffix = "id"
  63. # idx_ (4) + 56 chars + _ (1) + id (2) = 63
  64. table_56 = "a" * 56
  65. result = _safe_index_name(table_56, short_suffix)
  66. expected = f"idx_{table_56}_{short_suffix}"
  67. assert result == expected
  68. assert len(result.encode("utf-8")) == 63
  69. def test_unicode_handling(self):
  70. """Unicode characters should be properly handled (bytes, not chars)."""
  71. from lightrag.kg.postgres_impl import _safe_index_name
  72. # Unicode characters can take more bytes than visible chars
  73. # Chinese characters are 3 bytes each in UTF-8
  74. table_name = "lightrag_测试_table" # Contains Chinese chars
  75. result = _safe_index_name(table_name, "hnsw_cosine")
  76. # Should always be within 63 bytes
  77. assert len(result.encode("utf-8")) <= 63
  78. def test_real_world_model_names(self):
  79. """Test with real-world embedding model names that cause issues."""
  80. from lightrag.kg.postgres_impl import _safe_index_name
  81. # These are actual model names that have caused issues
  82. test_cases = [
  83. ("LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d", "hnsw_cosine"),
  84. ("LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d", "hnsw_cosine"),
  85. ("LIGHTRAG_VDB_RELATION_text_embedding_3_large_3072d", "hnsw_cosine"),
  86. (
  87. "LIGHTRAG_VDB_ENTITY_bge_m3_1024d",
  88. "hnsw_cosine",
  89. ), # Shorter model name
  90. (
  91. "LIGHTRAG_VDB_CHUNKS_nomic_embed_text_v1_768d",
  92. "ivfflat_cosine",
  93. ), # Different index type
  94. ]
  95. for table_name, suffix in test_cases:
  96. result = _safe_index_name(table_name, suffix)
  97. # Critical: must be within PostgreSQL's 63-byte limit
  98. assert (
  99. len(result.encode("utf-8")) <= 63
  100. ), f"Index name too long: {result} for table {table_name}"
  101. # Must have consistent format
  102. assert result.startswith("idx_"), f"Missing idx_ prefix: {result}"
  103. assert result.endswith(f"_{suffix}"), f"Missing suffix {suffix}: {result}"
  104. def test_hash_uniqueness_for_similar_tables(self):
  105. """Similar but different table names should produce different hashes."""
  106. from lightrag.kg.postgres_impl import _safe_index_name
  107. # These tables have similar names but should have different hashes
  108. tables = [
  109. "LIGHTRAG_VDB_CHUNKS_model_a_1024d",
  110. "LIGHTRAG_VDB_CHUNKS_model_b_1024d",
  111. "LIGHTRAG_VDB_ENTITY_model_a_1024d",
  112. ]
  113. results = [_safe_index_name(t, "hnsw_cosine") for t in tables]
  114. # All results should be unique
  115. assert len(set(results)) == len(results), "Hash collision detected!"
  116. class TestIndexNameIntegration:
  117. """Integration-style tests for index name usage patterns."""
  118. def test_pg_indexes_lookup_compatibility(self):
  119. """
  120. Test that the generated index name will work with pg_indexes lookup.
  121. This is the core problem: PostgreSQL stores the truncated name,
  122. but we were looking up the untruncated name. Our fix ensures we
  123. always use a name that fits within 63 bytes.
  124. """
  125. from lightrag.kg.postgres_impl import _safe_index_name
  126. table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
  127. suffix = "hnsw_cosine"
  128. # Generate the index name
  129. index_name = _safe_index_name(table_name, suffix)
  130. # Simulate what PostgreSQL would store (truncate at 63 bytes)
  131. stored_name = index_name.encode("utf-8")[:63].decode("utf-8", errors="ignore")
  132. # The key fix: our generated name should equal the stored name
  133. # because it's already within the 63-byte limit
  134. assert (
  135. index_name == stored_name
  136. ), "Index name would be truncated by PostgreSQL, causing lookup failures!"
  137. def test_backward_compatibility_short_names(self):
  138. """
  139. Ensure backward compatibility with existing short index names.
  140. For tables that have existing indexes with short names (pre-model-suffix era),
  141. the function should not change their names.
  142. """
  143. from lightrag.kg.postgres_impl import _safe_index_name
  144. # Legacy table names without model suffix
  145. legacy_tables = [
  146. "LIGHTRAG_VDB_ENTITY",
  147. "LIGHTRAG_VDB_RELATION",
  148. "LIGHTRAG_VDB_CHUNKS",
  149. ]
  150. for table in legacy_tables:
  151. for suffix in ["hnsw_cosine", "ivfflat_cosine", "id"]:
  152. result = _safe_index_name(table, suffix)
  153. expected = f"idx_{table.lower()}_{suffix}"
  154. # Short names should remain unchanged for backward compatibility
  155. if len(expected.encode("utf-8")) <= 63:
  156. assert (
  157. result == expected
  158. ), f"Short name changed unexpectedly: {result} != {expected}"