| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210 |
- """
- Unit tests for PostgreSQL safe index name generation.
- This module tests the _safe_index_name helper function which prevents
- PostgreSQL's silent 63-byte identifier truncation from causing index
- lookup failures.
- """
- import pytest
- # Mark all tests as offline (no external dependencies)
- pytestmark = pytest.mark.offline
- class TestSafeIndexName:
- """Test suite for _safe_index_name function."""
- def test_short_name_unchanged(self):
- """Short index names should remain unchanged."""
- from lightrag.kg.postgres_impl import _safe_index_name
- # Short table name - should return unchanged
- result = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
- assert result == "idx_lightrag_vdb_entity_hnsw_cosine"
- assert len(result.encode("utf-8")) <= 63
- def test_long_name_gets_hashed(self):
- """Long table names exceeding 63 bytes should get hashed."""
- from lightrag.kg.postgres_impl import _safe_index_name
- # Long table name that would exceed 63 bytes
- long_table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
- result = _safe_index_name(long_table_name, "hnsw_cosine")
- # Should be within 63 bytes
- assert len(result.encode("utf-8")) <= 63
- # Should start with idx_ prefix
- assert result.startswith("idx_")
- # Should contain the suffix
- assert result.endswith("_hnsw_cosine")
- # Should NOT be the naive concatenation (which would be truncated)
- naive_name = f"idx_{long_table_name.lower()}_hnsw_cosine"
- assert result != naive_name
- def test_deterministic_output(self):
- """Same input should always produce same output (deterministic)."""
- from lightrag.kg.postgres_impl import _safe_index_name
- table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
- suffix = "hnsw_cosine"
- result1 = _safe_index_name(table_name, suffix)
- result2 = _safe_index_name(table_name, suffix)
- assert result1 == result2
- def test_different_suffixes_different_results(self):
- """Different suffixes should produce different index names."""
- from lightrag.kg.postgres_impl import _safe_index_name
- table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
- result1 = _safe_index_name(table_name, "hnsw_cosine")
- result2 = _safe_index_name(table_name, "ivfflat_cosine")
- assert result1 != result2
- def test_case_insensitive(self):
- """Table names should be normalized to lowercase."""
- from lightrag.kg.postgres_impl import _safe_index_name
- result_upper = _safe_index_name("LIGHTRAG_VDB_ENTITY", "hnsw_cosine")
- result_lower = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
- assert result_upper == result_lower
- def test_boundary_case_exactly_63_bytes(self):
- """Test boundary case where name is exactly at 63-byte limit."""
- from lightrag.kg.postgres_impl import _safe_index_name
- # Create a table name that results in exactly 63 bytes
- # idx_ (4) + table_name + _ (1) + suffix = 63
- # So table_name + suffix = 58
- # Test a name that's just under the limit (should remain unchanged)
- short_suffix = "id"
- # idx_ (4) + 56 chars + _ (1) + id (2) = 63
- table_56 = "a" * 56
- result = _safe_index_name(table_56, short_suffix)
- expected = f"idx_{table_56}_{short_suffix}"
- assert result == expected
- assert len(result.encode("utf-8")) == 63
- def test_unicode_handling(self):
- """Unicode characters should be properly handled (bytes, not chars)."""
- from lightrag.kg.postgres_impl import _safe_index_name
- # Unicode characters can take more bytes than visible chars
- # Chinese characters are 3 bytes each in UTF-8
- table_name = "lightrag_测试_table" # Contains Chinese chars
- result = _safe_index_name(table_name, "hnsw_cosine")
- # Should always be within 63 bytes
- assert len(result.encode("utf-8")) <= 63
- def test_real_world_model_names(self):
- """Test with real-world embedding model names that cause issues."""
- from lightrag.kg.postgres_impl import _safe_index_name
- # These are actual model names that have caused issues
- test_cases = [
- ("LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d", "hnsw_cosine"),
- ("LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d", "hnsw_cosine"),
- ("LIGHTRAG_VDB_RELATION_text_embedding_3_large_3072d", "hnsw_cosine"),
- (
- "LIGHTRAG_VDB_ENTITY_bge_m3_1024d",
- "hnsw_cosine",
- ), # Shorter model name
- (
- "LIGHTRAG_VDB_CHUNKS_nomic_embed_text_v1_768d",
- "ivfflat_cosine",
- ), # Different index type
- ]
- for table_name, suffix in test_cases:
- result = _safe_index_name(table_name, suffix)
- # Critical: must be within PostgreSQL's 63-byte limit
- assert (
- len(result.encode("utf-8")) <= 63
- ), f"Index name too long: {result} for table {table_name}"
- # Must have consistent format
- assert result.startswith("idx_"), f"Missing idx_ prefix: {result}"
- assert result.endswith(f"_{suffix}"), f"Missing suffix {suffix}: {result}"
- def test_hash_uniqueness_for_similar_tables(self):
- """Similar but different table names should produce different hashes."""
- from lightrag.kg.postgres_impl import _safe_index_name
- # These tables have similar names but should have different hashes
- tables = [
- "LIGHTRAG_VDB_CHUNKS_model_a_1024d",
- "LIGHTRAG_VDB_CHUNKS_model_b_1024d",
- "LIGHTRAG_VDB_ENTITY_model_a_1024d",
- ]
- results = [_safe_index_name(t, "hnsw_cosine") for t in tables]
- # All results should be unique
- assert len(set(results)) == len(results), "Hash collision detected!"
- class TestIndexNameIntegration:
- """Integration-style tests for index name usage patterns."""
- def test_pg_indexes_lookup_compatibility(self):
- """
- Test that the generated index name will work with pg_indexes lookup.
- This is the core problem: PostgreSQL stores the truncated name,
- but we were looking up the untruncated name. Our fix ensures we
- always use a name that fits within 63 bytes.
- """
- from lightrag.kg.postgres_impl import _safe_index_name
- table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
- suffix = "hnsw_cosine"
- # Generate the index name
- index_name = _safe_index_name(table_name, suffix)
- # Simulate what PostgreSQL would store (truncate at 63 bytes)
- stored_name = index_name.encode("utf-8")[:63].decode("utf-8", errors="ignore")
- # The key fix: our generated name should equal the stored name
- # because it's already within the 63-byte limit
- assert (
- index_name == stored_name
- ), "Index name would be truncated by PostgreSQL, causing lookup failures!"
- def test_backward_compatibility_short_names(self):
- """
- Ensure backward compatibility with existing short index names.
- For tables that have existing indexes with short names (pre-model-suffix era),
- the function should not change their names.
- """
- from lightrag.kg.postgres_impl import _safe_index_name
- # Legacy table names without model suffix
- legacy_tables = [
- "LIGHTRAG_VDB_ENTITY",
- "LIGHTRAG_VDB_RELATION",
- "LIGHTRAG_VDB_CHUNKS",
- ]
- for table in legacy_tables:
- for suffix in ["hnsw_cosine", "ivfflat_cosine", "id"]:
- result = _safe_index_name(table, suffix)
- expected = f"idx_{table.lower()}_{suffix}"
- # Short names should remain unchanged for backward compatibility
- if len(expected.encode("utf-8")) <= 63:
- assert (
- result == expected
- ), f"Short name changed unexpectedly: {result} != {expected}"
|