# wxcz_admin / lightrag-cn-git
"""
Unit tests for PostgreSQL safe index name generation.

This module tests the _safe_index_name helper function which prevents
PostgreSQL's silent 63-byte identifier truncation from causing index
lookup failures.
"""

import pytest

# Mark all tests as offline (no external dependencies)
pytestmark = pytest.mark.offline


class TestSafeIndexName:
    """Test suite for the ``_safe_index_name`` helper.

    The tests pin the contract exercised below: given a table name and an
    index suffix, the helper returns ``idx_{table_name lowercased}_{suffix}``
    when that fits in PostgreSQL's identifier limit, and otherwise a
    different name that still starts with ``idx_``, still ends with
    ``_{suffix}``, and fits within the limit.
    """

    # PostgreSQL identifier limit, measured in bytes (not characters).
    MAX_IDENTIFIER_BYTES = 63

    def test_short_name_unchanged(self):
        """Short index names should remain unchanged."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # Short table name - should return unchanged
        result = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
        assert result == "idx_lightrag_vdb_entity_hnsw_cosine"
        assert len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

    def test_long_name_gets_hashed(self):
        """Long table names exceeding 63 bytes should get hashed."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # Long table name that would exceed 63 bytes
        long_table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
        naive_name = f"idx_{long_table_name.lower()}_hnsw_cosine"

        # Guard the premise of this test: the naive name really is too long.
        assert len(naive_name.encode("utf-8")) > self.MAX_IDENTIFIER_BYTES

        result = _safe_index_name(long_table_name, "hnsw_cosine")

        # Should be within 63 bytes
        assert len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

        # Should start with idx_ prefix
        assert result.startswith("idx_")

        # Should contain the suffix
        assert result.endswith("_hnsw_cosine")

        # Should NOT be the naive concatenation (which would be truncated)
        assert result != naive_name

    def test_deterministic_output(self):
        """Same input should always produce same output (deterministic)."""
        from lightrag.kg.postgres_impl import _safe_index_name

        table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
        suffix = "hnsw_cosine"

        result1 = _safe_index_name(table_name, suffix)
        result2 = _safe_index_name(table_name, suffix)

        assert result1 == result2

    def test_different_suffixes_different_results(self):
        """Different suffixes should produce different index names."""
        from lightrag.kg.postgres_impl import _safe_index_name

        table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"

        result1 = _safe_index_name(table_name, "hnsw_cosine")
        result2 = _safe_index_name(table_name, "ivfflat_cosine")

        assert result1 != result2

    def test_case_insensitive(self):
        """Table names should be normalized to lowercase."""
        from lightrag.kg.postgres_impl import _safe_index_name

        result_upper = _safe_index_name("LIGHTRAG_VDB_ENTITY", "hnsw_cosine")
        result_lower = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")

        assert result_upper == result_lower

    def test_boundary_case_exactly_63_bytes(self):
        """Test both sides of the 63-byte boundary (63 kept, 64 shortened)."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # idx_ (4) + table_name + _ (1) + suffix = total bytes
        short_suffix = "id"

        # Exactly at the limit: idx_ (4) + 56 chars + _ (1) + id (2) = 63,
        # so the name must be returned unchanged.
        table_56 = "a" * 56
        result = _safe_index_name(table_56, short_suffix)
        expected = f"idx_{table_56}_{short_suffix}"
        assert result == expected
        assert len(result.encode("utf-8")) == self.MAX_IDENTIFIER_BYTES

        # One byte over the limit: idx_ (4) + 57 chars + _ (1) + id (2) = 64,
        # so the naive name can no longer be used as-is.
        table_57 = "a" * 57
        naive_over = f"idx_{table_57}_{short_suffix}"
        assert len(naive_over.encode("utf-8")) == self.MAX_IDENTIFIER_BYTES + 1

        result_over = _safe_index_name(table_57, short_suffix)
        assert len(result_over.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES
        assert result_over != naive_over

    def test_unicode_handling(self):
        """Unicode characters should be properly handled (bytes, not chars)."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # Unicode characters can take more bytes than visible chars
        # Chinese characters are 3 bytes each in UTF-8
        table_name = "lightrag_测试_table"  # Contains Chinese chars
        result = _safe_index_name(table_name, "hnsw_cosine")

        # Should always be within 63 bytes
        assert len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

        # The short name above fits under either measure, so it cannot tell a
        # byte count from a character count. This name is under 63 characters
        # but over 63 bytes, which only a byte-based check handles correctly.
        wide_table_name = "lightrag_" + "测试" * 8 + "_table"
        naive_wide = f"idx_{wide_table_name}_hnsw_cosine"
        assert len(naive_wide) <= self.MAX_IDENTIFIER_BYTES
        assert len(naive_wide.encode("utf-8")) > self.MAX_IDENTIFIER_BYTES

        wide_result = _safe_index_name(wide_table_name, "hnsw_cosine")
        assert len(wide_result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

    def test_real_world_model_names(self):
        """Test with real-world embedding model names that cause issues."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # These are actual model names that have caused issues
        test_cases = [
            ("LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d", "hnsw_cosine"),
            ("LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d", "hnsw_cosine"),
            ("LIGHTRAG_VDB_RELATION_text_embedding_3_large_3072d", "hnsw_cosine"),
            (
                "LIGHTRAG_VDB_ENTITY_bge_m3_1024d",
                "hnsw_cosine",
            ),  # Shorter model name
            (
                "LIGHTRAG_VDB_CHUNKS_nomic_embed_text_v1_768d",
                "ivfflat_cosine",
            ),  # Different index type
        ]

        for table_name, suffix in test_cases:
            result = _safe_index_name(table_name, suffix)

            # Critical: must be within PostgreSQL's 63-byte limit
            assert (
                len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES
            ), f"Index name too long: {result} for table {table_name}"

            # Must have consistent format
            assert result.startswith("idx_"), f"Missing idx_ prefix: {result}"
            assert result.endswith(f"_{suffix}"), f"Missing suffix {suffix}: {result}"

    def test_hash_uniqueness_for_similar_tables(self):
        """Similar but different table names should produce different hashes."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # Short names: these fit within the limit, so they only check that
        # the plain concatenated names stay distinct.
        short_tables = [
            "LIGHTRAG_VDB_CHUNKS_model_a_1024d",
            "LIGHTRAG_VDB_CHUNKS_model_b_1024d",
            "LIGHTRAG_VDB_ENTITY_model_a_1024d",
        ]

        # Long names: each naive index name exceeds the limit, so these are
        # the cases that actually go through the shortening path.
        long_tables = [
            "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d",
            "LIGHTRAG_VDB_CHUNKS_text_embedding_3_small_3072d",
            "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d",
        ]
        for t in long_tables:
            naive = f"idx_{t.lower()}_hnsw_cosine"
            assert len(naive.encode("utf-8")) > self.MAX_IDENTIFIER_BYTES

        tables = short_tables + long_tables
        results = [_safe_index_name(t, "hnsw_cosine") for t in tables]

        # All results should be unique
        assert len(set(results)) == len(results), "Hash collision detected!"


class TestIndexNameIntegration:
    """Usage-pattern checks for names produced by ``_safe_index_name``."""

    def test_pg_indexes_lookup_compatibility(self):
        """
        Verify a generated index name survives a 63-byte truncation intact.

        The original failure mode: PostgreSQL stored a truncated name while
        the lookup used the untruncated one. A name that is already within
        63 bytes is identical before and after truncation, so the lookup
        matches.
        """
        from lightrag.kg.postgres_impl import _safe_index_name

        index_name = _safe_index_name(
            "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d",
            "hnsw_cosine",
        )

        # Mimic the stored form: keep the first 63 bytes and drop any
        # partial multi-byte character left at the cut.
        encoded = index_name.encode("utf-8")
        stored_name = encoded[:63].decode("utf-8", errors="ignore")

        # Equality here means truncation would have been a no-op.
        assert (
            index_name == stored_name
        ), "Index name would be truncated by PostgreSQL, causing lookup failures!"

    def test_backward_compatibility_short_names(self):
        """
        Verify legacy short index names come back unchanged.

        Tables without a model suffix already have indexes named
        ``idx_{table lowercased}_{suffix}``; the helper must keep producing
        exactly that name for them.
        """
        from lightrag.kg.postgres_impl import _safe_index_name

        # Legacy table names without model suffix
        legacy_tables = (
            "LIGHTRAG_VDB_ENTITY",
            "LIGHTRAG_VDB_RELATION",
            "LIGHTRAG_VDB_CHUNKS",
        )
        suffixes = ("hnsw_cosine", "ivfflat_cosine", "id")

        # Every table/suffix combination, table-major (same order as nested loops).
        combinations = [(table, suffix) for table in legacy_tables for suffix in suffixes]

        for table, suffix in combinations:
            result = _safe_index_name(table, suffix)
            expected = "idx_" + table.lower() + "_" + suffix

            # Only names that fit within the limit are required to be unchanged.
            if len(expected.encode("utf-8")) > 63:
                continue

            assert (
                result == expected
            ), f"Short name changed unexpectedly: {result} != {expected}"