test_overlap_validation.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
"""
Tests for overlap_tokens validation that prevents an infinite loop.

These tests validate the fix for the bug where overlap_tokens >= max_tokens
caused an infinite loop in the chunking function.
"""
  6. from lightrag.rerank import chunk_documents_for_rerank
  7. class TestOverlapValidation:
  8. """Test suite for overlap_tokens validation"""
  def test_overlap_greater_than_max_tokens(self):
      """Chunking must return when overlap_tokens exceeds max_tokens.

      The expectation encoded by this test is that the overlap is clamped
      to 29 (max_tokens - 1) rather than stalling the chunking loop.
      """
      long_text = " ".join(f"word{i}" for i in range(100))
      documents = [long_text]

      # overlap_tokens (32) is deliberately larger than max_tokens (30).
      result = chunk_documents_for_rerank(
          documents, max_tokens=30, overlap_tokens=32
      )
      chunked_docs, doc_indices = result

      # Getting here at all means the call returned instead of hanging.
      assert len(chunked_docs) >= 1
      # Only one document was supplied, so every chunk must map to index 0.
      assert not any(idx != 0 for idx in doc_indices)
  def test_overlap_equal_to_max_tokens(self):
      """Chunking must return when overlap_tokens equals max_tokens.

      As in the "greater than" case, the test expects the overlap to be
      clamped to 29 (max_tokens - 1) so the call does not hang.
      """
      # Boundary case: overlap and window size are identical.
      limits = {"max_tokens": 30, "overlap_tokens": 30}
      documents = [" ".join(f"word{i}" for i in range(100))]

      chunked_docs, doc_indices = chunk_documents_for_rerank(
          documents, **limits
      )

      # The call completed, and it produced at least one chunk.
      assert len(chunked_docs) >= 1
      # Every chunk originates from the only input document.
      for source_index in doc_indices:
          assert source_index == 0
  def test_overlap_slightly_less_than_max_tokens(self):
      """An overlap one below max_tokens is valid and should chunk normally."""
      words = (f"word{i}" for i in range(100))
      documents = [" ".join(words)]

      # 29 < 30, so no clamping is expected for this call.
      outcome = chunk_documents_for_rerank(
          documents, max_tokens=30, overlap_tokens=29
      )
      chunked_docs, doc_indices = outcome

      # The call finished and yielded at least one chunk.
      assert len(chunked_docs) >= 1
      # All chunks belong to the single source document (index 0).
      assert not any(idx != 0 for idx in doc_indices)
  def test_small_max_tokens_with_large_overlap(self):
      """A tiny max_tokens paired with a larger overlap must still finish.

      With max_tokens=5 and overlap_tokens=10 the test expects the overlap
      to be clamped to 4.
      """
      text = " ".join(f"word{i}" for i in range(50))
      documents = [text]

      chunked_docs, doc_indices = chunk_documents_for_rerank(
          documents,
          max_tokens=5,
          overlap_tokens=10,
      )

      # Returning at all shows the loop made progress; expect some output.
      assert len(chunked_docs) >= 1
      # A single input document means index 0 is the only valid origin.
      for origin in doc_indices:
          assert origin == 0
  def test_multiple_documents_with_invalid_overlap(self):
      """Several documents are processed even when overlap_tokens > max_tokens."""
      first_long = " ".join(f"word{i}" for i in range(50))
      second_long = " ".join(f"word{i}" for i in range(75))
      documents = [first_long, "short document", second_long]

      # overlap_tokens (30) exceeds max_tokens (25) on purpose.
      # The index list is unpacked but not inspected by this test.
      chunked_docs, _doc_indices = chunk_documents_for_rerank(
          documents, max_tokens=25, overlap_tokens=30
      )

      # Chunking may only add entries, never drop a document.
      assert len(documents) <= len(chunked_docs)
      # The short document is expected to pass through unchanged.
      assert "short document" in chunked_docs
  def test_normal_operation_unaffected(self):
      """A valid configuration (overlap < max_tokens) keeps working as before."""
      long_doc = " ".join(f"word{i}" for i in range(100))
      documents = [long_doc, "short doc"]

      # Ordinary settings: overlap_tokens (10) is well below max_tokens (50).
      result = chunk_documents_for_rerank(
          documents, max_tokens=50, overlap_tokens=10
      )
      chunked_docs, doc_indices = result

      # Expect at least 3 chunks: 2 or more from the long doc plus the short one.
      assert len(chunked_docs) >= 3
      # The short document should appear verbatim among the chunks.
      assert "short doc" in chunked_docs

      # The final chunk must be attributed to the second document.
      last_origin = doc_indices[-1]
      assert last_origin == 1
  def test_edge_case_max_tokens_one(self):
      """Smallest window: max_tokens=1 with a larger overlap must not hang.

      The test expects overlap_tokens=5 to be clamped to 0 in this case.
      """
      limits = {"max_tokens": 1, "overlap_tokens": 5}
      documents = [" ".join(f"word{i}" for i in range(20))]

      chunked_docs, doc_indices = chunk_documents_for_rerank(
          documents, **limits
      )

      # The call returned, and it produced at least one chunk.
      assert len(chunked_docs) >= 1
      # Every chunk comes from the only document supplied (index 0).
      assert not any(idx != 0 for idx in doc_indices)