test_sidecar_uri.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. """Unit tests for the sidecar URI helpers and document-path canonicalization
  2. introduced when ``full_docs`` collapsed its four path fields to
  3. ``file_path`` + ``sidecar_location``.
  4. """
  5. from __future__ import annotations
  6. from pathlib import Path
  7. import pytest
  8. from lightrag.utils_pipeline import (
  9. SIDECAR_LOCATION_UNKNOWN,
  10. normalize_document_file_path,
  11. parsed_artifact_dir_for,
  12. resolve_sidecar_uri,
  13. sidecar_assets_dir_for_uri,
  14. sidecar_blocks_path,
  15. sidecar_modality_path,
  16. sidecar_uri_for,
  17. )
  18. @pytest.mark.offline
  19. def test_normalize_strips_hint_and_directory():
  20. assert normalize_document_file_path("abc.[native-iet].docx") == "abc.docx"
  21. assert normalize_document_file_path("/tmp/sub/abc.docx") == "abc.docx"
  22. assert normalize_document_file_path("abc.docx") == "abc.docx"
  23. @pytest.mark.offline
  24. def test_normalize_idempotent():
  25. once = normalize_document_file_path("/tmp/abc.[native].docx")
  26. twice = normalize_document_file_path(once)
  27. assert once == twice == "abc.docx"
  28. @pytest.mark.offline
  29. @pytest.mark.parametrize(
  30. "value",
  31. ["", None, "no-file-path", "unknown_source", " "],
  32. )
  33. def test_normalize_maps_placeholders_to_unknown(value):
  34. assert normalize_document_file_path(value) == "unknown_source"
  35. @pytest.mark.offline
  36. def test_sidecar_uri_round_trip_ascii(tmp_path):
  37. sidecar_dir = tmp_path / "abc.docx.parsed"
  38. sidecar_dir.mkdir()
  39. uri = sidecar_uri_for(sidecar_dir)
  40. assert uri.startswith("file://")
  41. assert uri.endswith("/")
  42. resolved = resolve_sidecar_uri(uri)
  43. assert resolved == sidecar_dir.resolve()
  44. @pytest.mark.offline
  45. def test_sidecar_uri_round_trip_unicode_and_spaces(tmp_path):
  46. sidecar_dir = tmp_path / "中文 报告.docx.parsed"
  47. sidecar_dir.mkdir()
  48. uri = sidecar_uri_for(sidecar_dir)
  49. assert uri.startswith("file://")
  50. assert " " not in uri # spaces are percent-encoded
  51. resolved = resolve_sidecar_uri(uri)
  52. assert resolved == sidecar_dir.resolve()
  53. @pytest.mark.offline
  54. def test_resolve_sidecar_uri_tolerates_missing_trailing_slash(tmp_path):
  55. sidecar_dir = tmp_path / "demo.parsed"
  56. sidecar_dir.mkdir()
  57. uri_no_slash = sidecar_uri_for(sidecar_dir).rstrip("/")
  58. assert resolve_sidecar_uri(uri_no_slash) == sidecar_dir.resolve()
  59. @pytest.mark.offline
  60. @pytest.mark.parametrize(
  61. "uri",
  62. [None, "", SIDECAR_LOCATION_UNKNOWN, "s3://bucket/path/"],
  63. )
  64. def test_resolve_sidecar_uri_returns_none_for_unsupported(uri):
  65. assert resolve_sidecar_uri(uri) is None
  66. @pytest.mark.offline
  67. def test_sidecar_blocks_path_locates_jsonl(tmp_path):
  68. sidecar_dir = tmp_path / "demo.docx.parsed"
  69. sidecar_dir.mkdir()
  70. blocks = sidecar_dir / "demo.blocks.jsonl"
  71. blocks.write_text("", encoding="utf-8")
  72. uri = sidecar_uri_for(sidecar_dir)
  73. assert sidecar_blocks_path(uri) == str(blocks)
  74. assert sidecar_modality_path(uri, "tables") == str(sidecar_dir / "demo.tables.json")
  75. assert sidecar_assets_dir_for_uri(uri) == Path(sidecar_dir / "demo.blocks.assets")
  76. @pytest.mark.offline
  77. def test_sidecar_blocks_path_returns_none_when_missing(tmp_path):
  78. empty = tmp_path / "empty.parsed"
  79. empty.mkdir()
  80. uri = sidecar_uri_for(empty)
  81. assert sidecar_blocks_path(uri) is None
  82. assert sidecar_modality_path(uri, "drawings") is None
  83. assert sidecar_assets_dir_for_uri(uri) is None
  84. @pytest.mark.offline
  85. def test_parsed_artifact_dir_for_uses_input_dir(tmp_path, monkeypatch):
  86. monkeypatch.setenv("INPUT_DIR", str(tmp_path))
  87. result = parsed_artifact_dir_for("demo.docx")
  88. assert result == tmp_path / "__parsed__" / "demo.docx.parsed"
  89. @pytest.mark.offline
  90. def test_parsed_artifact_dir_for_strips_hint(tmp_path, monkeypatch):
  91. monkeypatch.setenv("INPUT_DIR", str(tmp_path))
  92. result = parsed_artifact_dir_for("abc.[native-iet].docx")
  93. assert result == tmp_path / "__parsed__" / "abc.docx.parsed"