| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- """Offline tests for the unified VLM image_inputs path."""
- from __future__ import annotations
- import base64
- import hashlib
- from typing import Any
- import pytest
- from lightrag.llm._vision_utils import (
- image_audit_metadata,
- image_cache_metadata,
- normalize_image_inputs,
- )
# Apply the ``offline`` marker to every test collected from this module.
pytestmark = pytest.mark.offline
# Small PNG fixture: the 8-byte PNG signature followed by IHDR, IDAT and
# IEND chunks. Used wherever a test needs bytes that look like a PNG.
PNG_BYTES = (
    b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01"
    b"\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\rIDATx\x9cc\xf8"
    b"\xcf\xc0\x00\x00\x00\x03\x00\x01\x5c\xcc\xd9\x9e\x00\x00\x00\x00"
    b"IEND\xaeB`\x82"
)

# JPEG-looking fixture: SOI marker plus the start of a JFIF APP0 segment,
# padded with 16 zero bytes. It is only a header stub, not a full image.
JPEG_BYTES = (
    b"\xff\xd8\xff\xe0\x00\x10JFIF"
    + b"\x00" * 16
)
- def _b64(raw: bytes) -> str:
- return base64.b64encode(raw).decode("ascii")
def test_normalize_accepts_raw_base64_and_detects_png():
    """A bare base64 string yields one PNG image with no provenance fields set."""
    normalized = normalize_image_inputs([_b64(PNG_BYTES)])
    assert len(normalized) == 1

    image = normalized[0]
    expected_digest = hashlib.sha256(PNG_BYTES).hexdigest()

    assert image.index == 0
    assert image.mime_type == "image/png"
    assert image.raw_bytes == PNG_BYTES
    assert image.sha256 == expected_digest
    # No metadata was supplied, so both source fields are expected to be None.
    assert image.source_id is None
    assert image.source_file is None
def test_normalize_accepts_data_url_and_uses_declared_mime():
    """A ``data:`` URL declaring image/jpeg produces one image/jpeg entry."""
    payload = _b64(JPEG_BYTES)
    normalized = normalize_image_inputs([f"data:image/jpeg;base64,{payload}"])

    assert len(normalized) == 1
    first = normalized[0]
    assert first.mime_type == "image/jpeg"
def test_normalize_accepts_dict_with_metadata():
    """Metadata keys on a dict input are exposed as attributes on the result."""
    payload: dict[str, Any] = {
        "base64": _b64(PNG_BYTES),
        "mime_type": "image/png",
        "source_id": "img-001",
        "source_file": "/tmp/foo.png",
        "modality": "image",
        "doc_id": "doc-1",
    }

    # Single-element unpacking also checks that exactly one image came back.
    (image,) = normalize_image_inputs([payload])

    assert image.source_id == "img-001"
    assert image.source_file == "/tmp/foo.png"
    assert image.modality == "image"
    assert image.doc_id == "doc-1"
def test_normalize_empty_returns_empty_list():
    """Both ``None`` and an empty list normalize to an empty list."""
    for nothing in (None, []):
        assert normalize_image_inputs(nothing) == []
def test_normalize_rejects_invalid_base64():
    """A string that is not valid base64 is expected to raise ``ValueError``."""
    garbage = "this is not base64@@@!!"
    with pytest.raises(ValueError):
        normalize_image_inputs([garbage])
def test_normalize_rejects_unsupported_element_type():
    """An integer element is expected to raise ``TypeError``."""
    not_an_image = 12345
    with pytest.raises(TypeError):
        normalize_image_inputs([not_an_image])
def test_normalize_rejects_dict_without_base64():
    """A dict lacking the ``base64`` key is expected to raise ``ValueError``."""
    missing_payload = {"mime_type": "image/png"}
    with pytest.raises(ValueError):
        normalize_image_inputs([missing_payload])
def test_cache_metadata_excludes_source_identifiers():
    """Cache metadata keeps content facts but omits source_id / source_file."""
    tagged_input = {
        "base64": _b64(PNG_BYTES),
        "source_id": "leak-id",
        "source_file": "/leak/path.png",
    }
    normalized = normalize_image_inputs([tagged_input])

    # Single-element unpacking also checks that exactly one entry came back.
    (entry,) = image_cache_metadata(normalized)

    assert "source_id" not in entry
    assert "source_file" not in entry
    assert entry["sha256"] == hashlib.sha256(PNG_BYTES).hexdigest()
    assert entry["mime_type"] == "image/png"
    assert entry["bytes"] == len(PNG_BYTES)
def test_cache_metadata_same_image_different_filename_is_identical():
    """Identical image bytes under different file names give equal cache metadata."""
    encoded = _b64(PNG_BYTES)
    first = normalize_image_inputs([{"base64": encoded, "source_file": "/a/x.png"}])
    second = normalize_image_inputs([{"base64": encoded, "source_file": "/b/y.png"}])

    first_meta = image_cache_metadata(first)
    second_meta = image_cache_metadata(second)
    assert first_meta == second_meta
def test_audit_metadata_includes_full_provenance_without_raw_base64():
    """Audit metadata carries source fields and digest but not the base64 payload."""
    encoded = _b64(PNG_BYTES)
    provenance_input = {
        "base64": encoded,
        "source_id": "img-001",
        "source_file": "/tmp/foo.png",
        "modality": "image",
        "doc_id": "doc-1",
    }
    normalized = normalize_image_inputs([provenance_input])

    # Single-element unpacking also checks that exactly one record came back.
    (record,) = image_audit_metadata(normalized)

    assert record["source_id"] == "img-001"
    assert record["source_file"] == "/tmp/foo.png"
    assert record["sha256"] == hashlib.sha256(PNG_BYTES).hexdigest()
    # The audit blob must never re-leak the raw base64 payload, either as a
    # key or anywhere inside its string representation.
    assert "base64" not in record
    assert encoded not in str(record)
|