| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334 |
"""
Centralized configuration constants for LightRAG.

This module defines default values for configuration constants used across
different parts of the LightRAG system. Centralizing these values ensures
consistency and makes maintenance easier.
"""
from typing import Literal, TypeAlias
# Default values for server settings.
# NOTE: ``DEFAULT_WOKERS`` is a misspelling of "workers". The name is kept for
# backward compatibility with any existing importers; ``DEFAULT_WORKERS`` is
# the correctly spelled alias and is bound to the same value.
DEFAULT_WOKERS = 2
DEFAULT_WORKERS = DEFAULT_WOKERS
DEFAULT_MAX_GRAPH_NODES = 1000
# Default values for extraction settings
DEFAULT_SUMMARY_LANGUAGE = "English"  # Default language for document processing
DEFAULT_MAX_GLEANING = 1
DEFAULT_ENTITY_NAME_MAX_LENGTH = 256

# Per-response output limits for entity extraction prompts
DEFAULT_MAX_EXTRACTION_RECORDS = 100
DEFAULT_MAX_EXTRACTION_ENTITIES = 40

# Number of description fragments that triggers an LLM summary
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
# Max description token size that triggers an LLM summary
DEFAULT_SUMMARY_MAX_TOKENS = 1200
# Recommended LLM summary output length, in tokens
DEFAULT_SUMMARY_LENGTH_RECOMMENDED = 600
# Maximum token size sent to the LLM for a summary
DEFAULT_SUMMARY_CONTEXT_SIZE = 12000
# Maximum token size allowed for the entity extraction input context
DEFAULT_MAX_EXTRACT_INPUT_TOKENS = 20480

# Separator for the description, source_id and relation-key fields.
# Cannot be changed once data has been inserted.
GRAPH_FIELD_SEP = "<SEP>"
# Query and retrieval configuration defaults
DEFAULT_TOP_K = 40
DEFAULT_CHUNK_TOP_K = 20
DEFAULT_MAX_ENTITY_TOKENS = 6000
DEFAULT_MAX_RELATION_TOKENS = 8000
DEFAULT_MAX_TOTAL_TOKENS = 30000
DEFAULT_COSINE_THRESHOLD = 0.2
DEFAULT_RELATED_CHUNK_NUMBER = 5
DEFAULT_KG_CHUNK_PICK_METHOD = "VECTOR"

# Rerank configuration defaults
DEFAULT_MIN_RERANK_SCORE = 0.0
# The literal string "null" (not None) is the default binding value.
DEFAULT_RERANK_BINDING = "null"
# Default limit on the source ids kept in metadata for each entity and relation
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300

# Method used to limit chunk_ids: KEEP or FIFO
#   FIFO: first in, first out
#   KEEP: keep oldest (fewer merge actions, and faster)
SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {
    SOURCE_IDS_LIMIT_METHOD_KEEP,
    SOURCE_IDS_LIMIT_METHOD_FIFO,
}
# Maximum number of file paths stored in the entity/relation file_path field
# (for display only; does not affect query performance)
DEFAULT_MAX_FILE_PATHS = 100

# Field length of file_path in the Milvus schema for entity and relation
# (should not be changed). file_path must store all file paths up to the
# DEFAULT_MAX_FILE_PATHS limit within the metadata.
DEFAULT_MAX_FILE_PATH_LENGTH = 32768

# Placeholder standing in for further file paths in entity/relation metadata
# (should not be changed)
DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated"

# Default temperature for LLM
DEFAULT_TEMPERATURE = 1.0

# Async configuration defaults
DEFAULT_MAX_ASYNC = 4  # Default maximum async operations
DEFAULT_MAX_PARALLEL_INSERT = 2  # Default maximum parallel insert operations
# Chunker defaults — i18n-aware so Chinese / mixed-language documents
# split correctly out of the box. Override per deployment via
# CHUNK_R_SEPARATORS / CHUNK_V_SENTENCE_SPLIT_REGEX env vars.
#
# DEFAULT_R_SEPARATORS: cascade tried by langchain RecursiveCharacterTextSplitter.
# Order matters — strongest boundary first: paragraph (\n\n) > line (\n) >
# Chinese sentence-end (。!?) > Chinese semi-clause (;,) > space > char.
# English sentence-ending punctuation (.?!) is intentionally NOT included
# because RecursiveCharacterTextSplitter does literal-string splitting, so
# "." would also split numerals (``0.95``) and abbreviations (``e.g.``).
# The English path falls through space / char as before.
#
# The CJK marks are written as escapes so they cannot be silently folded to
# their ASCII look-alikes (which would pull English punctuation into the
# cascade and contradict the rule above).
DEFAULT_R_SEPARATORS: tuple[str, ...] = (
    "\n\n",
    "\n",
    "\u3002",  # ideographic full stop
    "\uff01",  # fullwidth exclamation mark
    "\uff1f",  # fullwidth question mark
    "\uff1b",  # fullwidth semicolon
    "\uff0c",  # fullwidth comma
    " ",
    "",
)

# DEFAULT_SENTENCE_SPLIT_REGEX: pattern fed to langchain SemanticChunker.
# Two alternates so the English branch keeps its ``\s+`` requirement
# (avoiding ``0.95`` mid-token splits) while the Chinese branch matches a
# bare ideographic full stop / fullwidth question mark / fullwidth exclamation
# mark (CJK has no inter-sentence whitespace).
DEFAULT_SENTENCE_SPLIT_REGEX = (
    r"(?<=[.?!])\s+"  # English: sentence mark followed by whitespace
    r"|(?<=["
    "\u3002\uff1f\uff01"  # Chinese: bare fullwidth sentence marks
    r"])"
)

# DEFAULT_CHUNK_P_SIZE: paragraph-semantic chunker target size when
# CHUNK_P_SIZE env is unset. Deliberately larger than the global
# CHUNK_SIZE default — heading-aligned paragraph merging needs more
# headroom to keep semantically related paragraphs together; falling
# back to CHUNK_SIZE (1200) would force premature splits and defeat
# the strategy's purpose.
DEFAULT_CHUNK_P_SIZE = 2000
# LightRAG Document pipeline
FULL_DOCS_FORMAT_RAW = "raw"  # content in full_docs["content"]
FULL_DOCS_FORMAT_LIGHTRAG = "lightrag"  # content in LightRAG Document files
# File saved but not yet parsed; parse_native will read from disk
FULL_DOCS_FORMAT_PENDING_PARSE = "pending_parse"

# Marker prefix for full_docs.content when format=lightrag.
# Per docs/FileProcessingConfiguration-zh.md, the content is "{{LRdoc}}" + a
# leading summary of the parsed document so paginated APIs can show a real
# preview without loading the full LightRAG Document file.
LIGHTRAG_DOC_CONTENT_PREFIX = "{{LRdoc}}"
# Identifiers of the parser engines.
PARSER_ENGINE_LEGACY = "legacy"
PARSER_ENGINE_NATIVE = "native"
PARSER_ENGINE_MINERU = "mineru"
PARSER_ENGINE_DOCLING = "docling"

# Immutable set of every engine identifier declared above.
SUPPORTED_PARSER_ENGINES = frozenset(
    {
        PARSER_ENGINE_LEGACY,
        PARSER_ENGINE_NATIVE,
        PARSER_ENGINE_MINERU,
        PARSER_ENGINE_DOCLING,
    }
)

# Maps each engine identifier to the file suffixes listed for it.
# Suffixes are lowercase and carry no leading dot.
PARSER_ENGINE_SUFFIX_CAPABILITIES = {
    PARSER_ENGINE_LEGACY: frozenset(
        {
            # documents and markup
            "txt", "md", "mdx", "pdf", "docx", "pptx", "xlsx",
            "rtf", "odt", "tex", "epub", "html", "htm",
            # data and configuration
            "csv", "json", "xml", "yaml", "yml",
            "log", "conf", "ini", "properties",
            # scripts and source code
            "sql", "bat", "sh", "c", "h", "cpp", "hpp", "py",
            "java", "js", "ts", "swift", "go", "rb", "php",
            # stylesheets
            "css", "scss", "less",
        }
    ),
    PARSER_ENGINE_NATIVE: frozenset({"docx"}),
    PARSER_ENGINE_MINERU: frozenset(
        {
            # office documents
            "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx",
            # images
            "png", "jpg", "jpeg", "jp2", "webp", "gif", "bmp",
        }
    ),
    PARSER_ENGINE_DOCLING: frozenset(
        {
            # documents and markup
            "pdf", "docx", "pptx", "xlsx", "md", "html", "xhtml",
            # images
            "png", "jpg", "jpeg", "tiff", "webp", "bmp",
        }
    ),
}
PARSED_DIR_NAME = "__parsed__"  # Dir for parsed files (renamed from __enqueued__)

# Suffixes for parser artifact subdirectories under ``<input>/__parsed__/``.
# Centralising them here keeps the sidecar writer, engine cache modules and
# the delete-path whitelist in sync — new engines should add their raw-dir
# suffix to ``PARSED_ARTIFACT_DIR_SUFFIXES`` so deletion picks them up
# automatically.
PARSED_DIR_SUFFIX = ".parsed"  # spec sidecar layout (every engine)
MINERU_RAW_DIR_SUFFIX = ".mineru_raw"  # preserved MinerU raw bundle
DOCLING_RAW_DIR_SUFFIX = ".docling_raw"  # preserved Docling raw bundle
PARSED_ARTIFACT_DIR_SUFFIXES: tuple[str, ...] = (
    PARSED_DIR_SUFFIX,
    MINERU_RAW_DIR_SUFFIX,
    DOCLING_RAW_DIR_SUFFIX,
)
- # Per-file processing options carried by filename hints / LIGHTRAG_PARSER rules.
- # See docs/FileProcessingConfiguration-zh.md for the full specification.
- PROCESS_OPTION_IMAGES = "i" # Enable VLM analysis for drawings/images
- PROCESS_OPTION_TABLES = "t" # Enable VLM analysis for tables
- PROCESS_OPTION_EQUATIONS = "e" # Enable VLM analysis for equations
- PROCESS_OPTION_SKIP_KG = "!" # Skip entity/relation extraction (no KG build)
- ProcessChunkingOption: TypeAlias = Literal["F", "R", "V", "P"]
- PROCESS_OPTION_CHUNK_FIXED: ProcessChunkingOption = (
- "F" # Fixed-length / separator chunking (default)
- )
- PROCESS_OPTION_CHUNK_RECURSIVE: ProcessChunkingOption = (
- "R" # Recursive semantic chunking
- )
- PROCESS_OPTION_CHUNK_VECTOR: ProcessChunkingOption = (
- "V" # Vector-driven semantic chunking
- )
- PROCESS_OPTION_CHUNK_PARAGRAH: ProcessChunkingOption = (
- "P" # Paragrah-driven semantic chunking
- )
- PROCESS_OPTION_CHUNK_CHARS: frozenset[ProcessChunkingOption] = frozenset(
- {
- PROCESS_OPTION_CHUNK_FIXED,
- PROCESS_OPTION_CHUNK_RECURSIVE,
- PROCESS_OPTION_CHUNK_VECTOR,
- PROCESS_OPTION_CHUNK_PARAGRAH,
- }
- )
- SUPPORTED_PROCESS_OPTIONS = frozenset(
- {
- PROCESS_OPTION_IMAGES,
- PROCESS_OPTION_TABLES,
- PROCESS_OPTION_EQUATIONS,
- PROCESS_OPTION_SKIP_KG,
- PROCESS_OPTION_CHUNK_FIXED,
- PROCESS_OPTION_CHUNK_RECURSIVE,
- PROCESS_OPTION_CHUNK_VECTOR,
- PROCESS_OPTION_CHUNK_PARAGRAH,
- }
- )
DEFAULT_MAX_PARALLEL_ANALYZE = 5  # Multimodal analysis (VLM) concurrency

# Per-engine parsing concurrency defaults. mineru / docling default to 1
# because both engines are resource-intensive (GPU/CPU + memory) and tend to
# be more stable when run serially; users with capacity can opt into higher
# concurrency via MAX_PARALLEL_PARSE_* env vars.
DEFAULT_MAX_PARALLEL_PARSE_NATIVE = 5
DEFAULT_MAX_PARALLEL_PARSE_MINERU = 1
DEFAULT_MAX_PARALLEL_PARSE_DOCLING = 1

# Staged pipeline queue size defaults.
DEFAULT_QUEUE_SIZE_DEFAULT = 100
DEFAULT_QUEUE_SIZE_INSERT = 4

# Multimodal analysis / chunk thresholds
# Minimum token count retained when truncating a multimodal chunk's
# description to fit within DEFAULT_MAX_EXTRACT_INPUT_TOKENS. Falling below
# this floor leaves the description too thin to ground a useful entity
# description, so the pipeline raises instead of producing a stub.
DEFAULT_MM_CHUNK_DESCRIPTION_MIN_TOKENS = 100

# Minimum image side (width or height) in pixels accepted for VLM analysis.
# Anything smaller is treated as decorative (icons, separators, etc.) and
# written as status="skipped".
DEFAULT_MM_IMAGE_MIN_PIXEL = 32

# Priority used for all multimodal analysis LLM calls. Higher numbers run
# behind entity extraction (priority 10) so a busy ingestion queue still
# prefers KG-building work.
DEFAULT_MM_ANALYSIS_PRIORITY = 12
# Embedding configuration defaults
DEFAULT_EMBEDDING_FUNC_MAX_ASYNC = 8  # Default max async for embedding functions
DEFAULT_EMBEDDING_BATCH_NUM = 10  # Default batch size for embedding computations

# Gunicorn worker timeout
DEFAULT_TIMEOUT = 300

# Default LLM and embedding timeouts
DEFAULT_LLM_TIMEOUT = 180
DEFAULT_EMBEDDING_TIMEOUT = 30
# Rerank async / timeout defaults
# Concurrency falls back to base MAX_ASYNC when env unset; timeout has its own
# default since reranker calls are typically much faster than full LLM generation.
DEFAULT_RERANK_MAX_ASYNC = DEFAULT_MAX_ASYNC  # mirrors the base async default
DEFAULT_RERANK_TIMEOUT = 30
# Logging configuration defaults
DEFAULT_LOG_MAX_BYTES = 10 * 1024 * 1024  # Default 10MB (10485760 bytes)
DEFAULT_LOG_BACKUP_COUNT = 5  # Default 5 backups
DEFAULT_LOG_FILENAME = "lightrag.log"  # Default log filename

# Ollama server configuration defaults
DEFAULT_OLLAMA_MODEL_NAME = "lightrag"
DEFAULT_OLLAMA_MODEL_TAG = "latest"
DEFAULT_OLLAMA_MODEL_SIZE = 7365960935
DEFAULT_OLLAMA_CREATED_AT = "2024-01-15T00:00:00Z"
DEFAULT_OLLAMA_DIGEST = "sha256:lightrag"
|