"""
Centralized configuration constants for LightRAG.

This module defines default values for configuration constants used across
different parts of the LightRAG system. Centralizing these values ensures
consistency and makes maintenance easier.
"""

from typing import Literal, TypeAlias

# Default values for server settings
# NOTE: ``DEFAULT_WOKERS`` is a misspelling of "workers".  The original name is
# kept so existing imports keep working; new code should prefer the correctly
# spelled ``DEFAULT_WORKERS`` alias, which always carries the same value.
DEFAULT_WOKERS = 2
DEFAULT_WORKERS = DEFAULT_WOKERS
DEFAULT_MAX_GRAPH_NODES = 1000

# Extraction defaults
DEFAULT_SUMMARY_LANGUAGE = "English"  # language used for document processing
DEFAULT_MAX_GLEANING = 1
DEFAULT_ENTITY_NAME_MAX_LENGTH = 256

# Caps on the output of a single entity-extraction prompt response
DEFAULT_MAX_EXTRACTION_RECORDS = 100
DEFAULT_MAX_EXTRACTION_ENTITIES = 40

# Description summarisation thresholds
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8  # fragment count that triggers an LLM summary
DEFAULT_SUMMARY_MAX_TOKENS = 1_200  # description tokens that trigger an LLM summary
DEFAULT_SUMMARY_LENGTH_RECOMMENDED = 600  # recommended summary length, in tokens
DEFAULT_SUMMARY_CONTEXT_SIZE = 12_000  # max tokens sent to the LLM for a summary
DEFAULT_MAX_EXTRACT_INPUT_TOKENS = 20_480  # max entity-extraction input tokens

# Separator used inside the description, source_id and relation-key fields.
# It cannot be changed once data has been inserted.
GRAPH_FIELD_SEP = "<SEP>"

# Query / retrieval defaults
DEFAULT_TOP_K = 40
DEFAULT_CHUNK_TOP_K = 20
DEFAULT_MAX_ENTITY_TOKENS = 6_000
DEFAULT_MAX_RELATION_TOKENS = 8_000
DEFAULT_MAX_TOTAL_TOKENS = 30_000
DEFAULT_COSINE_THRESHOLD = 0.2
DEFAULT_RELATED_CHUNK_NUMBER = 5
DEFAULT_KG_CHUNK_PICK_METHOD = "VECTOR"

# Rerank defaults
DEFAULT_MIN_RERANK_SCORE = 0.0
DEFAULT_RERANK_BINDING = "null"  # the literal string "null", not None

# Default cap on source ids kept in entity / relation metadata
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
# Method used to limit chunk_ids: FIFO or KEEP
#   FIFO: first in, first out
#   KEEP: keep the oldest ids (fewer merge actions, faster)
SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {
    SOURCE_IDS_LIMIT_METHOD_FIFO,
    SOURCE_IDS_LIMIT_METHOD_KEEP,
}
# Maximum number of file paths stored in the entity/relation file_path field
# (for display only; does not affect query performance).
DEFAULT_MAX_FILE_PATHS = 100

# Length of the file_path field in the Milvus schema for entities and relations
# (should not be changed).  The field must hold all file paths, up to the
# DEFAULT_MAX_FILE_PATHS limit, within the metadata.
DEFAULT_MAX_FILE_PATH_LENGTH = 32_768
# Placeholder standing in for further file paths in entity/relation metadata
# (should not be changed).
DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated"

# LLM temperature default
DEFAULT_TEMPERATURE = 1.0

# Async defaults
# Maximum number of async operations.
DEFAULT_MAX_ASYNC = 4
# Maximum number of parallel insert operations.
DEFAULT_MAX_PARALLEL_INSERT = 2

# Chunker defaults.  They are i18n-aware so that Chinese and mixed-language
# documents split correctly out of the box.  Deployments can override them
# through the CHUNK_R_SEPARATORS / CHUNK_V_SENTENCE_SPLIT_REGEX env vars.
#
# DEFAULT_R_SEPARATORS is the separator cascade tried by langchain's
# RecursiveCharacterTextSplitter.  Order matters: the strongest boundary comes
# first.  English sentence-ending punctuation (.?!) is deliberately left out:
# RecursiveCharacterTextSplitter splits on literal strings, so "." would also
# cut numerals (``0.95``) and abbreviations (``e.g.``).  English text keeps
# falling through to the space / char separators, as before.
DEFAULT_R_SEPARATORS: tuple[str, ...] = (
    "\n\n",  # paragraph
    "\n",  # line
    "。",  # Chinese sentence end
    "！",  # Chinese sentence end
    "？",  # Chinese sentence end
    "；",  # Chinese semi-clause
    "，",  # Chinese semi-clause
    " ",  # space
    "",  # single character
)
# DEFAULT_SENTENCE_SPLIT_REGEX is the pattern handed to langchain's
# SemanticChunker.  It has two alternates, written below as adjacent raw
# literals that concatenate into one pattern.
DEFAULT_SENTENCE_SPLIT_REGEX = (
    # English branch: keeps the ``\s+`` requirement, avoiding ``0.95``
    # mid-token splits.
    r"(?<=[.?!])\s+"
    # Chinese branch: matches bare ``。？！`` because CJK has no inter-sentence
    # whitespace.
    r"|(?<=[。？！])"
)

# DEFAULT_CHUNK_P_SIZE is the target size of the paragraph-semantic chunker
# when the CHUNK_P_SIZE env var is unset.  It is deliberately larger than the
# global CHUNK_SIZE default: heading-aligned paragraph merging needs extra
# headroom to keep semantically related paragraphs together, and falling back
# to CHUNK_SIZE (1200) would force premature splits and defeat the purpose of
# the strategy.
DEFAULT_CHUNK_P_SIZE = 2_000

# LightRAG Document pipeline: full_docs format values
# Content is held in full_docs["content"].
FULL_DOCS_FORMAT_RAW = "raw"
# Content is held in LightRAG Document files.
FULL_DOCS_FORMAT_LIGHTRAG = "lightrag"
# The file has been saved but not parsed yet; parse_native will read it from disk.
FULL_DOCS_FORMAT_PENDING_PARSE = "pending_parse"
# Marker that prefixes full_docs.content when format=lightrag.
# According to docs/FileProcessingConfiguration-zh.md, the content is
# "{{LRdoc}}" followed by a leading summary of the parsed document, which lets
# paginated APIs show a real preview without loading the full LightRAG
# Document file.
LIGHTRAG_DOC_CONTENT_PREFIX = "{{LRdoc}}"
# Parser engine identifiers
PARSER_ENGINE_LEGACY = "legacy"
PARSER_ENGINE_NATIVE = "native"
PARSER_ENGINE_MINERU = "mineru"
PARSER_ENGINE_DOCLING = "docling"
# Every parser engine identifier defined above.
SUPPORTED_PARSER_ENGINES = frozenset(
    (
        PARSER_ENGINE_LEGACY,
        PARSER_ENGINE_NATIVE,
        PARSER_ENGINE_MINERU,
        PARSER_ENGINE_DOCLING,
    )
)
# File suffixes (lower-case, without the leading dot) listed per engine.
# Suffixes inside each set are kept in alphabetical order.
PARSER_ENGINE_SUFFIX_CAPABILITIES = {
    PARSER_ENGINE_LEGACY: frozenset(
        {
            "bat",
            "c",
            "conf",
            "cpp",
            "css",
            "csv",
            "docx",
            "epub",
            "go",
            "h",
            "hpp",
            "htm",
            "html",
            "ini",
            "java",
            "js",
            "json",
            "less",
            "log",
            "md",
            "mdx",
            "odt",
            "pdf",
            "php",
            "pptx",
            "properties",
            "py",
            "rb",
            "rtf",
            "scss",
            "sh",
            "sql",
            "swift",
            "tex",
            "ts",
            "txt",
            "xlsx",
            "xml",
            "yaml",
            "yml",
        }
    ),
    PARSER_ENGINE_NATIVE: frozenset({"docx"}),
    PARSER_ENGINE_MINERU: frozenset(
        {
            "bmp",
            "doc",
            "docx",
            "gif",
            "jp2",
            "jpeg",
            "jpg",
            "pdf",
            "png",
            "ppt",
            "pptx",
            "webp",
            "xls",
            "xlsx",
        }
    ),
    PARSER_ENGINE_DOCLING: frozenset(
        {
            "bmp",
            "docx",
            "html",
            "jpeg",
            "jpg",
            "md",
            "pdf",
            "png",
            "pptx",
            "tiff",
            "webp",
            "xhtml",
            "xlsx",
        }
    ),
}
# Directory holding parsed files (renamed from __enqueued__).
PARSED_DIR_NAME = "__parsed__"

# Suffixes of the parser artifact subdirectories under ``<input>/__parsed__/``.
# Keeping them in one place keeps the sidecar writer, the engine cache modules
# and the delete-path whitelist in sync.  A new engine should add its raw-dir
# suffix to ``PARSED_ARTIFACT_DIR_SUFFIXES`` so that deletion picks it up
# automatically.
# Spec sidecar layout (every engine).
PARSED_DIR_SUFFIX = ".parsed"
# Preserved MinerU raw bundle.
MINERU_RAW_DIR_SUFFIX = ".mineru_raw"
# Preserved Docling raw bundle.
DOCLING_RAW_DIR_SUFFIX = ".docling_raw"
PARSED_ARTIFACT_DIR_SUFFIXES: tuple[str, ...] = (
    PARSED_DIR_SUFFIX,
    MINERU_RAW_DIR_SUFFIX,
    DOCLING_RAW_DIR_SUFFIX,
)

# Per-file processing options carried by filename hints / LIGHTRAG_PARSER rules.
# See docs/FileProcessingConfiguration-zh.md for the full specification.
PROCESS_OPTION_IMAGES = "i"  # Enable VLM analysis for drawings/images
PROCESS_OPTION_TABLES = "t"  # Enable VLM analysis for tables
PROCESS_OPTION_EQUATIONS = "e"  # Enable VLM analysis for equations
PROCESS_OPTION_SKIP_KG = "!"  # Skip entity/relation extraction (no KG build)
ProcessChunkingOption: TypeAlias = Literal["F", "R", "V", "P"]
PROCESS_OPTION_CHUNK_FIXED: ProcessChunkingOption = (
    "F"  # Fixed-length / separator chunking (default)
)
PROCESS_OPTION_CHUNK_RECURSIVE: ProcessChunkingOption = (
    "R"  # Recursive semantic chunking
)
PROCESS_OPTION_CHUNK_VECTOR: ProcessChunkingOption = (
    "V"  # Vector-driven semantic chunking
)
PROCESS_OPTION_CHUNK_PARAGRAH: ProcessChunkingOption = (
    "P"  # Paragrah-driven semantic chunking
)

PROCESS_OPTION_CHUNK_CHARS: frozenset[ProcessChunkingOption] = frozenset(
    {
        PROCESS_OPTION_CHUNK_FIXED,
        PROCESS_OPTION_CHUNK_RECURSIVE,
        PROCESS_OPTION_CHUNK_VECTOR,
        PROCESS_OPTION_CHUNK_PARAGRAH,
    }
)
SUPPORTED_PROCESS_OPTIONS = frozenset(
    {
        PROCESS_OPTION_IMAGES,
        PROCESS_OPTION_TABLES,
        PROCESS_OPTION_EQUATIONS,
        PROCESS_OPTION_SKIP_KG,
        PROCESS_OPTION_CHUNK_FIXED,
        PROCESS_OPTION_CHUNK_RECURSIVE,
        PROCESS_OPTION_CHUNK_VECTOR,
        PROCESS_OPTION_CHUNK_PARAGRAH,
    }
)

# Concurrency of multimodal analysis (VLM).
DEFAULT_MAX_PARALLEL_ANALYZE = 5

# Parsing concurrency defaults, one per engine.  mineru and docling default to
# 1: both engines are resource-intensive (GPU/CPU + memory) and tend to be more
# stable when run serially.  Users with spare capacity can opt into higher
# concurrency through the MAX_PARALLEL_PARSE_* env vars.
DEFAULT_MAX_PARALLEL_PARSE_NATIVE = 5
DEFAULT_MAX_PARALLEL_PARSE_MINERU = 1
DEFAULT_MAX_PARALLEL_PARSE_DOCLING = 1

# Queue size defaults for the staged pipeline
DEFAULT_QUEUE_SIZE_DEFAULT = 100
DEFAULT_QUEUE_SIZE_INSERT = 4

# Multimodal analysis / chunk thresholds
#
# Floor, in tokens, kept when a multimodal chunk's description is truncated to
# fit within DEFAULT_MAX_EXTRACT_INPUT_TOKENS.  Below this floor the
# description is too thin to ground a useful entity description, so the
# pipeline raises rather than producing a stub.
DEFAULT_MM_CHUNK_DESCRIPTION_MIN_TOKENS = 100
# Smallest image side (width or height), in pixels, accepted for VLM analysis.
# Smaller images are treated as decorative (icons, separators, etc.) and are
# written with status="skipped".
DEFAULT_MM_IMAGE_MIN_PIXEL = 32
# Priority applied to every multimodal analysis LLM call.  Higher numbers run
# behind entity extraction (priority 10), so a busy ingestion queue still
# prefers KG-building work.
DEFAULT_MM_ANALYSIS_PRIORITY = 12

# Embedding defaults
# Maximum async operations for embedding functions.
DEFAULT_EMBEDDING_FUNC_MAX_ASYNC = 8
# Batch size for embedding computations.
DEFAULT_EMBEDDING_BATCH_NUM = 10

# Timeout of a Gunicorn worker
DEFAULT_TIMEOUT = 300

# Timeouts for LLM and embedding calls
DEFAULT_LLM_TIMEOUT = 180
DEFAULT_EMBEDDING_TIMEOUT = 30

# Rerank concurrency / timeout defaults.
# When the env is unset, concurrency falls back to the base MAX_ASYNC, so the
# default here is simply DEFAULT_MAX_ASYNC.  The timeout has a default of its
# own because reranker calls are typically much faster than full LLM
# generation.
DEFAULT_RERANK_MAX_ASYNC: int = DEFAULT_MAX_ASYNC
DEFAULT_RERANK_TIMEOUT: int = 30

# Logging defaults
DEFAULT_LOG_MAX_BYTES = 10 * 1024 * 1024  # 10MB
DEFAULT_LOG_BACKUP_COUNT = 5  # number of backups
DEFAULT_LOG_FILENAME = "lightrag.log"  # log filename

# Ollama server defaults
DEFAULT_OLLAMA_MODEL_NAME = "lightrag"
DEFAULT_OLLAMA_MODEL_TAG = "latest"
DEFAULT_OLLAMA_MODEL_SIZE = 7_365_960_935
DEFAULT_OLLAMA_CREATED_AT = "2024-01-15T00:00:00Z"
DEFAULT_OLLAMA_DIGEST = "sha256:lightrag"