constants.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. """
  2. Centralized configuration constants for LightRAG.
  3. This module defines default values for configuration constants used across
  4. different parts of the LightRAG system. Centralizing these values ensures
  5. consistency and makes maintenance easier.
  6. """
  7. from typing import Literal, TypeAlias
  8. # Default values for server settings
  9. DEFAULT_WOKERS = 2
  10. DEFAULT_MAX_GRAPH_NODES = 1000
  11. # Default values for extraction settings
  12. DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
  13. DEFAULT_MAX_GLEANING = 1
  14. DEFAULT_ENTITY_NAME_MAX_LENGTH = 256
  15. # Per-response output limits for entity extraction prompts
  16. DEFAULT_MAX_EXTRACTION_RECORDS = 100
  17. DEFAULT_MAX_EXTRACTION_ENTITIES = 40
  18. # Number of description fragments to trigger LLM summary
  19. DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
  20. # Max description token size to trigger LLM summary
  21. DEFAULT_SUMMARY_MAX_TOKENS = 1200
  22. # Recommended LLM summary output length in tokens
  23. DEFAULT_SUMMARY_LENGTH_RECOMMENDED = 600
  24. # Maximum token size sent to LLM for summary
  25. DEFAULT_SUMMARY_CONTEXT_SIZE = 12000
  26. # Maximum token size allowed for entity extraction input context
  27. DEFAULT_MAX_EXTRACT_INPUT_TOKENS = 20480
  28. # Separator for: description, source_id and relation-key fields(Can not be changed after data inserted)
  29. GRAPH_FIELD_SEP = "<SEP>"
  30. # Query and retrieval configuration defaults
  31. DEFAULT_TOP_K = 40
  32. DEFAULT_CHUNK_TOP_K = 20
  33. DEFAULT_MAX_ENTITY_TOKENS = 6000
  34. DEFAULT_MAX_RELATION_TOKENS = 8000
  35. DEFAULT_MAX_TOTAL_TOKENS = 30000
  36. DEFAULT_COSINE_THRESHOLD = 0.2
  37. DEFAULT_RELATED_CHUNK_NUMBER = 5
  38. DEFAULT_KG_CHUNK_PICK_METHOD = "VECTOR"
  39. # Rerank configuration defaults
  40. DEFAULT_MIN_RERANK_SCORE = 0.0
  41. DEFAULT_RERANK_BINDING = "null"
  42. # Default source ids limit in meta data for entity and relation
  43. DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
  44. DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
  45. ### control chunk_ids limitation method: FIFO, FIFO
  46. ### FIFO: First in first out
  47. ### KEEP: Keep oldest (less merge action and faster)
  48. SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
  49. SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
  50. DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
  51. VALID_SOURCE_IDS_LIMIT_METHODS = {
  52. SOURCE_IDS_LIMIT_METHOD_KEEP,
  53. SOURCE_IDS_LIMIT_METHOD_FIFO,
  54. }
  55. # Maximum number of file paths stored in entity/relation file_path field (For displayed only, does not affect query performance)
  56. DEFAULT_MAX_FILE_PATHS = 100
  57. # Field length of file_path in Milvus Schema for entity and relation (Should not be changed)
  58. # file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata.
  59. DEFAULT_MAX_FILE_PATH_LENGTH = 32768
  60. # Placeholder for more file paths in meta data for entity and relation (Should not be changed)
  61. DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated"
  62. # Default temperature for LLM
  63. DEFAULT_TEMPERATURE = 1.0
  64. # Async configuration defaults
  65. DEFAULT_MAX_ASYNC = 4 # Default maximum async operations
  66. DEFAULT_MAX_PARALLEL_INSERT = 2 # Default maximum parallel insert operations
  67. # Chunker defaults — i18n-aware so Chinese / mixed-language documents
  68. # split correctly out of the box. Override per deployment via
  69. # CHUNK_R_SEPARATORS / CHUNK_V_SENTENCE_SPLIT_REGEX env vars.
  70. #
  71. # DEFAULT_R_SEPARATORS: cascade tried by langchain RecursiveCharacterTextSplitter.
  72. # Order matters — strongest boundary first: paragraph (\n\n) > line (\n) >
  73. # Chinese sentence-end (。!?) > Chinese semi-clause (;,) > space > char.
  74. # English sentence-ending punctuation (.?!) is intentionally NOT included
  75. # because RecursiveCharacterTextSplitter does literal-string splitting, so
  76. # "." would also split numerals (``0.95``) and abbreviations (``e.g.``).
  77. # The English path falls through space / char as before.
  78. DEFAULT_R_SEPARATORS: tuple[str, ...] = (
  79. "\n\n",
  80. "\n",
  81. "。",
  82. "!",
  83. "?",
  84. ";",
  85. ",",
  86. " ",
  87. "",
  88. )
  89. # DEFAULT_SENTENCE_SPLIT_REGEX: pattern fed to langchain SemanticChunker.
  90. # Two alternates so the English branch keeps its ``\s+`` requirement
  91. # (avoiding ``0.95`` mid-token splits) while the Chinese branch matches
  92. # bare ``。?!`` (CJK has no inter-sentence whitespace).
  93. DEFAULT_SENTENCE_SPLIT_REGEX = r"(?<=[.?!])\s+|(?<=[。?!])"
  94. # DEFAULT_CHUNK_P_SIZE: paragraph-semantic chunker target size when
  95. # CHUNK_P_SIZE env is unset. Deliberately larger than the global
  96. # CHUNK_SIZE default — heading-aligned paragraph merging needs more
  97. # headroom to keep semantically related paragraphs together; falling
  98. # back to CHUNK_SIZE (1200) would force premature splits and defeat
  99. # the strategy's purpose.
  100. DEFAULT_CHUNK_P_SIZE = 2000
  101. # LightRAG Document pipeline
  102. FULL_DOCS_FORMAT_RAW = "raw" # content in full_docs["content"]
  103. FULL_DOCS_FORMAT_LIGHTRAG = "lightrag" # content in LightRAG Document files
  104. FULL_DOCS_FORMAT_PENDING_PARSE = (
  105. "pending_parse" # file saved but not yet parsed; parse_native will read from disk
  106. )
  107. # Marker prefix for full_docs.content when format=lightrag.
  108. # Per docs/FileProcessingConfiguration-zh.md, the content is "{{LRdoc}}" + a
  109. # leading summary of the parsed document so paginated APIs can show a real
  110. # preview without loading the full LightRAG Document file.
  111. LIGHTRAG_DOC_CONTENT_PREFIX = "{{LRdoc}}"
  112. PARSER_ENGINE_LEGACY = "legacy"
  113. PARSER_ENGINE_NATIVE = "native"
  114. PARSER_ENGINE_MINERU = "mineru"
  115. PARSER_ENGINE_DOCLING = "docling"
  116. SUPPORTED_PARSER_ENGINES = frozenset(
  117. {
  118. PARSER_ENGINE_LEGACY,
  119. PARSER_ENGINE_NATIVE,
  120. PARSER_ENGINE_MINERU,
  121. PARSER_ENGINE_DOCLING,
  122. }
  123. )
  124. PARSER_ENGINE_SUFFIX_CAPABILITIES = {
  125. PARSER_ENGINE_LEGACY: frozenset(
  126. {
  127. "txt",
  128. "md",
  129. "mdx",
  130. "pdf",
  131. "docx",
  132. "pptx",
  133. "xlsx",
  134. "rtf",
  135. "odt",
  136. "tex",
  137. "epub",
  138. "html",
  139. "htm",
  140. "csv",
  141. "json",
  142. "xml",
  143. "yaml",
  144. "yml",
  145. "log",
  146. "conf",
  147. "ini",
  148. "properties",
  149. "sql",
  150. "bat",
  151. "sh",
  152. "c",
  153. "h",
  154. "cpp",
  155. "hpp",
  156. "py",
  157. "java",
  158. "js",
  159. "ts",
  160. "swift",
  161. "go",
  162. "rb",
  163. "php",
  164. "css",
  165. "scss",
  166. "less",
  167. }
  168. ),
  169. PARSER_ENGINE_NATIVE: frozenset({"docx"}),
  170. PARSER_ENGINE_MINERU: frozenset(
  171. {
  172. "pdf",
  173. "doc",
  174. "docx",
  175. "ppt",
  176. "pptx",
  177. "xls",
  178. "xlsx",
  179. "png",
  180. "jpg",
  181. "jpeg",
  182. "jp2",
  183. "webp",
  184. "gif",
  185. "bmp",
  186. }
  187. ),
  188. PARSER_ENGINE_DOCLING: frozenset(
  189. {
  190. "pdf",
  191. "docx",
  192. "pptx",
  193. "xlsx",
  194. "md",
  195. "html",
  196. "xhtml",
  197. "png",
  198. "jpg",
  199. "jpeg",
  200. "tiff",
  201. "webp",
  202. "bmp",
  203. }
  204. ),
  205. }
  206. PARSED_DIR_NAME = "__parsed__" # Dir for parsed files (renamed from __enqueued__)
  207. # Suffixes for parser artifact subdirectories under ``<input>/__parsed__/``.
  208. # Centralising them here keeps the sidecar writer, engine cache modules and
  209. # the delete-path whitelist in sync — new engines should add their raw-dir
  210. # suffix to ``PARSED_ARTIFACT_DIR_SUFFIXES`` so deletion picks them up
  211. # automatically.
  212. PARSED_DIR_SUFFIX = ".parsed" # spec sidecar layout (every engine)
  213. MINERU_RAW_DIR_SUFFIX = ".mineru_raw" # preserved MinerU raw bundle
  214. DOCLING_RAW_DIR_SUFFIX = ".docling_raw" # preserved Docling raw bundle
  215. PARSED_ARTIFACT_DIR_SUFFIXES: tuple[str, ...] = (
  216. PARSED_DIR_SUFFIX,
  217. MINERU_RAW_DIR_SUFFIX,
  218. DOCLING_RAW_DIR_SUFFIX,
  219. )
  220. # Per-file processing options carried by filename hints / LIGHTRAG_PARSER rules.
  221. # See docs/FileProcessingConfiguration-zh.md for the full specification.
  222. PROCESS_OPTION_IMAGES = "i" # Enable VLM analysis for drawings/images
  223. PROCESS_OPTION_TABLES = "t" # Enable VLM analysis for tables
  224. PROCESS_OPTION_EQUATIONS = "e" # Enable VLM analysis for equations
  225. PROCESS_OPTION_SKIP_KG = "!" # Skip entity/relation extraction (no KG build)
  226. ProcessChunkingOption: TypeAlias = Literal["F", "R", "V", "P"]
  227. PROCESS_OPTION_CHUNK_FIXED: ProcessChunkingOption = (
  228. "F" # Fixed-length / separator chunking (default)
  229. )
  230. PROCESS_OPTION_CHUNK_RECURSIVE: ProcessChunkingOption = (
  231. "R" # Recursive semantic chunking
  232. )
  233. PROCESS_OPTION_CHUNK_VECTOR: ProcessChunkingOption = (
  234. "V" # Vector-driven semantic chunking
  235. )
  236. PROCESS_OPTION_CHUNK_PARAGRAH: ProcessChunkingOption = (
  237. "P" # Paragrah-driven semantic chunking
  238. )
  239. PROCESS_OPTION_CHUNK_CHARS: frozenset[ProcessChunkingOption] = frozenset(
  240. {
  241. PROCESS_OPTION_CHUNK_FIXED,
  242. PROCESS_OPTION_CHUNK_RECURSIVE,
  243. PROCESS_OPTION_CHUNK_VECTOR,
  244. PROCESS_OPTION_CHUNK_PARAGRAH,
  245. }
  246. )
  247. SUPPORTED_PROCESS_OPTIONS = frozenset(
  248. {
  249. PROCESS_OPTION_IMAGES,
  250. PROCESS_OPTION_TABLES,
  251. PROCESS_OPTION_EQUATIONS,
  252. PROCESS_OPTION_SKIP_KG,
  253. PROCESS_OPTION_CHUNK_FIXED,
  254. PROCESS_OPTION_CHUNK_RECURSIVE,
  255. PROCESS_OPTION_CHUNK_VECTOR,
  256. PROCESS_OPTION_CHUNK_PARAGRAH,
  257. }
  258. )
  259. DEFAULT_MAX_PARALLEL_ANALYZE = 5 # Multimodal analysis (VLM) concurrency
  260. # Per-engine parsing concurrency defaults. mineru / docling default to 1
  261. # because both engines are resource-intensive (GPU/CPU + memory) and tend to
  262. # be more stable when run serially; users with capacity can opt into higher
  263. # concurrency via MAX_PARALLEL_PARSE_* env vars.
  264. DEFAULT_MAX_PARALLEL_PARSE_NATIVE = 5
  265. DEFAULT_MAX_PARALLEL_PARSE_MINERU = 1
  266. DEFAULT_MAX_PARALLEL_PARSE_DOCLING = 1
  267. # Staged pipeline queue size defaults.
  268. DEFAULT_QUEUE_SIZE_DEFAULT = 100
  269. DEFAULT_QUEUE_SIZE_INSERT = 4
  270. # Multimodal analysis / chunk thresholds
  271. # Minimum token count retained when truncating a multimodal chunk's
  272. # description to fit within DEFAULT_MAX_EXTRACT_INPUT_TOKENS. Falling below
  273. # this floor leaves the description too thin to ground a useful entity
  274. # description, so the pipeline raises instead of producing a stub.
  275. DEFAULT_MM_CHUNK_DESCRIPTION_MIN_TOKENS = 100
  276. # Minimum image side (width or height) in pixels accepted for VLM analysis.
  277. # Anything smaller is treated as decorative (icons, separators, etc.) and
  278. # written as status="skipped".
  279. DEFAULT_MM_IMAGE_MIN_PIXEL = 32
  280. # Priority used for all multimodal analysis LLM calls. Higher numbers run
  281. # behind entity extraction (priority 10) so a busy ingestion queue still
  282. # prefers KG-building work.
  283. DEFAULT_MM_ANALYSIS_PRIORITY = 12
  284. # Embedding configuration defaults
  285. DEFAULT_EMBEDDING_FUNC_MAX_ASYNC = 8 # Default max async for embedding functions
  286. DEFAULT_EMBEDDING_BATCH_NUM = 10 # Default batch size for embedding computations
  287. # Gunicorn worker timeout
  288. DEFAULT_TIMEOUT = 300
  289. # Default llm and embedding timeout
  290. DEFAULT_LLM_TIMEOUT = 180
  291. DEFAULT_EMBEDDING_TIMEOUT = 30
  292. # Rerank async / timeout defaults
  293. # Concurrency falls back to base MAX_ASYNC when env unset; timeout has its own
  294. # default since reranker calls are typically much faster than full LLM generation.
  295. DEFAULT_RERANK_MAX_ASYNC = DEFAULT_MAX_ASYNC
  296. DEFAULT_RERANK_TIMEOUT = 30
  297. # Logging configuration defaults
  298. DEFAULT_LOG_MAX_BYTES = 10485760 # Default 10MB
  299. DEFAULT_LOG_BACKUP_COUNT = 5 # Default 5 backups
  300. DEFAULT_LOG_FILENAME = "lightrag.log" # Default log filename
  301. # Ollama server configuration defaults
  302. DEFAULT_OLLAMA_MODEL_NAME = "lightrag"
  303. DEFAULT_OLLAMA_MODEL_TAG = "latest"
  304. DEFAULT_OLLAMA_MODEL_SIZE = 7365960935
  305. DEFAULT_OLLAMA_CREATED_AT = "2024-01-15T00:00:00Z"
  306. DEFAULT_OLLAMA_DIGEST = "sha256:lightrag"