ollama_api.py 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738
  1. from fastapi import APIRouter, HTTPException, Request
  2. from pydantic import BaseModel
  3. from typing import List, Dict, Any, Optional, Type
  4. from lightrag.utils import logger
  5. import time
  6. import json
  7. import re
  8. from enum import Enum
  9. from fastapi.responses import StreamingResponse
  10. import asyncio
  11. from lightrag import LightRAG, QueryParam
  12. from lightrag.utils import TiktokenTokenizer
  13. from lightrag.api.utils_api import get_combined_auth_dependency
  14. from fastapi import Depends
  15. # Query mode selected by the query prefix (bypass is not a LightRAG query mode)
  class SearchMode(str, Enum):
      """Search modes that can be selected through a prefix on the query text.

      Members also subclass ``str``, so each one compares equal to its plain
      string value (for example ``SearchMode.mix == "mix"``).
      """

      naive = "naive"
      local = "local"
      # Trailing underscore because ``global`` is a reserved word in Python;
      # the value is still the plain string "global".
      global_ = "global"
      hybrid = "hybrid"
      mix = "mix"
      bypass = "bypass"
      context = "context"
  class OllamaMessage(BaseModel):
      """One message of a chat conversation: who said it and what was said."""

      # Speaker of the message; the /chat route requires the last one to be "user".
      role: str
      # Text of the message.
      content: str
      # Optional list of image strings attached to the message.
      images: Optional[List[str]] = None
  class OllamaChatRequest(BaseModel):
      """Request body accepted by the ``/chat`` route."""

      # Model name sent by the client.
      model: str
      # Conversation so far; the final entry is used as the query.
      messages: List[OllamaMessage]
      # Chat responses are streamed unless the client opts out.
      stream: bool = True
      # Free-form option mapping supplied by the client.
      options: Optional[Dict[str, Any]] = None
      # Optional system prompt.
      system: Optional[str] = None
  class OllamaChatResponse(BaseModel):
      """Shape of a chat reply: model name, timestamp, message and done flag."""

      model: str
      created_at: str
      # The assistant message carried by this reply.
      message: OllamaMessage
      # True once the reply is complete.
      done: bool
  class OllamaGenerateRequest(BaseModel):
      """Request body accepted by the ``/generate`` route."""

      # Model name sent by the client.
      model: str
      # Text forwarded to the LLM as the query.
      prompt: str
      # Optional system prompt.
      system: Optional[str] = None
      # Generate responses are returned in one piece unless streaming is requested.
      stream: bool = False
      # Free-form option mapping supplied by the client.
      options: Optional[Dict[str, Any]] = None
  class OllamaGenerateResponse(BaseModel):
      """Shape of a ``/generate`` reply, including optional timing statistics.

      Every ``Optional`` field carries an explicit ``None`` default. Without
      it, pydantic v2 treats an ``Optional[X]`` annotation as "required but
      nullable", so intermediate (not yet ``done``) replies that omit the
      statistics could not be built from this model.
      """

      model: str
      created_at: str
      # Generated text (or text fragment when streaming).
      response: str
      # True once generation is complete.
      done: bool
      context: Optional[List[int]] = None
      # Durations are integers; the /generate route fills them from time.time_ns().
      total_duration: Optional[int] = None
      load_duration: Optional[int] = None
      prompt_eval_count: Optional[int] = None
      prompt_eval_duration: Optional[int] = None
      eval_count: Optional[int] = None
      eval_duration: Optional[int] = None
  class OllamaVersionResponse(BaseModel):
      """Reply of the ``/version`` route."""

      # Version string reported to clients.
      version: str
  class OllamaModelDetails(BaseModel):
      """Descriptive metadata nested inside each ``OllamaModel`` entry."""

      parent_model: str
      format: str
      family: str
      families: List[str]
      parameter_size: str
      quantization_level: str
  class OllamaModel(BaseModel):
      """One available model as listed by the ``/tags`` route."""

      name: str
      model: str
      size: int
      digest: str
      modified_at: str
      # Nested descriptive metadata for the model.
      details: OllamaModelDetails
  class OllamaTagResponse(BaseModel):
      """Reply of the ``/tags`` route: the list of available models."""

      models: List[OllamaModel]
  class OllamaRunningModelDetails(BaseModel):
      """Descriptive metadata nested inside each ``OllamaRunningModel`` entry."""

      parent_model: str
      format: str
      family: str
      families: List[str]
      parameter_size: str
      quantization_level: str
  class OllamaRunningModel(BaseModel):
      """One running model as listed by the ``/ps`` route."""

      name: str
      model: str
      size: int
      digest: str
      # Nested descriptive metadata for the model.
      details: OllamaRunningModelDetails
      # Expiry timestamp, carried as a string.
      expires_at: str
      size_vram: int
  class OllamaPsResponse(BaseModel):
      """Reply of the ``/ps`` route: the list of running models."""

      models: List[OllamaRunningModel]
  async def parse_request_body(
      request: Request, model_class: Type[BaseModel]
  ) -> BaseModel:
      """Parse the request body as JSON and build ``model_class`` from it.

      The body is treated as JSON whatever the Content-Type header says, so
      clients that send ``application/octet-stream`` (or no header at all) are
      accepted alongside ``application/json``.

      Args:
          request: The FastAPI Request object
          model_class: The Pydantic model class to parse the request into

      Returns:
          An instance of the provided model_class

      Raises:
          HTTPException: status 400 when the body is not valid JSON, is not a
              JSON object, or cannot be turned into ``model_class``.
      """
      content_type = request.headers.get("content-type", "").lower()

      try:
          if content_type.startswith("application/json"):
              body = await request.json()
          else:
              # application/octet-stream and every other content type:
              # decode the raw bytes as UTF-8 and parse them as JSON.
              body_bytes = await request.body()
              body = json.loads(body_bytes.decode("utf-8"))

          # ``model_class(**body)`` needs a mapping; fail with a clear message
          # instead of the interpreter's "argument after ** must be a mapping".
          if not isinstance(body, dict):
              raise TypeError("request body must be a JSON object")

          return model_class(**body)
      except json.JSONDecodeError as exc:
          raise HTTPException(
              status_code=400, detail="Invalid JSON in request body"
          ) from exc
      except Exception as exc:
          raise HTTPException(
              status_code=400, detail=f"Error parsing request body: {str(exc)}"
          ) from exc
  def estimate_tokens(text: str) -> int:
      """Return how many tokens a fresh ``TiktokenTokenizer`` produces for *text*."""
      return len(TiktokenTokenizer().encode(text))
  def parse_query_mode(query: str) -> tuple[str, SearchMode, bool, Optional[str]]:
      """Parse query prefix to determine search mode

      Returns tuple of (cleaned_query, search_mode, only_need_context, user_prompt)

      A query without a recognised prefix is returned unchanged with
      ``SearchMode.mix``.

      Examples:
      - "/local[use mermaid format for diagrams] query string" -> (cleaned_query, SearchMode.local, False, "use mermaid format for diagrams")
      - "/[use mermaid format for diagrams] query string" -> (cleaned_query, SearchMode.mix, False, "use mermaid format for diagrams")
      - "/local query string" -> (cleaned_query, SearchMode.local, False, None)
      """
      user_prompt = None

      # Optional "/mode[user prompt] rest" form. DOTALL is required so that a
      # multi-line query is captured in full: without it "." stops at the first
      # newline and the rest of the query is silently discarded.
      bracket_match = re.match(r"^/([a-z]*)\[(.*?)\](.*)", query, re.DOTALL)
      if bracket_match:
          mode_prefix, user_prompt, remaining_query = bracket_match.groups()
          remaining_query = remaining_query.strip()
          if not mode_prefix:
              # "/[prompt] query": no mode was given, so use the default mode
              # and do not leave a stray "/" in front of the query.
              return remaining_query, SearchMode.mix, False, user_prompt
          # Rebuild the query without the bracket part. The separating space is
          # kept even when nothing follows, so "/local[prompt]" still matches
          # the "/local " prefix below.
          query = f"/{mode_prefix} {remaining_query}"

      # Prefix -> (search mode, only_need_context)
      mode_map = {
          "/local ": (SearchMode.local, False),
          # global_ is used because 'global' is a Python keyword
          "/global ": (SearchMode.global_, False),
          "/naive ": (SearchMode.naive, False),
          "/hybrid ": (SearchMode.hybrid, False),
          "/mix ": (SearchMode.mix, False),
          "/bypass ": (SearchMode.bypass, False),
          "/context": (SearchMode.mix, True),
          "/localcontext": (SearchMode.local, True),
          "/globalcontext": (SearchMode.global_, True),
          "/hybridcontext": (SearchMode.hybrid, True),
          "/naivecontext": (SearchMode.naive, True),
          "/mixcontext": (SearchMode.mix, True),
      }

      for prefix, (mode, only_need_context) in mode_map.items():
          if query.startswith(prefix):
              # Drop the prefix and any whitespace that follows it
              cleaned_query = query[len(prefix) :].lstrip()
              return cleaned_query, mode, only_need_context, user_prompt

      # Unknown or missing prefix: default mode. Only a query rebuilt from the
      # bracket form can carry the artificial trailing space added above.
      fallback_query = query.rstrip() if bracket_match else query
      return fallback_query, SearchMode.mix, False, user_prompt
  class OllamaAPI:
      """Expose a LightRAG instance through Ollama-style HTTP routes.

      Constructing the object registers ``/version``, ``/tags``, ``/ps``,
      ``/generate`` and ``/chat`` on ``self.router``.
      """

      def __init__(self, rag: LightRAG, top_k: int = 60, api_key: Optional[str] = None):
          """Store the collaborators and register the routes.

          Args:
              rag: LightRAG instance that answers queries; its
                  ``ollama_server_infos`` supplies the advertised model metadata.
              top_k: Value passed as ``top_k`` in the ``QueryParam`` of /chat queries.
              api_key: Passed to ``get_combined_auth_dependency`` to build the
                  auth dependency attached to every route.
          """
          self.rag = rag
          self.ollama_server_infos = rag.ollama_server_infos
          self.top_k = top_k
          self.api_key = api_key
          self.router = APIRouter(tags=["ollama"])
          self.setup_routes()

      # ------------------------------------------------------------------
      # Payload helpers shared by /generate and /chat
      # ------------------------------------------------------------------

      def _model_stamp(self) -> Dict[str, Any]:
          """Return the ``model``/``created_at`` pair that opens every payload."""
          return {
              "model": self.ollama_server_infos.LIGHTRAG_MODEL,
              "created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
          }

      @staticmethod
      def _generate_content(text: str) -> Dict[str, Any]:
          """Wrap *text* the way /generate payloads carry it."""
          return {"response": text}

      @staticmethod
      def _chat_content(text: str) -> Dict[str, Any]:
          """Wrap *text* the way /chat payloads carry it."""
          return {
              "message": {
                  "role": "assistant",
                  "content": text,
                  "images": None,
              }
          }

      @staticmethod
      def _generate_done_fields() -> Dict[str, Any]:
          """Return the completion markers of a successful /generate reply."""
          return {"done": True, "done_reason": "stop", "context": []}

      @staticmethod
      def _chat_done_fields() -> Dict[str, Any]:
          """Return the completion markers of a successful /chat reply."""
          return {"done_reason": "stop", "done": True}

      @staticmethod
      def _timing_stats(
          *,
          start_time: int,
          first_chunk_time: int,
          last_chunk_time: int,
          prompt_tokens: int,
          completion_text: str,
      ) -> Dict[str, Any]:
          """Build the duration/token-count fields of a final payload.

          All times are ``time.time_ns()`` readings taken by the caller.
          """
          return {
              "total_duration": last_chunk_time - start_time,
              "load_duration": 0,
              "prompt_eval_count": prompt_tokens,
              "prompt_eval_duration": first_chunk_time - start_time,
              "eval_count": estimate_tokens(completion_text),
              "eval_duration": last_chunk_time - first_chunk_time,
          }

      @staticmethod
      def _to_ndjson(payload: Dict[str, Any]) -> str:
          """Serialize *payload* as one newline-terminated JSON line."""
          return f"{json.dumps(payload, ensure_ascii=False)}\n"

      @staticmethod
      def _ndjson_streaming_response(body_iterator) -> StreamingResponse:
          """Wrap an NDJSON line iterator in a non-buffered StreamingResponse."""
          return StreamingResponse(
              body_iterator,
              media_type="application/x-ndjson",
              headers={
                  "Cache-Control": "no-cache",
                  "Connection": "keep-alive",
                  "Content-Type": "application/x-ndjson",
                  "X-Accel-Buffering": "no",  # Ensure proper handling of streaming responses in Nginx proxy
              },
          )

      def _query_llm_kwargs(self, system: Optional[str]) -> Dict[str, Any]:
          """Return a copy of the kwargs for a direct call to the query LLM.

          Uses ``rag.role_llm_kwargs["query"]`` when it is set and falls back to
          ``rag.llm_model_kwargs``; a non-empty *system* becomes ``system_prompt``.
          """
          query_kwargs = self.rag.role_llm_kwargs["query"]
          role_kwargs = (
              dict(query_kwargs)
              if query_kwargs is not None
              else dict(self.rag.llm_model_kwargs)
          )
          if system:
              role_kwargs["system_prompt"] = system
          return role_kwargs

      def _completed_payload(
          self,
          wrap_content,
          done_fields: Dict[str, Any],
          response_text: Any,
          *,
          start_time: int,
          first_chunk_time: int,
          last_chunk_time: int,
          prompt_tokens: int,
      ) -> Dict[str, Any]:
          """Build the single payload returned by a non-streaming request."""
          if not response_text:
              response_text = "No response generated"
          text = str(response_text)
          return {
              **self._model_stamp(),
              **wrap_content(text),
              **done_fields,
              **self._timing_stats(
                  start_time=start_time,
                  first_chunk_time=first_chunk_time,
                  last_chunk_time=last_chunk_time,
                  prompt_tokens=prompt_tokens,
                  completion_text=text,
              ),
          }

      async def _stream_ndjson(
          self,
          response: Any,
          wrap_content,
          done_fields: Dict[str, Any],
          start_time: int,
          prompt_tokens: int,
      ):
          """Yield NDJSON lines for *response*, ending with a ``done`` line.

          Args:
              response: Either a complete string or an async iterator of chunks.
              wrap_content: ``_generate_content`` or ``_chat_content``.
              done_fields: Completion markers merged into the final line.
              start_time: ``time.time_ns()`` reading taken when handling began.
              prompt_tokens: Token estimate of the prompt, reported in the stats.
          """
          stamp = self._model_stamp()
          first_chunk_time = None
          last_chunk_time = time.time_ns()
          total_response = ""

          if isinstance(response, str):
              # Not an async iterator: send the whole text, then the final line
              first_chunk_time = start_time
              last_chunk_time = time.time_ns()
              total_response = response
              yield self._to_ndjson({**stamp, **wrap_content(response), "done": False})
          else:
              try:
                  async for chunk in response:
                      if not chunk:
                          continue
                      if first_chunk_time is None:
                          first_chunk_time = time.time_ns()
                      last_chunk_time = time.time_ns()
                      total_response += chunk
                      yield self._to_ndjson(
                          {**stamp, **wrap_content(chunk), "done": False}
                      )
              except (asyncio.CancelledError, Exception) as e:
                  if isinstance(e, asyncio.CancelledError):
                      error_msg = "Stream was cancelled by server"
                  else:
                      error_msg = f"Provider error: {str(e)}"
                  logger.error(f"Stream error: {error_msg}")

                  # Send error message to client
                  error_text = f"\n\nError: {error_msg}"
                  yield self._to_ndjson(
                      {
                          **stamp,
                          **wrap_content(error_text),
                          "error": error_text,
                          "done": False,
                      }
                  )
                  # Send final message to close the stream (no statistics)
                  yield self._to_ndjson({**stamp, **wrap_content(""), "done": True})
                  return

              if first_chunk_time is None:
                  # The iterator produced nothing usable
                  first_chunk_time = start_time

          yield self._to_ndjson(
              {
                  **stamp,
                  **wrap_content(""),
                  **done_fields,
                  **self._timing_stats(
                      start_time=start_time,
                      first_chunk_time=first_chunk_time,
                      last_chunk_time=last_chunk_time,
                      prompt_tokens=prompt_tokens,
                      completion_text=total_response,
                  ),
              }
          )

      # ------------------------------------------------------------------
      # Routes
      # ------------------------------------------------------------------

      def setup_routes(self):
          """Register the Ollama-style endpoints on ``self.router``."""
          # Create combined auth dependency for Ollama API routes
          combined_auth = get_combined_auth_dependency(self.api_key)

          @self.router.get("/version", dependencies=[Depends(combined_auth)])
          async def get_version():
              """Get Ollama version information"""
              return OllamaVersionResponse(version="0.9.3")

          @self.router.get("/tags", dependencies=[Depends(combined_auth)])
          async def get_tags():
              """Return available models acting as an Ollama server"""
              return OllamaTagResponse(
                  models=[
                      {
                          "name": self.ollama_server_infos.LIGHTRAG_MODEL,
                          "model": self.ollama_server_infos.LIGHTRAG_MODEL,
                          "modified_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
                          "size": self.ollama_server_infos.LIGHTRAG_SIZE,
                          "digest": self.ollama_server_infos.LIGHTRAG_DIGEST,
                          "details": {
                              "parent_model": "",
                              "format": "gguf",
                              "family": self.ollama_server_infos.LIGHTRAG_NAME,
                              "families": [self.ollama_server_infos.LIGHTRAG_NAME],
                              "parameter_size": "13B",
                              "quantization_level": "Q4_0",
                          },
                      }
                  ]
              )

          @self.router.get("/ps", dependencies=[Depends(combined_auth)])
          async def get_running_models():
              """List Running Models - returns currently running models"""
              return OllamaPsResponse(
                  models=[
                      {
                          "name": self.ollama_server_infos.LIGHTRAG_MODEL,
                          "model": self.ollama_server_infos.LIGHTRAG_MODEL,
                          "size": self.ollama_server_infos.LIGHTRAG_SIZE,
                          "digest": self.ollama_server_infos.LIGHTRAG_DIGEST,
                          "details": {
                              "parent_model": "",
                              "format": "gguf",
                              "family": "llama",
                              "families": ["llama"],
                              "parameter_size": "7.2B",
                              "quantization_level": "Q4_0",
                          },
                          "expires_at": "2050-12-31T14:38:31.83753-07:00",
                          "size_vram": self.ollama_server_infos.LIGHTRAG_SIZE,
                      }
                  ]
              )

          @self.router.post(
              "/generate", dependencies=[Depends(combined_auth)], include_in_schema=True
          )
          async def generate(raw_request: Request):
              """Handle generate completion requests acting as an Ollama model
              For compatibility purpose, the request is not processed by LightRAG,
              and will be handled by underlying LLM model.
              Supports both application/json and application/octet-stream Content-Types.
              """
              try:
                  # Parse the request body manually
                  request = await parse_request_body(raw_request, OllamaGenerateRequest)

                  query = request.prompt
                  start_time = time.time_ns()
                  prompt_tokens = estimate_tokens(query)
                  role_kwargs = self._query_llm_kwargs(request.system)
                  llm_func = self.rag.role_llm_funcs["query"]

                  if request.stream:
                      response = await llm_func(query, stream=True, **role_kwargs)
                      return self._ndjson_streaming_response(
                          self._stream_ndjson(
                              response,
                              self._generate_content,
                              self._generate_done_fields(),
                              start_time,
                              prompt_tokens,
                          )
                      )

                  first_chunk_time = time.time_ns()
                  response_text = await llm_func(query, stream=False, **role_kwargs)
                  last_chunk_time = time.time_ns()

                  return self._completed_payload(
                      self._generate_content,
                      self._generate_done_fields(),
                      response_text,
                      start_time=start_time,
                      first_chunk_time=first_chunk_time,
                      last_chunk_time=last_chunk_time,
                      prompt_tokens=prompt_tokens,
                  )
              except HTTPException:
                  # Client errors (e.g. the 400 from body parsing) must keep
                  # their status instead of being rewritten as a 500 below.
                  raise
              except Exception as e:
                  logger.error(f"Ollama generate error: {str(e)}", exc_info=True)
                  raise HTTPException(status_code=500, detail=str(e))

          @self.router.post(
              "/chat", dependencies=[Depends(combined_auth)], include_in_schema=True
          )
          async def chat(raw_request: Request):
              """Process chat completion requests by acting as an Ollama model.
              Routes user queries through LightRAG by selecting query mode based on query prefix.
              Detects and forwards OpenWebUI session-related requests (for meta data generation task) directly to LLM.
              Supports both application/json and application/octet-stream Content-Types.
              """
              try:
                  # Parse the request body manually
                  request = await parse_request_body(raw_request, OllamaChatRequest)

                  # Get all messages
                  messages = request.messages
                  if not messages:
                      raise HTTPException(status_code=400, detail="No messages provided")

                  # Validate that the last message is from a user
                  if messages[-1].role != "user":
                      raise HTTPException(
                          status_code=400, detail="Last message must be from user role"
                      )

                  # Get the last message as query and previous messages as history
                  query = messages[-1].content
                  # Convert OllamaMessage objects to dictionaries
                  conversation_history = [
                      {"role": msg.role, "content": msg.content} for msg in messages[:-1]
                  ]

                  # Check for query prefix
                  cleaned_query, mode, only_need_context, user_prompt = parse_query_mode(
                      query
                  )

                  start_time = time.time_ns()
                  prompt_tokens = estimate_tokens(cleaned_query)

                  param_dict = {
                      "mode": mode.value,
                      "stream": request.stream,
                      "only_need_context": only_need_context,
                      "conversation_history": conversation_history,
                      "top_k": self.top_k,
                  }
                  # Add user_prompt to param_dict
                  if user_prompt is not None:
                      param_dict["user_prompt"] = user_prompt
                  query_param = QueryParam(**param_dict)

                  if request.stream:
                      # A "/bypass" prefix sends the query straight to the LLM
                      if mode == SearchMode.bypass:
                          role_kwargs = self._query_llm_kwargs(request.system)
                          response = await (self.rag.role_llm_funcs["query"])(
                              cleaned_query,
                              stream=True,
                              history_messages=conversation_history,
                              **role_kwargs,
                          )
                      else:
                          response = await self.rag.aquery(
                              cleaned_query, param=query_param
                          )

                      return self._ndjson_streaming_response(
                          self._stream_ndjson(
                              response,
                              self._chat_content,
                              self._chat_done_fields(),
                              start_time,
                              prompt_tokens,
                          )
                      )

                  first_chunk_time = time.time_ns()

                  # Determine if the request is prefix with "/bypass" or from Open WebUI's session title and session keyword generation task
                  match_result = re.search(
                      r"\n<chat_history>\nUSER:", cleaned_query, re.MULTILINE
                  )
                  if match_result or mode == SearchMode.bypass:
                      role_kwargs = self._query_llm_kwargs(request.system)
                      response_text = await (self.rag.role_llm_funcs["query"])(
                          cleaned_query,
                          stream=False,
                          history_messages=conversation_history,
                          **role_kwargs,
                      )
                  else:
                      response_text = await self.rag.aquery(
                          cleaned_query, param=query_param
                      )

                  last_chunk_time = time.time_ns()

                  return self._completed_payload(
                      self._chat_content,
                      self._chat_done_fields(),
                      response_text,
                      start_time=start_time,
                      first_chunk_time=first_chunk_time,
                      last_chunk_time=last_chunk_time,
                      prompt_tokens=prompt_tokens,
                  )
              except HTTPException:
                  # Keep the 400s raised above (bad body, no messages, wrong
                  # last role) instead of rewriting them as a 500 below.
                  raise
              except Exception as e:
                  logger.error(f"Ollama chat error: {str(e)}", exc_info=True)
                  raise HTTPException(status_code=500, detail=str(e))