prompt.py 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936
  1. from __future__ import annotations
  2. import os
  3. from pathlib import Path
  4. from typing import Any, Mapping, TypedDict
  5. import yaml
  6. PROMPTS: dict[str, Any] = {}
  7. # All delimiters must be formatted as "<|UPPER_CASE_STRING|>"
  8. PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|#|>"
  9. PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
# Default entity type guidance injected into extraction prompts via {entity_types_guidance}.
# Users can override this by passing entity_types_guidance in addon_params, or by
# replacing the full prompt template string in PROMPTS.
# The text is one instruction sentence followed by a "- Type: description" bullet
# per entity type; `Other` is named as the fallback when no listed type fits.
PROMPTS[
    "default_entity_types_guidance"
] = """Classify each entity using one of the following types. If no type fits, use `Other`.
- Person: Human individuals, real or fictional
- Creature: Non-human living beings (animals, mythical beings, etc.)
- Organization: Companies, institutions, government bodies, groups
- Location: Geographic places (cities, countries, buildings, regions)
- Event: Occurrences, incidents, ceremonies, meetings
- Concept: Abstract ideas, theories, principles, beliefs
- Method: Procedures, techniques, algorithms, workflows
- Content: Creative or informational works (books, articles, films, reports)
- Data: Quantitative or structured information (statistics, datasets, measurements)
- Artifact: Physical or digital objects created by humans (tools, software, devices)
- NaturalObject: Natural non-living objects (minerals, celestial bodies, chemical compounds)"""
# System prompt for delimiter-based (non-JSON) entity/relationship extraction.
# Placeholders appearing in the template: {tuple_delimiter}, {completion_delimiter},
# {max_total_records}, {max_entity_records}, {language}, {entity_types_guidance},
# {examples}. Presumably these are substituted by the caller before the prompt is
# sent (e.g. via str.format) — confirm against the call site.
# Output contract described by the text: `entity` rows have 4 delimiter-separated
# parts, `relation` rows have 5, entities come before relations, and the
# completion delimiter terminates the output.
PROMPTS["entity_extraction_system_prompt"] = """---Role---
You are a Knowledge Graph Specialist responsible for extracting entities and relationships from the `---Input Text---` section of user prompt.
---Instructions---
1. **Entity Extraction:**
- Identify clearly defined and meaningful entities in the `---Input Text---` section of user prompt.
- For each entity, extract:
- `entity_name`: The name of the entity. If the entity name is case-insensitive, capitalize the first letter of each significant word (title case). Ensure **consistent naming** across the entire extraction process.
- `entity_type`: Categorize the entity using the type guidance provided in the `---Entity Types---` section below. If none of the provided entity types apply, classify it as `Other`.
- `entity_description`: Provide a concise yet comprehensive description of the entity's attributes and activities, based *solely* on the information present in the input text.
2. **Relationship Extraction:**
- Identify direct, clearly stated, and meaningful relationships between previously extracted entities.
- If a single statement describes a relationship involving more than two entities, decompose it into multiple binary relationships.
- For each binary relationship, extract:
- `source_entity`: The name of the source entity. Ensure **consistent naming** with entity extraction. Capitalize the first letter of each significant word (title case) if the name is case-insensitive.
- `target_entity`: The name of the target entity. Ensure **consistent naming** with entity extraction. Capitalize the first letter of each significant word (title case) if the name is case-insensitive.
- `relationship_keywords`: One or more high-level keywords summarizing the relationship. Multiple keywords within this field must be separated by a comma `,`. **DO NOT use `{tuple_delimiter}` for separating multiple keywords within this field.**
- `relationship_description`: A concise explanation of the nature of the relationship between the source and target entities.
3. **Record Types:**
- `entity` is used only for entity rows and those rows always contain exactly 4 tuple parts total.
- `relation` is used only for relationship rows and those rows always contain exactly 5 tuple parts total.
- A row with two entity names plus relationship keywords and a relationship description must start with `relation`, never `entity`.
- After the last entity row, switch prefixes to `relation` for every relationship row.
4. **Output Format:**
- Entity row: `entity{tuple_delimiter}entity_name{tuple_delimiter}entity_type{tuple_delimiter}entity_description`
- Relation row: `relation{tuple_delimiter}source_entity{tuple_delimiter}target_entity{tuple_delimiter}relationship_keywords{tuple_delimiter}relationship_description`
- Wrong: `entity{tuple_delimiter}Alice{tuple_delimiter}Acme{tuple_delimiter}founded{tuple_delimiter}Alice founded Acme`
- Correct: `relation{tuple_delimiter}Alice{tuple_delimiter}Acme{tuple_delimiter}founded{tuple_delimiter}Alice founded Acme`
5. **Delimiter Usage:**
- The `{tuple_delimiter}` is a complete, atomic marker and **must not be filled with content**. It serves strictly as a field separator.
- Incorrect: `entity{tuple_delimiter}Tokyo<|location|>Tokyo is the capital of Japan.`
- Correct: `entity{tuple_delimiter}Tokyo{tuple_delimiter}location{tuple_delimiter}Tokyo is the capital of Japan.`
6. **Output Order & Deduplication:**
- Output all extracted entities first, followed by all extracted relationships.
- Output at most {max_total_records} total rows across entities and relationships in this response.
- Output at most {max_entity_records} entity rows in this response.
- Output fewer rows if fewer high-value items are present. Do not try to fill the limit.
- Only output relationship rows whose source and target entities are both included in the selected entity rows for this response.
- If the limit is reached, stop adding new rows immediately and output `{completion_delimiter}`.
- Treat all relationships as **undirected** unless explicitly stated otherwise. Swapping the source and target entities for an undirected relationship does not constitute a new relationship.
- Avoid outputting duplicate relationships.
- Within the list of relationships, output the relationships that are **most significant** to the core meaning of the input text first.
7. **Context & Language:**
- Ensure all entity names and descriptions are written in the **third person**.
- Explicitly name the subject or object; **avoid using pronouns** such as `this article`, `this paper`, `our company`, `I`, `you`, and `he/she`.
- The entire output (entity names, keywords, and descriptions) must be written in `{language}`.
- Proper nouns (e.g., personal names, place names, organization names) should be retained in their original language if a proper, widely accepted translation is not available or would cause ambiguity.
8. **Completion Signal:** Output the literal string `{completion_delimiter}` only after all entities and relationships have been completely extracted and outputted.
---Entity Types---
{entity_types_guidance}
---Examples---
{examples}
"""
  79. PROMPTS["entity_extraction_user_prompt"] = """---Task---
  80. Extract entities and relationships from the `---Input Text---` session below.
  81. ---Instructions---
  82. 1. **Strict Adherence to Format:** Strictly adhere to all format requirements for entity and relationship lists, including output order, field delimiters, and proper noun handling, as specified in the system prompt.
  83. 2. **Quantity Limits:** In this response, output at most {max_total_records} total rows and at most {max_entity_records} entity rows. Output fewer rows if fewer high-value items are present. Only output relationship rows whose source and target entities are both included in this response.
  84. 3. **Output Content Only:** Output *only* the extracted list of entities and relationships. Do not include any introductory or concluding remarks, explanations, or additional text before or after the list.
  85. 4. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant entities and relationships have been extracted and presented. If the row limit is reached, output `{completion_delimiter}` immediately after the last allowed row.
  86. 5. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated.
  87. ---Input Text---
  88. ```
  89. {input_text}
  90. ```
  91. ---Output---
  92. """
# Follow-up user prompt for the delimiter-based format: asks for entities and
# relationships that were missed or badly formatted in "the last extraction task".
# Placeholders appearing in the template: {max_total_records}, {max_entity_records},
# {completion_delimiter}, {language}. There is no {input_text} placeholder here; the
# text refers back to the previous task instead.
# NOTE(review): presumably sent in the same conversation as the first-pass prompt so
# that "the last task" resolves — confirm against the caller.
PROMPTS["entity_continue_extraction_user_prompt"] = """---Task---
Based on the last extraction task, identify and extract any missed or incorrectly formatted entities and relationships from the input text.
---Instructions---
1. **Strict Adherence to System Format:** Strictly adhere to all format requirements for entity and relationship lists, including output order, field delimiters, and proper noun handling, as specified in the system instructions.
2. **Focus on Corrections/Additions:**
- **Do NOT** re-output entities and relationships that were **correctly and fully** extracted in the last task.
- If an entity or relationship was **missed** in the last task, extract and output it now according to the system format.
- If an entity or relationship was **truncated, had missing fields, or was otherwise incorrectly formatted** in the last task, re-output the *corrected and complete* version in the specified format.
- Any corrected relationship row must be emitted with the literal `relation` prefix, never `entity`.
3. **Quantity Limits:** In this response, output at most {max_total_records} total rows and at most {max_entity_records} entity rows. Output fewer rows if fewer high-value corrections or additions remain. A relationship row may reference entities that were already extracted correctly in the previous response. Do not re-output those entities unless they were missing or need correction.
4. **Output Content Only:** Output *only* the extracted list of entities and relationships. Do not include any introductory or concluding remarks, explanations, or additional text before or after the list.
5. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant missing or corrected entities and relationships have been extracted and presented. If the row limit is reached, output `{completion_delimiter}` immediately after the last allowed row.
6. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated.
---Output---
"""
  108. PROMPTS["entity_extraction_examples"] = [
  109. """---Entity Types---
  110. - Person: Human individuals, real or fictional
  111. - Artifact: Physical or digital objects created by humans (tools, software, devices)
  112. - Concept: Abstract ideas, theories, principles, beliefs
  113. ---Input Text---
  114. ```
  115. while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
  116. Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. "If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us."
  117. The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
  118. It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
  119. ```
  120. ---Output---
  121. entity{tuple_delimiter}Alex{tuple_delimiter}Person{tuple_delimiter}Alex is a character who experiences frustration and is observant of the dynamics among other characters.
  122. entity{tuple_delimiter}Taylor{tuple_delimiter}Person{tuple_delimiter}Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective.
  123. entity{tuple_delimiter}Jordan{tuple_delimiter}Person{tuple_delimiter}Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device.
  124. entity{tuple_delimiter}Cruz{tuple_delimiter}Person{tuple_delimiter}Cruz is associated with a vision of control and order, influencing the dynamics among other characters.
  125. entity{tuple_delimiter}The Device{tuple_delimiter}Artifact{tuple_delimiter}The Device is central to the story, with potential game-changing implications, and is revered by Taylor.
  126. entity{tuple_delimiter}Discovery{tuple_delimiter}Concept{tuple_delimiter}Discovery represents the shared intellectual pursuit that unites Jordan and Alex in opposition to Cruz's controlling worldview.
  127. relation{tuple_delimiter}Alex{tuple_delimiter}Taylor{tuple_delimiter}power dynamics, observation{tuple_delimiter}Alex observes Taylor's authoritarian behavior and notes changes in Taylor's attitude toward the device.
  128. relation{tuple_delimiter}Alex{tuple_delimiter}Jordan{tuple_delimiter}shared goals, rebellion{tuple_delimiter}Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision.)
  129. relation{tuple_delimiter}Taylor{tuple_delimiter}Jordan{tuple_delimiter}conflict resolution, mutual respect{tuple_delimiter}Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce.
  130. relation{tuple_delimiter}Jordan{tuple_delimiter}Cruz{tuple_delimiter}ideological conflict, rebellion{tuple_delimiter}Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order.
  131. relation{tuple_delimiter}Taylor{tuple_delimiter}The Device{tuple_delimiter}reverence, technological significance{tuple_delimiter}Taylor shows reverence towards the device, indicating its importance and potential impact.
  132. {completion_delimiter}
  133. """,
  134. """---Entity Types---
  135. - Person: Human individuals, real or fictional
  136. - Location: Geographic places (cities, countries, buildings, regions)
  137. - Creature: Non-human living beings (animals, mythical beings, etc.)
  138. - Method: Procedures, techniques, algorithms, workflows
  139. - Organization: Companies, institutions, government bodies, groups
  140. - Content: Creative or informational works (books, articles, films, reports)
  141. - NaturalObject: Natural non-living objects (minerals, celestial bodies, chemical compounds)
  142. ---Input Text---
  143. ```
  144. Dr. Elena Vasquez led a field expedition to the Borneo rainforest to document the population decline of the Bornean orangutan. Using transect sampling — a method where researchers walk predetermined line paths and record every animal sighting within a fixed distance — her team estimated that fewer than 1,500 individuals remained in the surveyed region.
  145. The expedition was funded by the Global Wildlife Conservation Institute and produced a landmark report titled "Primate Decline in Insular Southeast Asia." Vasquez attributed the collapse primarily to peat-soil destruction caused by palm oil plantation expansion, which had converted over 40% of the surveyed forest area within a decade.
  146. ```
  147. ---Output---
  148. entity{tuple_delimiter}Dr. Elena Vasquez{tuple_delimiter}Person{tuple_delimiter}Dr. Elena Vasquez is a field researcher who led an expedition to document orangutan population decline in Borneo.
  149. entity{tuple_delimiter}Borneo Rainforest{tuple_delimiter}Location{tuple_delimiter}The Borneo rainforest is the field site of the expedition and the primary habitat of the Bornean orangutan.
  150. entity{tuple_delimiter}Bornean Orangutan{tuple_delimiter}Creature{tuple_delimiter}The Bornean orangutan is a primate species whose population was found to have declined to fewer than 1,500 individuals in the surveyed region.
  151. entity{tuple_delimiter}Transect Sampling{tuple_delimiter}Method{tuple_delimiter}Transect sampling is a wildlife survey technique where researchers walk predetermined paths and record animal sightings within a fixed lateral distance.
  152. entity{tuple_delimiter}Global Wildlife Conservation Institute{tuple_delimiter}Organization{tuple_delimiter}The Global Wildlife Conservation Institute funded the expedition led by Dr. Vasquez.
  153. entity{tuple_delimiter}Primate Decline in Insular Southeast Asia{tuple_delimiter}Content{tuple_delimiter}A landmark research report produced by Vasquez's expedition documenting primate population decline in the region.
  154. entity{tuple_delimiter}Peat Soil{tuple_delimiter}NaturalObject{tuple_delimiter}Peat soil is a natural substrate in the Borneo rainforest that has been destroyed by palm oil plantation expansion.
  155. relation{tuple_delimiter}Dr. Elena Vasquez{tuple_delimiter}Bornean Orangutan{tuple_delimiter}field research, population survey{tuple_delimiter}Dr. Vasquez led the expedition that documented the population decline of the Bornean orangutan.
  156. relation{tuple_delimiter}Dr. Elena Vasquez{tuple_delimiter}Transect Sampling{tuple_delimiter}methodology, research application{tuple_delimiter}Dr. Vasquez's team used transect sampling to estimate the orangutan population.
  157. relation{tuple_delimiter}Global Wildlife Conservation Institute{tuple_delimiter}Dr. Elena Vasquez{tuple_delimiter}funding, research support{tuple_delimiter}The institute funded the expedition led by Dr. Vasquez.
  158. relation{tuple_delimiter}Dr. Elena Vasquez{tuple_delimiter}Primate Decline in Insular Southeast Asia{tuple_delimiter}authorship, research output{tuple_delimiter}Dr. Vasquez's expedition produced the landmark report on primate decline.
  159. relation{tuple_delimiter}Peat Soil{tuple_delimiter}Borneo Rainforest{tuple_delimiter}habitat composition, ecological destruction{tuple_delimiter}Peat soil destruction in the Borneo rainforest was caused by palm oil plantation expansion and is a primary driver of orangutan decline.
  160. {completion_delimiter}
  161. """,
  162. """---Entity Types---
  163. - Content: Creative or informational works (books, articles, films, reports)
  164. - Artifact: Physical or digital objects created by humans (tools, software, devices)
  165. - Person: Human individuals, real or fictional
  166. - Organization: Companies, institutions, government bodies, groups
  167. - Method: Procedures, techniques, algorithms, workflows
  168. - Data: Quantitative or structured information (statistics, datasets, measurements)
  169. - Concept: Abstract ideas, theories, principles, beliefs
  170. ---Input Text---
  171. ```
  172. The 2023 edition of "Advances in Neural Architecture Search" synthesized findings from over 200 peer-reviewed papers and introduced a new benchmarking framework called NASBench-360, designed to evaluate search algorithms across diverse task domains. The publication was co-authored by Dr. Priya Nair and Dr. Luca Ferretti of the DeepSystems Research Lab.
  173. NASBench-360 measures three key metrics: search efficiency (time-to-solution), model accuracy on held-out test sets, and computational cost in GPU-hours. Early results showed that evolutionary search algorithms outperformed gradient-based methods by 12% on accuracy while consuming 30% fewer GPU-hours on vision tasks.
  174. ```
  175. ---Output---
  176. entity{tuple_delimiter}Advances in Neural Architecture Search{tuple_delimiter}Content{tuple_delimiter}A 2023 publication that synthesizes findings from over 200 papers and introduces the NASBench-360 benchmarking framework.
  177. entity{tuple_delimiter}NASBench-360{tuple_delimiter}Artifact{tuple_delimiter}NASBench-360 is a benchmarking framework introduced to evaluate neural architecture search algorithms across diverse task domains.
  178. entity{tuple_delimiter}Dr. Priya Nair{tuple_delimiter}Person{tuple_delimiter}Dr. Priya Nair is a co-author of the publication and a researcher at the DeepSystems Research Lab.
  179. entity{tuple_delimiter}Dr. Luca Ferretti{tuple_delimiter}Person{tuple_delimiter}Dr. Luca Ferretti is a co-author of the publication and a researcher at the DeepSystems Research Lab.
  180. entity{tuple_delimiter}DeepSystems Research Lab{tuple_delimiter}Organization{tuple_delimiter}The DeepSystems Research Lab is the institution where the co-authors of the publication are affiliated.
  181. entity{tuple_delimiter}Evolutionary Search{tuple_delimiter}Method{tuple_delimiter}Evolutionary search is a class of neural architecture search algorithms that outperformed gradient-based methods in the NASBench-360 evaluation.
  182. entity{tuple_delimiter}Gradient-Based Search{tuple_delimiter}Method{tuple_delimiter}Gradient-based search is a class of neural architecture search algorithms that was benchmarked against evolutionary search in NASBench-360.
  183. entity{tuple_delimiter}GPU-Hours{tuple_delimiter}Data{tuple_delimiter}GPU-hours is a metric used in NASBench-360 to measure the computational cost of neural architecture search algorithms.
  184. entity{tuple_delimiter}Neural Architecture Search{tuple_delimiter}Concept{tuple_delimiter}Neural architecture search is the automated process of designing optimal neural network architectures, the central topic of the publication.
  185. relation{tuple_delimiter}Dr. Priya Nair{tuple_delimiter}Advances in Neural Architecture Search{tuple_delimiter}authorship{tuple_delimiter}Dr. Priya Nair co-authored the publication.
  186. relation{tuple_delimiter}Dr. Luca Ferretti{tuple_delimiter}Advances in Neural Architecture Search{tuple_delimiter}authorship{tuple_delimiter}Dr. Luca Ferretti co-authored the publication.
  187. relation{tuple_delimiter}Advances in Neural Architecture Search{tuple_delimiter}NASBench-360{tuple_delimiter}introduces, benchmarking{tuple_delimiter}The publication introduced the NASBench-360 framework.
  188. relation{tuple_delimiter}Evolutionary Search{tuple_delimiter}Gradient-Based Search{tuple_delimiter}performance comparison{tuple_delimiter}Evolutionary search outperformed gradient-based methods by 12% on accuracy and used 30% fewer GPU-hours on vision tasks.
  189. relation{tuple_delimiter}NASBench-360{tuple_delimiter}GPU-Hours{tuple_delimiter}evaluation metric{tuple_delimiter}NASBench-360 uses GPU-hours as one of three key metrics to measure computational cost.
  190. {completion_delimiter}
  191. """,
  192. ]
  193. ###############################################################################
  194. # JSON Structured Output Prompts for Entity Extraction
  195. # Used when entity_extraction_use_json is enabled for higher extraction quality
  196. ###############################################################################
  197. PROMPTS["entity_extraction_json_system_prompt"] = """---Role---
  198. You are a Knowledge Graph Specialist responsible for extracting entities and relationships from the `---Input Text---` session of user prompt.
  199. ---Instructions---
  200. 1. **Entity Extraction:**
  201. - **Identification:** Identify clearly defined and meaningful entities in the `---Input Text---` session of user prompt.
  202. - **Entity Details:** For each identified entity, extract the following information:
  203. - `name`: The name of the entity. If the entity name is case-insensitive, capitalize the first letter of each significant word (title case). Ensure **consistent naming** across the entire extraction process.
  204. - `type`: Categorize the entity using the type guidance provided in the `---Entity Types---` section below. If none of the provided entity types apply, classify it as `Other`.
  205. - `description`: Provide a concise yet comprehensive description of the entity's attributes and activities, based *solely* on the information present in the input text.
  206. 2. **Relationship Extraction:**
  207. - **Identification:** Identify direct, clearly stated, and meaningful relationships between previously extracted entities.
  208. - **N-ary Relationship Decomposition:** If a single statement describes a relationship involving more than two entities (an N-ary relationship), decompose it into multiple binary (two-entity) relationship pairs for separate description.
  209. - Example: For "Alice, Bob, and Carol collaborated on Project X," extract binary relationships such as "Alice collaborated with Project X," "Bob collaborated with Project X," and "Carol collaborated with Project X," or "Alice collaborated with Bob," based on the most reasonable binary interpretations.
  210. - **Relationship Details:** For each binary relationship, extract the following fields:
  211. - `source`: The name of the source entity. Ensure **consistent naming** with entity extraction. Capitalize the first letter of each significant word (title case) if the name is case-insensitive.
  212. - `target`: The name of the target entity. Ensure **consistent naming** with entity extraction. Capitalize the first letter of each significant word (title case) if the name is case-insensitive.
  213. - `keywords`: One or more high-level keywords summarizing the overarching nature, concepts, or themes of the relationship, separated by commas.
  214. - `description`: A concise explanation of the nature of the relationship between the source and target entities, providing a clear rationale for their connection.
  215. 3. **Relationship Direction & Duplication:**
  216. - Treat all relationships as **undirected** unless explicitly stated otherwise. Swapping the source and target entities for an undirected relationship does not constitute a new relationship.
  217. - Avoid outputting duplicate relationships.
  218. 4. **Output Limits & Prioritization:**
  219. - Output at most {max_total_records} total records across `entities` and `relationships` in this response.
  220. - Output at most {max_entity_records} entity objects in this response.
  221. - Output fewer records if fewer high-value items are present. Do not try to fill the limit.
  222. - Only output relationship objects whose `source` and `target` are both included in the selected `entities` list for this response.
  223. - Within the list of relationships, prioritize and output those relationships that are **most significant** to the core meaning of the input text first.
  224. 5. **Context & Objectivity:**
  225. - Ensure all entity names and descriptions are written in the **third person**.
  226. - Explicitly name the subject or object; **avoid using pronouns** such as `this article`, `this paper`, `our company`, `I`, `you`, and `he/she`.
  227. 6. **Language & Proper Nouns:**
  228. - The entire output (entity names, keywords, and descriptions) must be written in `{language}`.
  229. - Proper nouns (e.g., personal names, place names, organization names) should be retained in their original language if a proper, widely accepted translation is not available or would cause ambiguity.
  230. 7. **JSON Contract:**
  231. - Return one valid JSON object with `entities` and `relationships` arrays only.
  232. - If the record limit is reached, stop adding new objects immediately and return the JSON object with the allowed items only.
  233. ---Entity Types---
  234. {entity_types_guidance}
  235. ---Examples---
  236. {examples}
  237. """
  238. PROMPTS["entity_extraction_json_user_prompt"] = """---Task---
  239. Extract entities and relationships from the `---Input Text---` session below.
  240. ---Instructions---
  241. 1. **Strict Adherence to JSON Format:** Your output MUST be a valid JSON object with `entities` and `relationships` arrays. Do not include any introductory or concluding remarks, explanations, markdown code fences, or any other text before or after the JSON.
  242. 2. **Quantity Limits:** In this response, output at most {max_total_records} total records and at most {max_entity_records} entity objects. Output fewer records if fewer high-value items are present. Only output relationship objects whose `source` and `target` are both included in this response.
  243. 3. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated.
  244. ---Entity Types---
  245. {entity_types_guidance}
  246. ---Input Text---
  247. ```
  248. {input_text}
  249. ```
  250. ---Output---
  251. """
# Follow-up ("gleaning") user-turn template for JSON-mode extraction: asks the
# model to emit only entities/relationships that were missed or wrong in the
# previous response.
# Placeholders: {max_total_records}, {max_entity_records}, {language}.
# The doubled braces in instruction 5 are literal braces escaped for template
# formatting (presumably str.format — confirm against the caller).
PROMPTS["entity_continue_extraction_json_user_prompt"] = """---Task---
Based on the last extraction task, identify and extract any **missed or incorrectly described** entities and relationships from the `---Input Text---` session.
---Instructions---
1. **Focus on Corrections/Additions:**
- **Do NOT** re-output entities and relationships that were **correctly and fully** extracted in the last task.
- If an entity or relationship was **missed** in the last task, extract and output it now.
- If an entity or relationship was **incorrectly described** in the last task, re-output the *corrected and complete* version.
2. **Strict Adherence to JSON Format:** Your output MUST be a valid JSON object with `entities` and `relationships` arrays. Do not include any introductory or concluding remarks, explanations, markdown code fences, or any other text before or after the JSON.
3. **Quantity Limits:** In this response, output at most {max_total_records} total records and at most {max_entity_records} entity objects. Output fewer records if fewer high-value corrections or additions remain. A relationship object may reference entities already extracted correctly in the previous response. Do not repeat those entity objects unless they were missing or need correction.
4. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated.
5. **If nothing was missed or needs correction**, output: `{{"entities": [], "relationships": []}}`
---Output---
"""
# Built-in few-shot examples for JSON-mode entity extraction.
# Each list element is one self-contained example made of three sections:
# `---Entity Types---`, `---Input Text---` and the expected `---Output---`
# JSON object with `entities` and `relationships` arrays.
# NOTE(review): the JSON braces here are single (unescaped), so these strings
# are presumably inserted as values of an `{examples}` placeholder rather than
# being formatted themselves — confirm against the caller.
PROMPTS["entity_extraction_json_examples"] = [
"""---Entity Types---
- Person: Human individuals, real or fictional
- Artifact: Physical or digital objects created by humans (tools, software, devices)
- Concept: Abstract ideas, theories, principles, beliefs
---Input Text---
```
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. "If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us."
The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
```
---Output---
{
"entities": [
{"name": "Alex", "type": "Person", "description": "Alex is a character who experiences frustration and is observant of the dynamics among other characters."},
{"name": "Taylor", "type": "Person", "description": "Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."},
{"name": "Jordan", "type": "Person", "description": "Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."},
{"name": "Cruz", "type": "Person", "description": "Cruz is associated with a vision of control and order, influencing the dynamics among other characters."},
{"name": "The Device", "type": "Artifact", "description": "The Device is central to the story, with potential game-changing implications, and is revered by Taylor."},
{"name": "Discovery", "type": "Concept", "description": "Discovery represents the shared intellectual pursuit that unites Jordan and Alex in opposition to Cruz's controlling worldview."}
],
"relationships": [
{"source": "Alex", "target": "Taylor", "keywords": "power dynamics, observation", "description": "Alex observes Taylor's authoritarian behavior and notes changes in Taylor's attitude toward the device."},
{"source": "Alex", "target": "Jordan", "keywords": "shared goals, rebellion", "description": "Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."},
{"source": "Taylor", "target": "Jordan", "keywords": "conflict resolution, mutual respect", "description": "Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."},
{"source": "Jordan", "target": "Cruz", "keywords": "ideological conflict, rebellion", "description": "Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."},
{"source": "Taylor", "target": "The Device", "keywords": "reverence, technological significance", "description": "Taylor shows reverence towards the device, indicating its importance and potential impact."}
]
}
""",
"""---Entity Types---
- Person: Human individuals, real or fictional
- Location: Geographic places (cities, countries, buildings, regions)
- Creature: Non-human living beings (animals, mythical beings, etc.)
- Method: Procedures, techniques, algorithms, workflows
- Organization: Companies, institutions, government bodies, groups
- Content: Creative or informational works (books, articles, films, reports)
- NaturalObject: Natural non-living objects (minerals, celestial bodies, chemical compounds)
---Input Text---
```
Dr. Elena Vasquez led a field expedition to the Borneo rainforest to document the population decline of the Bornean orangutan. Using transect sampling — a method where researchers walk predetermined line paths and record every animal sighting within a fixed distance — her team estimated that fewer than 1,500 individuals remained in the surveyed region.
The expedition was funded by the Global Wildlife Conservation Institute and produced a landmark report titled "Primate Decline in Insular Southeast Asia." Vasquez attributed the collapse primarily to peat-soil destruction caused by palm oil plantation expansion, which had converted over 40% of the surveyed forest area within a decade.
```
---Output---
{
"entities": [
{"name": "Dr. Elena Vasquez", "type": "Person", "description": "Dr. Elena Vasquez is a field researcher who led an expedition to document orangutan population decline in Borneo."},
{"name": "Borneo Rainforest", "type": "Location", "description": "The Borneo rainforest is the field site of the expedition and the primary habitat of the Bornean orangutan."},
{"name": "Bornean Orangutan", "type": "Creature", "description": "The Bornean orangutan is a primate species whose population was found to have declined to fewer than 1,500 individuals in the surveyed region."},
{"name": "Transect Sampling", "type": "Method", "description": "Transect sampling is a wildlife survey technique where researchers walk predetermined paths and record animal sightings within a fixed lateral distance."},
{"name": "Global Wildlife Conservation Institute", "type": "Organization", "description": "The Global Wildlife Conservation Institute funded the expedition led by Dr. Vasquez."},
{"name": "Primate Decline in Insular Southeast Asia", "type": "Content", "description": "A landmark research report produced by Vasquez's expedition documenting primate population decline in the region."},
{"name": "Peat Soil", "type": "NaturalObject", "description": "Peat soil is a natural substrate in the Borneo rainforest that has been destroyed by palm oil plantation expansion."}
],
"relationships": [
{"source": "Dr. Elena Vasquez", "target": "Bornean Orangutan", "keywords": "field research, population survey", "description": "Dr. Vasquez led the expedition that documented the population decline of the Bornean orangutan."},
{"source": "Dr. Elena Vasquez", "target": "Transect Sampling", "keywords": "methodology, research application", "description": "Dr. Vasquez's team used transect sampling to estimate the orangutan population."},
{"source": "Global Wildlife Conservation Institute", "target": "Dr. Elena Vasquez", "keywords": "funding, research support", "description": "The institute funded the expedition led by Dr. Vasquez."},
{"source": "Dr. Elena Vasquez", "target": "Primate Decline in Insular Southeast Asia", "keywords": "authorship, research output", "description": "Dr. Vasquez's expedition produced the landmark report on primate decline."},
{"source": "Peat Soil", "target": "Borneo Rainforest", "keywords": "habitat composition, ecological destruction", "description": "Peat soil destruction in the Borneo rainforest was caused by palm oil plantation expansion and is a primary driver of orangutan decline."}
]
}
""",
"""---Entity Types---
- Content: Creative or informational works (books, articles, films, reports)
- Artifact: Physical or digital objects created by humans (tools, software, devices)
- Person: Human individuals, real or fictional
- Organization: Companies, institutions, government bodies, groups
- Method: Procedures, techniques, algorithms, workflows
- Data: Quantitative or structured information (statistics, datasets, measurements)
- Concept: Abstract ideas, theories, principles, beliefs
---Input Text---
```
The 2023 edition of "Advances in Neural Architecture Search" synthesized findings from over 200 peer-reviewed papers and introduced a new benchmarking framework called NASBench-360, designed to evaluate search algorithms across diverse task domains. The publication was co-authored by Dr. Priya Nair and Dr. Luca Ferretti of the DeepSystems Research Lab.
NASBench-360 measures three key metrics: search efficiency (time-to-solution), model accuracy on held-out test sets, and computational cost in GPU-hours. Early results showed that evolutionary search algorithms outperformed gradient-based methods by 12% on accuracy while consuming 30% fewer GPU-hours on vision tasks.
```
---Output---
{
"entities": [
{"name": "Advances in Neural Architecture Search", "type": "Content", "description": "A 2023 publication that synthesizes findings from over 200 papers and introduces the NASBench-360 benchmarking framework."},
{"name": "NASBench-360", "type": "Artifact", "description": "NASBench-360 is a benchmarking framework introduced to evaluate neural architecture search algorithms across diverse task domains."},
{"name": "Dr. Priya Nair", "type": "Person", "description": "Dr. Priya Nair is a co-author of the publication and a researcher at the DeepSystems Research Lab."},
{"name": "Dr. Luca Ferretti", "type": "Person", "description": "Dr. Luca Ferretti is a co-author of the publication and a researcher at the DeepSystems Research Lab."},
{"name": "DeepSystems Research Lab", "type": "Organization", "description": "The DeepSystems Research Lab is the institution where the co-authors of the publication are affiliated."},
{"name": "Evolutionary Search", "type": "Method", "description": "Evolutionary search is a class of neural architecture search algorithms that outperformed gradient-based methods in the NASBench-360 evaluation."},
{"name": "Gradient-Based Search", "type": "Method", "description": "Gradient-based search is a class of neural architecture search algorithms that was benchmarked against evolutionary search in NASBench-360."},
{"name": "GPU-Hours", "type": "Data", "description": "GPU-hours is a metric used in NASBench-360 to measure the computational cost of neural architecture search algorithms."},
{"name": "Neural Architecture Search", "type": "Concept", "description": "Neural architecture search is the automated process of designing optimal neural network architectures, the central topic of the publication."}
],
"relationships": [
{"source": "Dr. Priya Nair", "target": "Advances in Neural Architecture Search", "keywords": "authorship", "description": "Dr. Priya Nair co-authored the publication."},
{"source": "Dr. Luca Ferretti", "target": "Advances in Neural Architecture Search", "keywords": "authorship", "description": "Dr. Luca Ferretti co-authored the publication."},
{"source": "Advances in Neural Architecture Search", "target": "NASBench-360", "keywords": "introduces, benchmarking", "description": "The publication introduced the NASBench-360 framework."},
{"source": "Evolutionary Search", "target": "Gradient-Based Search", "keywords": "performance comparison", "description": "Evolutionary search outperformed gradient-based methods by 12% on accuracy and used 30% fewer GPU-hours on vision tasks."},
{"source": "NASBench-360", "target": "GPU-Hours", "keywords": "evaluation metric", "description": "NASBench-360 uses GPU-hours as one of three key metrics to measure computational cost."}
]
}
""",
]
  365. PROMPTS["summarize_entity_descriptions"] = """---Role---
  366. You are a Knowledge Graph Specialist, proficient in data curation and synthesis.
  367. ---Task---
  368. Your task is to synthesize a list of descriptions of a given entity or relation into a single, comprehensive, and cohesive summary.
  369. ---Instructions---
  370. 1. Input Format: The description list is provided in JSON format. Each JSON object (representing a single description) appears on a new line within the `Description List` section.
  371. 2. Output Format: The merged description will be returned as plain text, presented in multiple paragraphs, without any additional formatting or extraneous comments before or after the summary.
  372. 3. Comprehensiveness: The summary must integrate all key information from *every* provided description. Do not omit any important facts or details.
  373. 4. Context: Ensure the summary is written from an objective, third-person perspective; explicitly mention the name of the entity or relation for full clarity and context.
  374. 5. Context & Objectivity:
  375. - Write the summary from an objective, third-person perspective.
  376. - Explicitly mention the full name of the entity or relation at the beginning of the summary to ensure immediate clarity and context.
  377. 6. Conflict Handling:
  378. - In cases of conflicting or inconsistent descriptions, first determine if these conflicts arise from multiple, distinct entities or relationships that share the same name.
  379. - If distinct entities/relations are identified, summarize each one *separately* within the overall output.
  380. - If conflicts within a single entity/relation (e.g., historical discrepancies) exist, attempt to reconcile them or present both viewpoints with noted uncertainty.
  381. 7. Length Constraint:The summary's total length must not exceed {summary_length} tokens, while still maintaining depth and completeness.
  382. 8. Language: The entire output must be written in {language}. Proper nouns (e.g., personal names, place names, organization names) may in their original language if proper translation is not available.
  383. - The entire output must be written in {language}.
  384. - Proper nouns (e.g., personal names, place names, organization names) should be retained in their original language if a proper, widely accepted translation is not available or would cause ambiguity.
  385. ---Input---
  386. {description_type} Name: {description_name}
  387. Description List:
  388. ```
  389. {description_list}
  390. ```
  391. ---Output---
  392. """
# Canned reply used when no answer can be produced.
# NOTE(review): the trailing "[no-context]" looks like a machine-readable
# marker for callers — confirm where it is consumed before changing the text.
PROMPTS["fail_response"] = (
    "Sorry, I'm not able to provide an answer to that question.[no-context]"
)
  396. PROMPTS["rag_response"] = """---Role---
  397. You are an expert AI assistant specializing in synthesizing information from a provided knowledge base. Your primary function is to answer user queries accurately by ONLY using the information within the provided **Context**.
  398. ---Goal---
  399. Generate a comprehensive, well-structured answer to the user query.
  400. The answer must integrate relevant facts from the Knowledge Graph and Document Chunks found in the **Context**.
  401. Consider the conversation history if provided to maintain conversational flow and avoid repeating information.
  402. ---Instructions---
  403. 1. Step-by-Step Instruction:
  404. - Carefully determine the user's query intent in the context of the conversation history to fully understand the user's information need.
  405. - Scrutinize both `Knowledge Graph Data` and `Document Chunks` in the **Context**. Identify and extract all pieces of information that are directly relevant to answering the user query.
  406. - Weave the extracted facts into a coherent and logical response. Your own knowledge must ONLY be used to formulate fluent sentences and connect ideas, NOT to introduce any external information.
  407. - Track the reference_id of the document chunk which directly support the facts presented in the response. Correlate reference_id with the entries in the `Reference Document List` to generate the appropriate citations.
  408. - Generate a references section at the end of the response. Each reference document must directly support the facts presented in the response.
  409. - Do not generate anything after the reference section.
  410. 2. Content & Grounding:
  411. - Strictly adhere to the provided context from the **Context**; DO NOT invent, assume, or infer any information not explicitly stated.
  412. - If the answer cannot be found in the **Context**, state that you do not have enough information to answer. Do not attempt to guess.
  413. 3. Formatting & Language:
  414. - The response MUST be in the same language as the user query.
  415. - The response MUST utilize Markdown formatting for enhanced clarity and structure (e.g., headings, bold text, bullet points).
  416. - The response should be presented in {response_type}.
  417. 4. References Section Format:
  418. - The References section should be under heading: `### References`
  419. - Reference list entries should adhere to the format: `* [n] Document Title`. Do not include a caret (`^`) after opening square bracket (`[`).
  420. - The Document Title in the citation must retain its original language.
  421. - Output each citation on an individual line
  422. - Provide maximum of 5 most relevant citations.
  423. - Do not generate footnotes section or any comment, summary, or explanation after the references.
  424. 5. Reference Section Example:
  425. ```
  426. ### References
  427. - [1] Document Title One
  428. - [2] Document Title Two
  429. - [3] Document Title Three
  430. ```
  431. 6. Additional Instructions: {user_prompt}
  432. ---Context---
  433. {context_data}
  434. """
  435. PROMPTS["naive_rag_response"] = """---Role---
  436. You are an expert AI assistant specializing in synthesizing information from a provided knowledge base. Your primary function is to answer user queries accurately by ONLY using the information within the provided **Context**.
  437. ---Goal---
  438. Generate a comprehensive, well-structured answer to the user query.
  439. The answer must integrate relevant facts from the Document Chunks found in the **Context**.
  440. Consider the conversation history if provided to maintain conversational flow and avoid repeating information.
  441. ---Instructions---
  442. 1. Step-by-Step Instruction:
  443. - Carefully determine the user's query intent in the context of the conversation history to fully understand the user's information need.
  444. - Scrutinize `Document Chunks` in the **Context**. Identify and extract all pieces of information that are directly relevant to answering the user query.
  445. - Weave the extracted facts into a coherent and logical response. Your own knowledge must ONLY be used to formulate fluent sentences and connect ideas, NOT to introduce any external information.
  446. - Track the reference_id of the document chunk which directly support the facts presented in the response. Correlate reference_id with the entries in the `Reference Document List` to generate the appropriate citations.
  447. - Generate a **References** section at the end of the response. Each reference document must directly support the facts presented in the response.
  448. - Do not generate anything after the reference section.
  449. 2. Content & Grounding:
  450. - Strictly adhere to the provided context from the **Context**; DO NOT invent, assume, or infer any information not explicitly stated.
  451. - If the answer cannot be found in the **Context**, state that you do not have enough information to answer. Do not attempt to guess.
  452. 3. Formatting & Language:
  453. - The response MUST be in the same language as the user query.
  454. - The response MUST utilize Markdown formatting for enhanced clarity and structure (e.g., headings, bold text, bullet points).
  455. - The response should be presented in {response_type}.
  456. 4. References Section Format:
  457. - The References section should be under heading: `### References`
  458. - Reference list entries should adhere to the format: `* [n] Document Title`. Do not include a caret (`^`) after opening square bracket (`[`).
  459. - The Document Title in the citation must retain its original language.
  460. - Output each citation on an individual line
  461. - Provide maximum of 5 most relevant citations.
  462. - Do not generate footnotes section or any comment, summary, or explanation after the references.
  463. 5. Reference Section Example:
  464. ```
  465. ### References
  466. - [1] Document Title One
  467. - [2] Document Title Two
  468. - [3] Document Title Three
  469. ```
  470. 6. Additional Instructions: {user_prompt}
  471. ---Context---
  472. {content_data}
  473. """
# Context layout for knowledge-graph queries: entities, relationships,
# document chunks, and the reference document list, each in its own fenced
# block.
# Placeholders: {entities_str}, {relations_str}, {text_chunks_str},
# {reference_list_str}.
PROMPTS["kg_query_context"] = """
Knowledge Graph Data (Entity):
```json
{entities_str}
```
Knowledge Graph Data (Relationship):
```json
{relations_str}
```
Document Chunks (Each entry has a reference_id refer to the `Reference Document List`):
```json
{text_chunks_str}
```
Reference Document List (Each entry starts with a [reference_id] that corresponds to entries in the Document Chunks):
```
{reference_list_str}
```
"""
# Context layout for chunk-only queries: document chunks followed by the
# reference document list.
# Placeholders: {text_chunks_str}, {reference_list_str}.
PROMPTS["naive_query_context"] = """
Document Chunks (Each entry has a reference_id refer to the `Reference Document List`):
```json
{text_chunks_str}
```
Reference Document List (Each entry starts with a [reference_id] that corresponds to entries in the Document Chunks):
```
{reference_list_str}
```
"""
# Template that asks the model to split a user query into high-level and
# low-level keywords, returned as a bare JSON object.
# Placeholders: {language}, {examples}, {query}.
# Doubled braces are literal braces escaped for template formatting.
# NOTE(review): instruction 4 requires keywords to be derived from the query,
# while the built-in examples list terms that do not appear in their queries
# (e.g. "Tariffs") — confirm which behaviour is intended.
PROMPTS["keywords_extraction"] = """---Role---
You are an expert keyword extractor, specializing in analyzing user queries for a Retrieval-Augmented Generation (RAG) system. Your purpose is to identify both high-level and low-level keywords in the user's query that will be used for effective document retrieval.
---Goal---
Given a user query, your task is to extract two distinct types of keywords:
1. **high_level_keywords**: for overarching concepts or themes, capturing user's core intent, the subject area, or the type of question being asked.
2. **low_level_keywords**: for specific entities or details, identifying the specific entities, proper nouns, technical jargon, product names, or concrete items.
---Instructions & Constraints---
1. **Output Format**: Your output MUST be a valid JSON object and nothing else. Do not include any explanatory text, markdown code fences (like ```json), comments, or any other text before or after the JSON.
2. **Exact JSON Shape**: The JSON object must contain exactly these two keys:
- `"high_level_keywords"`: an array of strings
- `"low_level_keywords"`: an array of strings
3. **JSON Boundary**: The first character of your response must be `{{` and the last character must be `}}`.
4. **Source of Truth**: All keywords must be explicitly derived from the user query. Do not infer unsupported facts. Do not invent entities, products, organizations, dates, or technical terms that are not grounded in the query.
5. **Concise & Meaningful**: Keywords should be concise words or meaningful phrases. Prioritize multi-word phrases when they represent a single concept. For example, from "latest financial report of Apple Inc.", extract "latest financial report" and "Apple Inc." rather than "latest", "financial", "report", and "Apple".
6. **Handle Edge Cases**: For queries that are too simple, vague, or nonsensical (e.g., "hello", "ok", "asdfghjkl"), return:
`{{"high_level_keywords": [], "low_level_keywords": []}}`
7. **No Duplicates**: Do not repeat the same keyword within a list. Keep the lists short and high-signal.
8. **Language**: All extracted keywords MUST be in {language}. Proper nouns (e.g., personal names, place names, organization names) should be kept in their original language.
---Examples---
{examples}
---Real Data---
User Query: {query}
---Output---
Output:"""
# Built-in few-shot examples for keyword extraction. Each element pairs a
# query with the expected JSON object containing `high_level_keywords` and
# `low_level_keywords`.
# NOTE(review): braces are single (unescaped), so these are presumably
# substituted as values of the `{examples}` placeholder rather than formatted
# themselves — confirm against the caller.
PROMPTS["keywords_extraction_examples"] = [
"""Example 1:
Query: "How does international trade influence global economic stability?"
Output:
{
"high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
"low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
}
""",
"""Example 2:
Query: "What are the environmental consequences of deforestation on biodiversity?"
Output:
{
"high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
"low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
}
""",
"""Example 3:
Query: "What is the role of education in reducing poverty?"
Output:
{
"high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
"low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
}
""",
]
class EntityExtractionPromptProfile(TypedDict):
    """Fully resolved prompt inputs for entity extraction."""

    # Text describing the entity types the model should extract.
    entity_types_guidance: str
    # Few-shot examples used when extraction runs in text mode.
    entity_extraction_examples: list[str]
    # Few-shot examples used when extraction runs in JSON mode.
    entity_extraction_json_examples: list[str]
  556. def get_default_entity_extraction_prompt_profile() -> EntityExtractionPromptProfile:
  557. """Return a copy of the built-in entity extraction prompt profile."""
  558. return {
  559. "entity_types_guidance": PROMPTS["default_entity_types_guidance"].rstrip(),
  560. "entity_extraction_examples": [
  561. example.rstrip() for example in PROMPTS["entity_extraction_examples"]
  562. ],
  563. "entity_extraction_json_examples": [
  564. example.rstrip() for example in PROMPTS["entity_extraction_json_examples"]
  565. ],
  566. }
  567. _ALLOWED_PROMPT_SUFFIXES = frozenset({".yml", ".yaml"})
  568. _DEFAULT_PROMPT_DIR = "./prompts"
  569. _ENTITY_TYPE_SUBDIR = "entity_type"
  570. def get_entity_type_prompt_dir() -> Path:
  571. """Return the directory for entity type prompt profiles.
  572. Resolves ``PROMPT_DIR`` (defaults to ``./prompts`` relative to the current
  573. working directory, mirroring ``INPUT_DIR`` / ``WORKING_DIR``) and appends
  574. the hard-coded ``entity_type`` subdirectory. Profile files are provided by
  575. the user at runtime and are not shipped with the distribution. The
  576. file-name sandbox in :func:`resolve_entity_type_prompt_path` ensures
  577. user-supplied file names cannot escape the resolved directory.
  578. """
  579. configured = os.getenv("PROMPT_DIR", "").strip() or _DEFAULT_PROMPT_DIR
  580. return (Path(configured).expanduser() / _ENTITY_TYPE_SUBDIR).resolve()
  581. def resolve_entity_type_prompt_path(prompt_file_name: str | Path) -> Path:
  582. """Resolve an allowlisted prompt profile file name to an absolute path."""
  583. file_name = str(prompt_file_name).strip()
  584. if not file_name:
  585. raise ValueError(
  586. "ENTITY_TYPE_PROMPT_FILE must be a file name such as "
  587. "'entity_type_prompt.sample.yml'."
  588. )
  589. if "\\" in file_name:
  590. raise ValueError(
  591. "ENTITY_TYPE_PROMPT_FILE must not contain directory separators. "
  592. "Only file names inside PROMPT_DIR/entity_type are allowed."
  593. )
  594. candidate = Path(file_name)
  595. if (
  596. candidate.is_absolute()
  597. or candidate.name != file_name
  598. or ".." in candidate.parts
  599. ):
  600. raise ValueError(
  601. "ENTITY_TYPE_PROMPT_FILE must be a file name only. "
  602. "Files are loaded from PROMPT_DIR/entity_type "
  603. "(PROMPT_DIR defaults to ./prompts)."
  604. )
  605. if candidate.suffix.lower() not in _ALLOWED_PROMPT_SUFFIXES:
  606. raise ValueError(
  607. "ENTITY_TYPE_PROMPT_FILE must use a '.yml' or '.yaml' extension."
  608. )
  609. return get_entity_type_prompt_dir() / candidate.name
  610. def _normalize_prompt_examples(
  611. value: Any, field_name: str, profile_path: Path
  612. ) -> list[str]:
  613. if not isinstance(value, list):
  614. raise ValueError(
  615. f"ENTITY_TYPE_PROMPT_FILE '{profile_path}' field '{field_name}' "
  616. "must be a list of strings."
  617. )
  618. normalized: list[str] = []
  619. for index, item in enumerate(value):
  620. if not isinstance(item, str) or not item.strip():
  621. raise ValueError(
  622. f"ENTITY_TYPE_PROMPT_FILE '{profile_path}' field '{field_name}' "
  623. f"item {index} must be a non-empty string."
  624. )
  625. normalized.append(item.rstrip())
  626. return normalized
  627. def load_entity_extraction_prompt_profile(
  628. prompt_file: str | Path,
  629. ) -> dict[str, Any]:
  630. """Load and validate an entity extraction prompt profile from YAML."""
  631. profile_path = Path(prompt_file)
  632. if not profile_path.exists():
  633. raise FileNotFoundError(
  634. f"ENTITY_TYPE_PROMPT_FILE '{profile_path}' does not exist."
  635. )
  636. if not profile_path.is_file():
  637. raise ValueError(
  638. f"ENTITY_TYPE_PROMPT_FILE '{profile_path}' must point to a file."
  639. )
  640. try:
  641. content = profile_path.read_text(encoding="utf-8")
  642. except OSError as exc:
  643. raise OSError(
  644. f"Failed to read ENTITY_TYPE_PROMPT_FILE '{profile_path}': {exc}"
  645. ) from exc
  646. try:
  647. raw_profile = yaml.safe_load(content)
  648. except yaml.YAMLError as exc:
  649. raise ValueError(
  650. f"ENTITY_TYPE_PROMPT_FILE '{profile_path}' contains invalid YAML: {exc}"
  651. ) from exc
  652. if raw_profile is None:
  653. raw_profile = {}
  654. if not isinstance(raw_profile, dict):
  655. raise ValueError(
  656. f"ENTITY_TYPE_PROMPT_FILE '{profile_path}' must contain a YAML mapping."
  657. )
  658. profile: dict[str, Any] = {}
  659. guidance = raw_profile.get("entity_types_guidance")
  660. if guidance is not None:
  661. if not isinstance(guidance, str) or not guidance.strip():
  662. raise ValueError(
  663. f"ENTITY_TYPE_PROMPT_FILE '{profile_path}' field "
  664. "'entity_types_guidance' must be a non-empty string."
  665. )
  666. profile["entity_types_guidance"] = guidance.rstrip()
  667. for field_name in (
  668. "entity_extraction_examples",
  669. "entity_extraction_json_examples",
  670. ):
  671. if field_name in raw_profile:
  672. profile[field_name] = _normalize_prompt_examples(
  673. raw_profile[field_name], field_name, profile_path
  674. )
  675. return profile
  def resolve_entity_extraction_prompt_profile(
      addon_params: Mapping[str, Any] | None,
      use_json: bool,
  ) -> EntityExtractionPromptProfile:
      """Build the effective entity extraction prompt profile.

      Sources are layered as follows:

      * ``entity_types_guidance``: ``addon_params`` first, then the prompt
        file named by ``addon_params["entity_type_prompt_file"]``, then the
        profile returned by ``get_default_entity_extraction_prompt_profile()``.
      * both examples collections: the prompt file first, then the defaults.

      Args:
          addon_params: Optional settings mapping; ``None`` is treated as an
              empty mapping.
          use_json: ``True`` when extraction runs in json mode, ``False`` for
              text mode. Selects which examples key a configured prompt file
              is required to define.

      Returns:
          A dict holding ``entity_types_guidance`` and new list copies of
          ``entity_extraction_examples`` and
          ``entity_extraction_json_examples``.

      Raises:
          ValueError: If a prompt file is configured but does not define the
              examples key for the active mode, or if
              ``addon_params["entity_types_guidance"]`` is present but is not
              a non-empty string.

      Errors raised while resolving or loading the prompt file are not caught
      here and propagate to the caller.
      """
      defaults = get_default_entity_extraction_prompt_profile()
      params = addon_params or {}

      overrides: dict[str, Any] = {}
      configured_file = params.get("entity_type_prompt_file")
      if configured_file:
          overrides = load_entity_extraction_prompt_profile(
              resolve_entity_type_prompt_path(configured_file)
          )
          if use_json:
              examples_key = "entity_extraction_json_examples"
              mode_label = "json"
          else:
              examples_key = "entity_extraction_examples"
              mode_label = "text"
          # A custom file must cover the mode that is actually in use; the
          # other mode's examples may still fall back to the defaults below.
          if examples_key not in overrides:
              raise ValueError(
                  f"ENTITY_TYPE_PROMPT_FILE '{configured_file}' must define "
                  f"'{examples_key}' when entity extraction runs in "
                  f"{mode_label} mode."
              )

      guidance = params.get("entity_types_guidance")
      if guidance is not None:
          # An explicit override must be usable text.
          if not (isinstance(guidance, str) and guidance.strip()):
              raise ValueError(
                  "addon_params['entity_types_guidance'] must be a non-empty string."
              )
      else:
          guidance = overrides.get(
              "entity_types_guidance", defaults["entity_types_guidance"]
          )

      resolved: dict[str, Any] = {"entity_types_guidance": guidance}
      for key in (
          "entity_extraction_examples",
          "entity_extraction_json_examples",
      ):
          # list(...) copies so callers never share the source collections.
          resolved[key] = list(overrides.get(key, defaults[key]))
      return resolved
  def validate_entity_extraction_prompt_profile_for_mode(
      prompt_profile: Mapping[str, Any],
      use_json: bool,
      prompt_file_name: str | None = None,
  ) -> EntityExtractionPromptProfile:
      """Validate the profile for the active mode and return a normalized copy.

      Only the examples for the active mode are required. The examples for
      the other mode are optional: when that key is missing or ``None`` it is
      returned as an empty list instead of failing after validation passed.

      Args:
          prompt_profile: Mapping with ``entity_types_guidance`` and the
              examples collections.
          use_json: ``True`` to require ``entity_extraction_json_examples``,
              ``False`` to require ``entity_extraction_examples``.
          prompt_file_name: Optional prompt file name, used only to make the
              error message point at the file.

      Returns:
          A new dict in which the guidance and every example have been
          converted with ``str()`` and stripped of trailing whitespace.

      Raises:
          ValueError: If the active-mode examples key is missing or empty.
          KeyError: If ``entity_types_guidance`` is missing from the profile.
      """
      required_examples_key = (
          "entity_extraction_json_examples" if use_json else "entity_extraction_examples"
      )
      if not prompt_profile.get(required_examples_key):
          mode_name = "json" if use_json else "text"
          source = (
              f"ENTITY_TYPE_PROMPT_FILE '{prompt_file_name}'"
              if prompt_file_name
              else "the resolved prompt profile"
          )
          raise ValueError(
              f"{source} must define '{required_examples_key}' when entity extraction "
              f"runs in {mode_name} mode."
          )

      normalized: dict[str, Any] = {
          "entity_types_guidance": str(prompt_profile["entity_types_guidance"]).rstrip(),
      }
      for examples_key in (
          "entity_extraction_examples",
          "entity_extraction_json_examples",
      ):
          # The active-mode key is known to be non-empty at this point; the
          # inactive one may be absent or None, which normalizes to [].
          examples = prompt_profile.get(examples_key) or ()
          normalized[examples_key] = [str(example).rstrip() for example in examples]
      return normalized