knowledgeBaseService.cjs 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362
  1. const crypto = require('node:crypto');
  2. const fs = require('node:fs');
  3. const fsp = require('node:fs/promises');
  4. const path = require('node:path');
  5. const { dialog } = require('electron');
  6. const { getKnowledgeBaseDir } = require('../utils/paths.cjs');
  7. const { deleteImportedImageBatches } = require('../utils/importedImages.cjs');
  8. const { parseDocumentWithConfig } = require('./fileService.cjs');
  9. const supportedExtensions = new Set(['.doc', '.docx', '.wps', '.pdf', '.md', '.markdown']);
  10. const oversizedBlockChars = 8000;
  11. const semanticMergeTargetChars = 500;
  12. const recoveryMaxAttempts = 2;
  13. function now() {
  14. return new Date().toISOString();
  15. }
  16. function createId(prefix) {
  17. return `${prefix}-${crypto.randomUUID()}`;
  18. }
  19. function safeName(name) {
  20. return String(name || '未命名').replace(/[<>:"/\\|?*\x00-\x1F]+/g, '_').trim() || '未命名';
  21. }
  22. function createEmptyIndex() {
  23. return { folders: [], documents: [] };
  24. }
  25. function normalizeDocument(document) {
  26. const documentDir = document?.document_dir || path.join('folders', document?.folder_id || 'unknown', 'documents', document?.id || createId('doc')).replace(/\\/g, '/');
  27. return {
  28. ...document,
  29. document_dir: documentDir,
  30. source_path: document?.source_path || path.join(documentDir, 'source').replace(/\\/g, '/'),
  31. markdown_path: document?.markdown_path || path.join(documentDir, 'content.md').replace(/\\/g, '/'),
  32. blocks_path: document?.blocks_path || path.join(documentDir, 'blocks.json').replace(/\\/g, '/'),
  33. filtered_blocks_path: document?.filtered_blocks_path || path.join(documentDir, 'filtered_blocks.json').replace(/\\/g, '/'),
  34. candidate_items_path: document?.candidate_items_path || path.join(documentDir, 'candidate_items.json').replace(/\\/g, '/'),
  35. match_result_path: document?.match_result_path || path.join(documentDir, 'match_result.json').replace(/\\/g, '/'),
  36. report_path: document?.report_path || path.join(documentDir, 'report.json').replace(/\\/g, '/'),
  37. items_path: document?.items_path || path.join(documentDir, 'items.json').replace(/\\/g, '/'),
  38. item_count: Number(document?.item_count || 0),
  39. block_count: Number(document?.block_count || 0),
  40. filtered_block_count: Number(document?.filtered_block_count || 0),
  41. candidate_item_count: Number(document?.candidate_item_count || 0),
  42. discarded_block_count: Number(document?.discarded_block_count || 0),
  43. system_discarded_after_retry_count: Number(document?.system_discarded_after_retry_count || 0),
  44. };
  45. }
  46. function normalizeIndex(index) {
  47. return {
  48. folders: Array.isArray(index?.folders) ? index.folders : [],
  49. documents: Array.isArray(index?.documents) ? index.documents.map(normalizeDocument) : [],
  50. };
  51. }
  52. function ensureDir(dir) {
  53. fs.mkdirSync(dir, { recursive: true });
  54. }
  55. function readJson(filePath, fallback) {
  56. if (!fs.existsSync(filePath)) {
  57. return fallback;
  58. }
  59. return JSON.parse(fs.readFileSync(filePath, 'utf-8'));
  60. }
  61. function writeJson(filePath, value) {
  62. ensureDir(path.dirname(filePath));
  63. fs.writeFileSync(filePath, JSON.stringify(value, null, 2), 'utf-8');
  64. }
  65. function getIndexPath(baseDir) {
  66. return path.join(baseDir, 'index.json');
  67. }
  68. function getDebugLogsDir(app) {
  69. return path.join(app.getPath('userData'), 'logs', 'knowledge-base');
  70. }
  71. function getDebugLogPath(app, documentId) {
  72. return path.join(getDebugLogsDir(app), `${safeName(documentId)}.jsonl`);
  73. }
  74. function fromRelative(baseDir, relativePath) {
  75. return path.join(baseDir, relativePath || '');
  76. }
  77. function getPromptSummary(messages) {
  78. return (messages || []).map((message, index) => ({
  79. index: index + 1,
  80. role: message.role,
  81. chars: String(message.content || '').length,
  82. }));
  83. }
  84. function getItemSample(items) {
  85. return (items || []).slice(0, 8).map((item) => ({
  86. id: item.id,
  87. title: item.title,
  88. summary_chars: String(item.summary || item.resume || '').length,
  89. }));
  90. }
  91. function getMatchSummary(matches) {
  92. return (matches || []).map((match) => ({
  93. id: match.id,
  94. range_count: match.ranges?.length || 0,
  95. block_count: match.block_ids?.length || 0,
  96. }));
  97. }
  98. function stripMarkdownFence(content) {
  99. return String(content || '').replace(/^```[\s\S]*?\n/, '').replace(/```$/g, '').trim();
  100. }
  101. function splitOversizedText(text, limit) {
  102. const parts = [];
  103. let buffer = '';
  104. const sentences = String(text || '').split(/(?<=[。!?!?;;])\s*/);
  105. for (const sentence of sentences) {
  106. if (!sentence) continue;
  107. if (buffer && buffer.length + sentence.length > limit) {
  108. parts.push(buffer.trim());
  109. buffer = '';
  110. }
  111. buffer += sentence;
  112. }
  113. if (buffer.trim()) {
  114. parts.push(buffer.trim());
  115. }
  116. return parts.length ? parts : [String(text || '')];
  117. }
  118. function normalizeRepeatedText(text) {
  119. return String(text || '')
  120. .replace(/^#+\s*/, '')
  121. .replace(/\s+/g, '')
  122. .replace(/[\-—_·.。::|第页共]/g, '')
  123. .trim()
  124. .toLowerCase();
  125. }
  126. function isPageNumberBlock(text) {
  127. const normalized = String(text || '').trim();
  128. const compact = normalized.replace(/\s+/g, '');
  129. return /^[-—_]*\d+[-—_]*$/.test(compact)
  130. || /^第\d+页(共\d+页)?$/.test(compact)
  131. || /^\d+\/\d+$/.test(compact)
  132. || /^page\d+(of\d+)?$/i.test(compact);
  133. }
  134. function isCatalogBlock(text) {
  135. const normalized = String(text || '').trim();
  136. const compact = normalized.replace(/\s+/g, '');
  137. if (/^(#+)?(目录|目次|contents)$/i.test(compact)) {
  138. return true;
  139. }
  140. const lines = normalized.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
  141. if (lines.length < 2) {
  142. return false;
  143. }
  144. const catalogLines = lines.filter((line) => /(?:\.{2,}|…{2,}|·{2,}|\s{4,})\s*\d+\s*$/.test(line));
  145. return catalogLines.length >= Math.ceil(lines.length * 0.6);
  146. }
  147. function isCoverBlock(text, index) {
  148. if (index > 12) {
  149. return false;
  150. }
  151. const normalized = String(text || '').trim();
  152. const compact = normalized.replace(/\s+/g, '');
  153. if (!compact || compact.length > 220) {
  154. return false;
  155. }
  156. const coverMarkers = ['投标文件', '投标书', '正本', '副本', '项目名称', '招标编号', '投标人', '编制日期', '日期:', '日期:'];
  157. const hasMarker = coverMarkers.some((marker) => compact.includes(marker));
  158. const hasLongSentence = /[。!?;]/.test(normalized) && normalized.length > 80;
  159. return hasMarker && !hasLongSentence;
  160. }
  161. function isSignatureBlock(text) {
  162. const normalized = String(text || '').trim();
  163. const compact = normalized.replace(/\s+/g, '');
  164. if (!compact || compact.length > 260) {
  165. return false;
  166. }
  167. if (/(签字确认|用户签字|双方责任人.{0,12}签字)/.test(compact)) {
  168. return false;
  169. }
  170. return /(盖章|签章|签名|法定代表人|授权代表|委托代理人|被授权人|年月日|投标人代表签字|代表签字)/.test(compact)
  171. && !/[。!?;].{20,}/.test(normalized);
  172. }
  173. function getContentCharCount(text) {
  174. return String(text || '').replace(/\s+/g, '').length;
  175. }
  176. function stripBoldMarker(text) {
  177. return String(text || '').trim().replace(/^\*\*(.+)\*\*$/, '$1').trim();
  178. }
  179. function isTableBlock(block) {
  180. return /^<table[\s>]/i.test(String(block?.content || '').trim());
  181. }
  182. function isSemanticHeadingBlock(block) {
  183. const original = String(block?.content || '').trim();
  184. const normalized = stripBoldMarker(original);
  185. const compactLength = getContentCharCount(normalized);
  186. if (!normalized || compactLength > 100) {
  187. return false;
  188. }
  189. if (/[。!?;;]$/.test(normalized)) {
  190. return false;
  191. }
  192. return /^\*\*.+\*\*$/.test(original)
  193. || /^\d+(?:\.\d+)+\s*[^。!?;;]{1,80}$/.test(normalized)
  194. || /^\d+\.\s*[^。!?;;]{1,80}$/.test(normalized)
  195. || /^[一二三四五六七八九十]+[、..]\s*[^。!?;;]{1,80}$/.test(normalized)
  196. || /^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳][、..]?\s*[^。!?;;]{1,80}$/.test(normalized)
  197. || /^([一二三四五六七八九十]+)\s*[^。!?;;]{1,80}$/.test(normalized)
  198. || /^第[一二三四五六七八九十\d]+[章节部分篇]\s*[^。!?;;]{0,80}$/.test(normalized);
  199. }
  200. function mergeSemanticBlocks(rawBlocks) {
  201. const merged = [];
  202. let buffer = [];
  203. function bufferText() {
  204. return buffer.map((block) => block.content).join('\n\n');
  205. }
  206. function bufferHasOnlyHeadings() {
  207. return buffer.length > 0 && buffer.every(isSemanticHeadingBlock);
  208. }
  209. function flushBuffer() {
  210. if (!buffer.length) {
  211. return;
  212. }
  213. merged.push({
  214. ...buffer[0],
  215. id: `R${String(merged.length + 1).padStart(6, '0')}`,
  216. type: buffer.some((block) => block.type === 'list') ? 'list' : 'paragraph',
  217. content: bufferText().trim(),
  218. });
  219. buffer = [];
  220. }
  221. function pushStandalone(block) {
  222. merged.push({
  223. ...block,
  224. id: `R${String(merged.length + 1).padStart(6, '0')}`,
  225. });
  226. }
  227. for (const block of rawBlocks) {
  228. if (isTableBlock(block)) {
  229. flushBuffer();
  230. pushStandalone(block);
  231. continue;
  232. }
  233. if (isSemanticHeadingBlock(block)) {
  234. if (buffer.length && !bufferHasOnlyHeadings() && getContentCharCount(bufferText()) >= 100) {
  235. flushBuffer();
  236. }
  237. buffer.push(block);
  238. continue;
  239. }
  240. const blockChars = getContentCharCount(block.content);
  241. if (!buffer.length && blockChars >= semanticMergeTargetChars) {
  242. pushStandalone(block);
  243. continue;
  244. }
  245. buffer.push(block);
  246. if (getContentCharCount(bufferText()) >= semanticMergeTargetChars) {
  247. flushBuffer();
  248. }
  249. }
  250. flushBuffer();
  251. return merged;
  252. }
  253. function createRawBlocks(markdown) {
  254. const blocks = [];
  255. const lines = String(markdown || '').split(/\r?\n/);
  256. let buffer = [];
  257. let currentType = 'paragraph';
  258. const headings = [];
  259. function pushBuffer() {
  260. const content = buffer.join('\n').trim();
  261. if (!content) {
  262. buffer = [];
  263. return;
  264. }
  265. const chunks = content.length > oversizedBlockChars ? splitOversizedText(content, Math.floor(oversizedBlockChars * 0.75)) : [content];
  266. for (const chunk of chunks) {
  267. blocks.push({
  268. id: `R${String(blocks.length + 1).padStart(6, '0')}`,
  269. type: currentType,
  270. heading_path: headings.filter(Boolean),
  271. content: chunk,
  272. });
  273. }
  274. buffer = [];
  275. }
  276. for (const line of lines) {
  277. const headingMatch = /^(#{1,6})\s+(.+)$/.exec(line);
  278. if (headingMatch) {
  279. pushBuffer();
  280. const level = headingMatch[1].length;
  281. headings.splice(level - 1);
  282. headings[level - 1] = headingMatch[2].trim();
  283. currentType = 'heading';
  284. buffer = [line];
  285. pushBuffer();
  286. currentType = 'paragraph';
  287. continue;
  288. }
  289. const trimmed = line.trim();
  290. if (!trimmed) {
  291. pushBuffer();
  292. currentType = 'paragraph';
  293. continue;
  294. }
  295. const nextType = /^\s*\|.*\|\s*$/.test(line)
  296. ? 'table'
  297. : /^\s*(?:[-*+]\s+|\d+[.)、]\s+)/.test(line)
  298. ? 'list'
  299. : 'paragraph';
  300. if (buffer.length && currentType !== nextType && (currentType !== 'paragraph' || nextType !== 'paragraph')) {
  301. pushBuffer();
  302. }
  303. currentType = nextType;
  304. buffer.push(line);
  305. }
  306. pushBuffer();
  307. return blocks;
  308. }
  309. function filterBlocks(rawBlocks) {
  310. const repeatedCounts = new Map();
  311. rawBlocks.forEach((block) => {
  312. const key = normalizeRepeatedText(block.content);
  313. if (key && key.length <= 80) {
  314. repeatedCounts.set(key, (repeatedCounts.get(key) || 0) + 1);
  315. }
  316. });
  317. const kept = [];
  318. const filtered = [];
  319. rawBlocks.forEach((block, index) => {
  320. const repeatedKey = normalizeRepeatedText(block.content);
  321. const repeated = repeatedKey && repeatedKey.length <= 80 && repeatedCounts.get(repeatedKey) >= 3;
  322. const reason = !String(block.content || '').trim()
  323. ? 'empty'
  324. : isPageNumberBlock(block.content)
  325. ? 'page_number'
  326. : getContentCharCount(block.content) < 100
  327. ? 'too_short'
  328. : isCatalogBlock(block.content)
  329. ? 'catalog'
  330. : repeated
  331. ? 'repeated_header_footer'
  332. : isCoverBlock(block.content, index)
  333. ? 'cover'
  334. : isSignatureBlock(block.content)
  335. ? 'signature_page'
  336. : '';
  337. if (reason) {
  338. filtered.push({ ...block, reason });
  339. return;
  340. }
  341. kept.push({
  342. ...block,
  343. id: `P${String(kept.length + 1).padStart(6, '0')}`,
  344. });
  345. });
  346. return { blocks: kept, filtered_blocks: filtered };
  347. }
  348. function renderBlocksForPrompt(blocks) {
  349. return blocks.map((block) => {
  350. const headingPath = block.heading_path?.length ? block.heading_path.join(' > ') : '无';
  351. return [
  352. `[${block.id}]`,
  353. `type: ${block.type}`,
  354. `heading_path: ${headingPath}`,
  355. 'text:',
  356. block.content,
  357. ].join('\n');
  358. }).join('\n\n');
  359. }
  360. function normalizeCandidateItems(parsed) {
  361. const items = Array.isArray(parsed) ? parsed : parsed?.items;
  362. if (!Array.isArray(items)) return [];
  363. return items.map((item) => ({
  364. title: String(item?.title || '').trim(),
  365. summary: String(item?.summary || item?.resume || '').trim(),
  366. })).filter((item) => item.title && item.summary);
  367. }
  368. function validateCandidateItems(value) {
  369. if (!Array.isArray(value?.items)) {
  370. throw new Error('AI 返回结果缺少 items 数组');
  371. }
  372. }
  373. function mergeCandidateItems(firstItems, supplementItems) {
  374. const merged = [];
  375. const seen = new Set();
  376. for (const item of [...firstItems, ...supplementItems]) {
  377. const key = item.title.replace(/\s+/g, '').toLowerCase();
  378. if (!key || seen.has(key)) continue;
  379. seen.add(key);
  380. merged.push({
  381. id: `K${String(merged.length + 1).padStart(6, '0')}`,
  382. title: item.title,
  383. summary: item.summary,
  384. });
  385. }
  386. return merged;
  387. }
  388. function buildDocumentBlocksUserMessage(blockText) {
  389. return {
  390. role: 'user',
  391. content: [
  392. '以下是同一份文档的完整 block 列表。',
  393. '<document_blocks>',
  394. blockText,
  395. '</document_blocks>',
  396. ].join('\n'),
  397. };
  398. }
  399. function buildInitialItemMessages(documentName, blockText) {
  400. return [
  401. buildDocumentBlocksUserMessage(blockText),
  402. {
  403. role: 'user',
  404. content: [
  405. `文档名:${documentName}`,
  406. '你是投标资料知识库分析助手。你只负责从历史投标资料中提取对后续编写标书有复用价值的知识条目。',
  407. '任务:请从全文中提取有意义的知识条目数组。条目应覆盖技术方案、项目管理、质量、安全、进度、服务、应急、人员设备、类似业绩等可复用内容。',
  408. '只返回 JSON:{"items":[{"title":"","summary":""}]}',
  409. '要求:title 简洁明确;summary 说明该条目可如何用于编写投标文件;不要输出 id、content、段落编号、Markdown 或解释文字。',
  410. ].join('\n'),
  411. },
  412. ];
  413. }
  414. function buildSupplementItemMessages(documentName, blockText, firstItems) {
  415. return [
  416. buildDocumentBlocksUserMessage(blockText),
  417. {
  418. role: 'user',
  419. content: [
  420. `文档名:${documentName}`,
  421. '你是投标资料知识库补漏助手。你只判断已有知识条目是否遗漏了重要主题,并补充缺失条目。',
  422. '任务:请检查第一轮条目是否遗漏了有复用价值的重要内容。如果有遗漏,只输出新增条目;如果没有遗漏,返回空 items 数组。',
  423. '只返回 JSON:{"items":[{"title":"","summary":""}]}',
  424. '如果没有新增条目,必须返回 {"items":[]},这属于正常结果。',
  425. '不要重复已有条目,不要输出 id、content、段落编号、Markdown 或解释文字。',
  426. '',
  427. '<first_round_items>',
  428. JSON.stringify(firstItems.map(({ title, summary }) => ({ title, summary })), null, 2),
  429. '</first_round_items>',
  430. ].join('\n'),
  431. },
  432. ];
  433. }
  434. function buildMatchMessages(documentName, blockText, batchItems) {
  435. const taskPrompt = [
  436. `文档名:${documentName}`,
  437. '你是投标知识库段落匹配助手。你只根据知识条目的标题和摘要,为其匹配强相关 block 范围。',
  438. '你将收到同一份文档的完整 block 列表,以及本次需要匹配的一小批知识条目。',
  439. '规则:',
  440. '1. 只处理本次给出的知识条目。',
  441. '2. 只匹配与条目强相关、可直接支撑该条目的 block。',
  442. '3. 如果某些 block 更可能属于其他未提供的条目,不要强行匹配。',
  443. '4. 只返回 id 和 ranges,不要输出正文,不要解释。',
  444. '5. ranges 使用闭区间:["P000001","P000003"] 表示连续 block;单个 block 写成 ["P000001","P000001"]。',
  445. '6. 只允许使用输入中存在的 block 编号和本批条目 id。',
  446. '输出 JSON:{"matches":[{"id":"K000001","ranges":[["P000001","P000003"]]}]}',
  447. '',
  448. '以下是本次需要匹配的知识条目。只处理这些条目:',
  449. JSON.stringify(batchItems.map(({ id, title, summary }) => ({ id, title, summary })), null, 2),
  450. ].join('\n');
  451. return [
  452. buildDocumentBlocksUserMessage(blockText),
  453. {
  454. role: 'user',
  455. content: taskPrompt,
  456. },
  457. ];
  458. }
  459. function buildRecoveryMessages(documentName, items, missingBlocks) {
  460. return [
  461. {
  462. role: 'user',
  463. content: [
  464. '以下是当前尚未处理的遗漏 block。',
  465. '<missing_blocks>',
  466. renderBlocksForPrompt(missingBlocks),
  467. '</missing_blocks>',
  468. ].join('\n'),
  469. },
  470. {
  471. role: 'user',
  472. content: [
  473. `文档名:${documentName}`,
  474. '你是投标知识库遗漏段落补漏助手。必须把所有收到的遗漏 block 明确归入已有条目、新增条目或舍弃段落。',
  475. '任务:必须覆盖所有遗漏 block。每个遗漏 block 只能进入以下三类之一:',
  476. '1. matches:归入已有知识条目,只返回已有 id 和 ranges。',
  477. '2. new_items:如果没有合适的已有条目但内容有复用价值,则新增知识条目,并给出 title、summary、ranges。',
  478. '3. discarded:如果内容质量低、重复、格式残留或无投标复用价值,则推荐舍弃,并给出 reason。',
  479. '输出 JSON:{"matches":[{"id":"K000001","ranges":[["P000001","P000003"]]}],"new_items":[{"title":"","summary":"","ranges":[["P000004","P000005"]]}],"discarded":[{"ranges":[["P000006","P000006"]],"reason":""}]}',
  480. '不要输出正文、Markdown 或解释文字。',
  481. '',
  482. '<knowledge_items>',
  483. JSON.stringify(items.map(({ id, title, summary }) => ({ id, title, summary })), null, 2),
  484. '</knowledge_items>',
  485. ].join('\n'),
  486. },
  487. ];
  488. }
  489. function getBlockOrder(blocks) {
  490. return new Map(blocks.map((block, index) => [block.id, index]));
  491. }
  492. function normalizeRangePair(range) {
  493. if (Array.isArray(range)) {
  494. const start = String(range[0] || '').trim();
  495. const end = String(range[1] || range[0] || '').trim();
  496. return start ? [start, end] : null;
  497. }
  498. const id = String(range || '').trim();
  499. return id ? [id, id] : null;
  500. }
  501. function normalizeRanges(ranges, blockOrder) {
  502. if (!Array.isArray(ranges)) return [];
  503. const normalized = [];
  504. for (const range of ranges) {
  505. const pair = normalizeRangePair(range);
  506. if (!pair) continue;
  507. let [start, end] = pair;
  508. if (!blockOrder.has(start) || !blockOrder.has(end)) continue;
  509. if (blockOrder.get(start) > blockOrder.get(end)) {
  510. [start, end] = [end, start];
  511. }
  512. normalized.push([start, end]);
  513. }
  514. return normalized;
  515. }
  516. function expandRanges(ranges, blocks, blockOrder) {
  517. const ids = [];
  518. for (const [start, end] of ranges) {
  519. const startIndex = blockOrder.get(start);
  520. const endIndex = blockOrder.get(end);
  521. if (startIndex === undefined || endIndex === undefined) continue;
  522. for (let index = startIndex; index <= endIndex; index += 1) {
  523. ids.push(blocks[index].id);
  524. }
  525. }
  526. return [...new Set(ids)];
  527. }
  528. function normalizeMatchResult(parsed, itemIds, blocks, blockOrder) {
  529. const matches = Array.isArray(parsed?.matches) ? parsed.matches : [];
  530. return {
  531. matches: matches.map((match) => {
  532. const id = String(match?.id || '').trim();
  533. const ranges = normalizeRanges(match?.ranges || match?.paragraph_ranges || match?.block_ranges || [], blockOrder);
  534. return itemIds.has(id) && ranges.length ? { id, ranges, block_ids: expandRanges(ranges, blocks, blockOrder) } : null;
  535. }).filter(Boolean),
  536. };
  537. }
  538. function validateMatchResult(value) {
  539. if (!Array.isArray(value?.matches)) {
  540. throw new Error('AI 返回结果缺少 matches 数组');
  541. }
  542. }
  543. function normalizeRecoveryResult(parsed, itemIds, blocks, blockOrder) {
  544. const matches = Array.isArray(parsed?.matches) ? parsed.matches : [];
  545. const newItems = Array.isArray(parsed?.new_items) ? parsed.new_items : [];
  546. const discarded = Array.isArray(parsed?.discarded) ? parsed.discarded : [];
  547. return {
  548. matches: matches.map((match) => {
  549. const id = String(match?.id || '').trim();
  550. const ranges = normalizeRanges(match?.ranges || [], blockOrder);
  551. return itemIds.has(id) && ranges.length ? { id, ranges, block_ids: expandRanges(ranges, blocks, blockOrder) } : null;
  552. }).filter(Boolean),
  553. new_items: newItems.map((item) => {
  554. const title = String(item?.title || '').trim();
  555. const summary = String(item?.summary || item?.resume || '').trim();
  556. const ranges = normalizeRanges(item?.ranges || [], blockOrder);
  557. return title && summary && ranges.length ? { title, summary, ranges, block_ids: expandRanges(ranges, blocks, blockOrder) } : null;
  558. }).filter(Boolean),
  559. discarded: discarded.map((item) => {
  560. const ranges = normalizeRanges(item?.ranges || [], blockOrder);
  561. return ranges.length ? {
  562. ranges,
  563. block_ids: expandRanges(ranges, blocks, blockOrder),
  564. reason: String(item?.reason || 'AI 建议舍弃').trim() || 'AI 建议舍弃',
  565. } : null;
  566. }).filter(Boolean),
  567. };
  568. }
  569. function validateRecoveryResult(value) {
  570. if (!Array.isArray(value?.matches) || !Array.isArray(value?.new_items) || !Array.isArray(value?.discarded)) {
  571. throw new Error('AI 返回结果缺少 matches/new_items/discarded 数组');
  572. }
  573. }
  574. function collectHandledBlockIds(matches, discarded, systemDiscarded) {
  575. const handled = new Set();
  576. matches.forEach((match) => match.block_ids.forEach((id) => handled.add(id)));
  577. discarded.forEach((item) => item.block_ids.forEach((id) => handled.add(id)));
  578. systemDiscarded.forEach((item) => item.block_ids.forEach((id) => handled.add(id)));
  579. return handled;
  580. }
  581. function getMissingBlocks(blocks, matches, discarded, systemDiscarded) {
  582. const handled = collectHandledBlockIds(matches, discarded, systemDiscarded);
  583. return blocks.filter((block) => !handled.has(block.id));
  584. }
  585. function nextKnowledgeItemId(items) {
  586. let max = 0;
  587. items.forEach((item) => {
  588. const match = /^K(\d+)$/.exec(item.id || '');
  589. if (match) max = Math.max(max, Number(match[1]));
  590. });
  591. return `K${String(max + 1).padStart(6, '0')}`;
  592. }
  593. function createFinalItems(items, matches, blocks, fileName) {
  594. const blockMap = new Map(blocks.map((block) => [block.id, block]));
  595. const blocksByItem = new Map();
  596. matches.forEach((match) => {
  597. const current = blocksByItem.get(match.id) || [];
  598. blocksByItem.set(match.id, [...new Set([...current, ...match.block_ids])]);
  599. });
  600. return items.map((item) => {
  601. const sourceBlockIds = blocksByItem.get(item.id) || [];
  602. const content = sourceBlockIds.map((id) => blockMap.get(id)?.content || '').filter(Boolean).join('\n\n').trim();
  603. return {
  604. id: item.id,
  605. title: item.title,
  606. resume: item.summary,
  607. content,
  608. source_block_ids: sourceBlockIds,
  609. source_file: fileName,
  610. };
  611. }).filter((item) => item.content);
  612. }
  613. function sumContentChars(items) {
  614. return items.reduce((sum, item) => sum + getContentCharCount(item.content), 0);
  615. }
  616. function countCoveredUniqueBlockChars(items, blocks) {
  617. const blockMap = new Map(blocks.map((block) => [block.id, block]));
  618. const covered = new Set();
  619. items.forEach((item) => {
  620. if (!Array.isArray(item?.source_block_ids)) return;
  621. item.source_block_ids.forEach((id) => covered.add(String(id)));
  622. });
  623. return Array.from(covered).reduce((sum, id) => sum + getContentCharCount(blockMap.get(id)?.content || ''), 0);
  624. }
  625. function createReport({ blocks, filteredBlocks, candidateItems, finalItems, matches, discarded, systemDiscarded, recoveryAttempts, batchSize }) {
  626. const matched = new Set();
  627. matches.forEach((match) => match.block_ids.forEach((id) => matched.add(id)));
  628. const discardedSet = new Set();
  629. discarded.forEach((item) => item.block_ids.forEach((id) => discardedSet.add(id)));
  630. const systemSet = new Set();
  631. systemDiscarded.forEach((item) => item.block_ids.forEach((id) => systemSet.add(id)));
  632. const handled = new Set([...matched, ...discardedSet, ...systemSet]);
  633. const total = blocks.length || 1;
  634. return {
  635. total_blocks: blocks.length,
  636. filtered_blocks_count: filteredBlocks.length,
  637. candidate_items_count: candidateItems.length,
  638. final_items_count: finalItems.length,
  639. matched_blocks_count: matched.size,
  640. discarded_blocks_count: discardedSet.size,
  641. system_discarded_after_retry_count: systemSet.size,
  642. new_items_from_recovery_count: recoveryAttempts.reduce((sum, attempt) => sum + attempt.new_items.length, 0),
  643. recovery_attempt_count: recoveryAttempts.length,
  644. batch_size: batchSize,
  645. coverage_rate: Number((handled.size / total).toFixed(4)),
  646. matched_rate: Number((matched.size / total).toFixed(4)),
  647. created_at: now(),
  648. };
  649. }
  650. function createKnowledgeBaseService({ app, aiService, configStore }) {
  651. const baseDir = getKnowledgeBaseDir(app);
  652. const indexPath = getIndexPath(baseDir);
  653. const activePreparations = new Set();
  654. const activeMatches = new Set();
  /**
   * Reports whether the loaded configuration enables developer mode.
   * Any failure while loading the configuration counts as "disabled".
   * @returns {boolean}
   */
  function isDeveloperMode() {
    let enabled = false;
    try {
      enabled = Boolean(configStore?.load()?.developer_mode);
    } catch {
      enabled = false;
    }
    return enabled;
  }
  /**
   * Appends one JSON line to the document's debug log and echoes it to the
   * console. Does nothing unless developer mode is on. Logging failures are
   * reported with a console warning and never propagate.
   * @param {string} documentId - Falls back to 'unknown' when empty.
   * @param {string} event - Event name.
   * @param {object} [payload] - Extra fields merged into the log entry.
   */
  function debugLog(documentId, event, payload = {}) {
    if (isDeveloperMode()) {
      try {
        const logPath = getDebugLogPath(app, documentId || 'unknown');
        ensureDir(path.dirname(logPath));
        const entry = { time: now(), event, ...payload };
        const line = `${JSON.stringify(entry)}\n`;
        fs.appendFileSync(logPath, line, 'utf-8');
        console.info(`[knowledge-base] ${event}`, entry);
      } catch (error) {
        console.warn('[knowledge-base] 写入调试日志失败', error);
      }
    }
  }
  /**
   * Loads the knowledge-base index from disk, creating the base directory if
   * needed and falling back to an empty index when no file exists.
   * @returns {{folders: Array, documents: Array}}
   */
  function loadIndex() {
    ensureDir(baseDir);
    const stored = readJson(indexPath, createEmptyIndex());
    return normalizeIndex(stored);
  }
  /**
   * Normalizes the index, writes it to disk and returns the normalized value.
   *
   * The index is normalized once and that same object is both written and
   * returned. Normalizing twice could produce different results, because a
   * document lacking both `document_dir` and `id` receives a freshly
   * generated directory name on each normalization.
   *
   * @param {object} index
   * @returns {{folders: Array, documents: Array}} The index as written.
   */
  function saveIndex(index) {
    const normalized = normalizeIndex(index);
    writeJson(indexPath, normalized);
    return normalized;
  }
  /**
   * Sends a document update to the renderer on the 'knowledge-base:event' channel.
   *
   * Nothing is sent when there is no target or the target has been
   * destroyed. The explicit null check matters: `!webContents?.isDestroyed()`
   * evaluates to true for a missing `webContents`, which would then fail on
   * `webContents.send`.
   *
   * @param {{isDestroyed: function(): boolean, send: function(string, object): void}} [webContents]
   * @param {object} document - Document record to publish.
   */
  function emitProgress(webContents, document) {
    if (!webContents || webContents.isDestroyed()) {
      return;
    }
    webContents.send('knowledge-base:event', { document });
  }
  /**
   * Merges `partial` into the stored document, stamps `updated_at`, persists
   * the index, notifies the renderer and writes a debug log entry.
   * @param {string} documentId
   * @param {object} partial - Fields to overwrite on the document.
   * @param {object} [webContents] - Renderer target for the progress event.
   * @returns {object|undefined} The updated document, or undefined when no
   *   document has that id.
   */
  function updateDocument(documentId, partial, webContents) {
    const index = loadIndex();
    const applyPatch = (entry) => {
      if (entry.id !== documentId) {
        return entry;
      }
      return normalizeDocument({ ...entry, ...partial, updated_at: now() });
    };
    const saved = saveIndex({ ...index, documents: index.documents.map((entry) => applyPatch(entry)) });

    const updated = saved.documents.find((entry) => entry.id === documentId);
    if (updated) {
      emitProgress(webContents, updated);
    }

    const {
      status,
      progress,
      message,
      error,
      candidate_item_count,
      item_count,
      block_count,
      filtered_block_count,
    } = partial;
    debugLog(documentId, 'document:update', {
      status,
      progress,
      message,
      error,
      candidate_item_count,
      item_count,
      block_count,
      filtered_block_count,
    });
    return updated;
  }
  /**
   * Looks up a document record in the freshly loaded index.
   *
   * @param {string} documentId - Id of the document to find.
   * @returns {object} The matching document record.
   * @throws {Error} When no document with that id exists.
   */
  function getDocument(documentId) {
    const { documents } = loadIndex();
    const found = documents.find((entry) => entry.id === documentId);
    if (found) {
      return found;
    }
    throw new Error('知识库文档不存在');
  }
  /**
   * Sends one candidate-item extraction request to the AI service.
   *
   * @param {Array} messages - Chat messages for the request.
   * @param {string} failureMessage - Message passed through as `failureMessage`.
   * @param {string} progressLabel - Label passed through as `progressLabel`.
   * @returns {Promise<Array>} The `items` array of the response, or [] when it is not an array.
   */
  async function requestCandidateItems(messages, failureMessage, progressLabel) {
    const response = await aiService.collectJsonResponse({
      messages,
      temperature: 0.2,
      response_format: { type: 'json_object' },
      normalizer: (value) => ({ items: normalizeCandidateItems(value) }),
      validator: validateCandidateItems,
      failureMessage,
      progressLabel,
    });
    return Array.isArray(response?.items) ? response.items : [];
  }

  /**
   * Prepares an uploaded document: copies the source file, converts it to
   * Markdown, splits it into blocks, asks the AI for candidate knowledge items
   * in two passes (initial + supplement) and stores the merged candidates.
   * When developer mode is off, matching is started automatically with a
   * batch size of 20.
   *
   * A second call for a document that is already being prepared returns
   * immediately. Failures are converted into an 'error' document status; this
   * function is written so that its promise does not reject.
   *
   * @param {string} documentId - Id of the document record to prepare.
   * @param {string} sourceFilePath - Path of the file chosen by the user.
   * @param {object} [webContents] - Target for progress events.
   * @returns {Promise<void>}
   */
  async function prepareDocument(documentId, sourceFilePath, webContents) {
    if (activePreparations.has(documentId)) {
      debugLog(documentId, 'prepare:skip-active');
      return;
    }
    activePreparations.add(documentId);
    debugLog(documentId, 'prepare:start', { source_file_path: sourceFilePath });
    try {
      const document = getDocument(documentId);
      const config = configStore ? configStore.load() : { file_parser: { provider: 'local' } };
      const documentDir = fromRelative(baseDir, document.document_dir);
      const sourcePath = fromRelative(baseDir, document.source_path);
      const markdownPath = fromRelative(baseDir, document.markdown_path);
      const blocksPath = fromRelative(baseDir, document.blocks_path);
      const filteredBlocksPath = fromRelative(baseDir, document.filtered_blocks_path);
      const candidateItemsPath = fromRelative(baseDir, document.candidate_items_path);

      updateDocument(documentId, { status: 'copying', progress: 5, message: '正在复制原始文件' }, webContents);
      ensureDir(documentDir);
      await fsp.copyFile(sourceFilePath, sourcePath);
      debugLog(documentId, 'prepare:copied-source', { source_path: sourcePath });

      updateDocument(documentId, { status: 'converting', progress: 15, message: '正在转换为 Markdown' }, webContents);
      const parsedText = await parseDocumentWithConfig(app, sourcePath, config, {
        assetScope: `knowledge-${documentId}`,
        preserveImages: false,
      });
      const markdown = stripMarkdownFence(parsedText.trim());
      if (!markdown) throw new Error('文档未解析出有效 Markdown 内容');
      await fsp.writeFile(markdownPath, `${markdown}\n`, 'utf-8');
      debugLog(documentId, 'prepare:converted-markdown', { markdown_path: markdownPath, markdown_chars: markdown.length });

      const rawBlocks = createRawBlocks(markdown);
      const semanticBlocks = mergeSemanticBlocks(rawBlocks);
      const { blocks, filtered_blocks: filteredBlocks } = filterBlocks(semanticBlocks);
      if (!blocks.length) throw new Error('筛选后没有可分析的正文内容');
      writeJson(blocksPath, blocks);
      writeJson(filteredBlocksPath, filteredBlocks);

      // Rendered once and reused for both the debug entry and the AI prompts.
      const blockText = renderBlocksForPrompt(blocks);
      debugLog(documentId, 'prepare:blocks-ready', {
        raw_block_count: rawBlocks.length,
        semantic_block_count: semanticBlocks.length,
        block_count: blocks.length,
        filtered_block_count: filteredBlocks.length,
        block_text_chars: blockText.length,
        filtered_reasons: filteredBlocks.reduce((acc, block) => {
          acc[block.reason] = (acc[block.reason] || 0) + 1;
          return acc;
        }, {}),
      });

      updateDocument(documentId, {
        status: 'extracting',
        progress: 35,
        message: 'AI 正在首次提取知识条目',
        block_count: blocks.length,
        filtered_block_count: filteredBlocks.length,
      }, webContents);
      const firstMessages = buildInitialItemMessages(document.file_name, blockText);
      debugLog(documentId, 'ai:first-items:start', {
        prompt: getPromptSummary(firstMessages),
      });
      const firstItems = await requestCandidateItems(
        firstMessages,
        '知识库条目提取失败,AI 未返回有效 JSON',
        '知识库条目提取',
      );
      debugLog(documentId, 'ai:first-items:done', {
        item_count: firstItems.length,
        sample: getItemSample(firstItems),
      });

      updateDocument(documentId, { status: 'extracting', progress: 55, message: 'AI 正在补充遗漏知识条目' }, webContents);
      const supplementMessages = buildSupplementItemMessages(document.file_name, blockText, firstItems);
      debugLog(documentId, 'ai:supplement-items:start', {
        first_item_count: firstItems.length,
        prompt: getPromptSummary(supplementMessages),
      });
      const supplementItems = await requestCandidateItems(
        supplementMessages,
        '知识库条目补充失败,AI 未返回有效 JSON',
        '知识库条目补充',
      );
      debugLog(documentId, 'ai:supplement-items:done', {
        item_count: supplementItems.length,
        sample: getItemSample(supplementItems),
      });

      const candidateItems = mergeCandidateItems(firstItems, supplementItems);
      if (!candidateItems.length) throw new Error('AI 未提取出可用知识条目');
      writeJson(candidateItemsPath, candidateItems);
      debugLog(documentId, 'prepare:candidates-saved', {
        candidate_item_count: candidateItems.length,
        candidate_items_path: candidateItemsPath,
        sample: getItemSample(candidateItems),
      });
      updateDocument(documentId, {
        status: 'ready_for_matching',
        progress: 65,
        message: `已提取 ${candidateItems.length} 条候选知识,请设置批次开始匹配`,
        candidate_item_count: candidateItems.length,
        item_count: 0,
      }, webContents);

      if (!isDeveloperMode()) {
        // debugLog returns early unless developer mode is on, so this entry is
        // only written if the setting flips between the two checks.
        debugLog(documentId, 'prepare:auto-match', { batch_size: 20 });
        await matchDocument(documentId, 20, webContents);
      }
    } catch (error) {
      // `error` may be any thrown value, including null/undefined.
      const reason = error?.message || '处理失败';
      debugLog(documentId, 'prepare:error', {
        message: error?.message || String(error),
        stack: error?.stack,
      });
      try {
        updateDocument(documentId, { status: 'error', progress: 100, message: reason, error: reason }, webContents);
      } catch (updateError) {
        // Callers start this task without awaiting it, so a failure while
        // recording the error state must not turn into an unhandled rejection.
        console.warn('[knowledge-base] 更新文档错误状态失败', updateError);
      }
    } finally {
      activePreparations.delete(documentId);
      debugLog(documentId, 'prepare:finish');
    }
  }
  /**
   * Matches candidate knowledge items to document blocks.
   *
   * Steps: load blocks and candidate items, ask the AI to match items to
   * blocks batch by batch, run up to `recoveryMaxAttempts` recovery rounds for
   * blocks that are still unaccounted for, mark whatever remains as
   * system-discarded, then build and save the final items, the match result
   * and the report.
   *
   * A second call for a document that is already being matched returns
   * immediately. Failures are converted into an 'error' document status; this
   * function is written so that its promise does not reject.
   *
   * @param {string} documentId - Id of the document to match.
   * @param {number|string} batchSize - Requested items per batch; clamped to an integer in [1, 100].
   * @param {object} [webContents] - Target for progress events.
   * @returns {Promise<void>}
   */
  async function matchDocument(documentId, batchSize, webContents) {
    if (activeMatches.has(documentId)) {
      debugLog(documentId, 'match:skip-active');
      return;
    }
    activeMatches.add(documentId);
    debugLog(documentId, 'match:start', { requested_batch_size: batchSize });
    try {
      const document = getDocument(documentId);
      // Non-numeric or zero input falls back to 1 before clamping.
      const normalizedBatchSize = Math.max(1, Math.min(100, Math.floor(Number(batchSize) || 1)));
      const blocks = readJson(fromRelative(baseDir, document.blocks_path), []);
      const filteredBlocks = readJson(fromRelative(baseDir, document.filtered_blocks_path), []);
      const initialItems = readJson(fromRelative(baseDir, document.candidate_items_path), []);
      if (!blocks.length) throw new Error('缺少正文 block,请重新上传文档');
      if (!initialItems.length) throw new Error('缺少候选知识条目,请等待条目提取完成');
      debugLog(documentId, 'match:inputs-ready', {
        block_count: blocks.length,
        filtered_block_count: filteredBlocks.length,
        initial_item_count: initialItems.length,
        normalized_batch_size: normalizedBatchSize,
      });

      const blockText = renderBlocksForPrompt(blocks);
      const blockOrder = getBlockOrder(blocks);
      const itemIds = new Set(initialItems.map((item) => item.id));
      const batches = [];
      for (let index = 0; index < initialItems.length; index += normalizedBatchSize) {
        batches.push(initialItems.slice(index, index + normalizedBatchSize));
      }
      const matches = [];
      const matchBatches = [];
      updateDocument(documentId, { status: 'matching', progress: 66, message: `开始匹配段落,共 ${batches.length} 批`, last_batch_size: normalizedBatchSize }, webContents);

      for (let index = 0; index < batches.length; index += 1) {
        const batch = batches[index];
        const batchNumber = index + 1;
        // Batch progress is spread over 66..88.
        const progress = Math.min(88, 66 + Math.round((batchNumber / batches.length) * 22));
        updateDocument(documentId, { status: 'matching', progress, message: `AI 正在匹配段落 ${batchNumber}/${batches.length}` }, webContents);
        const matchMessages = buildMatchMessages(document.file_name, blockText, batch);
        const batchItemIds = batch.map((item) => item.id);
        debugLog(documentId, 'ai:match-batch:start', {
          batch_index: batchNumber,
          batch_count: batches.length,
          item_ids: batchItemIds,
          prompt: getPromptSummary(matchMessages),
        });
        const parsed = await aiService.collectJsonResponse({
          messages: matchMessages,
          temperature: 0.1,
          response_format: { type: 'json_object' },
          normalizer: (value) => normalizeMatchResult(value, itemIds, blocks, blockOrder),
          validator: validateMatchResult,
          failureMessage: '知识库段落匹配失败,AI 未返回有效 JSON',
          progressLabel: '知识库段落匹配',
        });
        debugLog(documentId, 'ai:match-batch:done', {
          batch_index: batchNumber,
          match_count: parsed.matches.length,
          matches: getMatchSummary(parsed.matches),
        });
        matchBatches.push({ batch_index: batchNumber, item_ids: batchItemIds, matches: parsed.matches });
        matches.push(...parsed.matches);
      }

      const items = [...initialItems];
      const discarded = [];
      const systemDiscarded = [];
      const recoveryAttempts = [];
      for (let attempt = 0; attempt < recoveryMaxAttempts; attempt += 1) {
        const attemptNumber = attempt + 1;
        const missingBlocks = getMissingBlocks(blocks, matches, discarded, systemDiscarded);
        debugLog(documentId, 'recovery:missing-check', {
          attempt: attemptNumber,
          missing_block_count: missingBlocks.length,
        });
        if (!missingBlocks.length) break;
        updateDocument(documentId, {
          status: 'recovering',
          progress: Math.min(96, 90 + attempt * 3),
          message: `AI 正在补漏遗漏段落 ${attemptNumber}/${recoveryMaxAttempts},剩余 ${missingBlocks.length} 个 block`,
        }, webContents);
        // Snapshot of the ids known before this round; items found in this
        // round are appended to `items` only after the response is parsed.
        const currentItemIds = new Set(items.map((item) => item.id));
        const recoveryMessages = buildRecoveryMessages(document.file_name, items, missingBlocks);
        debugLog(documentId, 'ai:recovery:start', {
          attempt: attemptNumber,
          missing_block_count: missingBlocks.length,
          item_count: items.length,
          prompt: getPromptSummary(recoveryMessages),
        });
        const parsed = await aiService.collectJsonResponse({
          messages: recoveryMessages,
          temperature: 0.1,
          response_format: { type: 'json_object' },
          normalizer: (value) => normalizeRecoveryResult(value, currentItemIds, blocks, blockOrder),
          validator: validateRecoveryResult,
          failureMessage: '知识库遗漏段落补漏失败,AI 未返回有效 JSON',
          progressLabel: '知识库遗漏补漏',
        });
        debugLog(documentId, 'ai:recovery:done', {
          attempt: attemptNumber,
          match_count: parsed.matches.length,
          new_item_count: parsed.new_items.length,
          discarded_group_count: parsed.discarded.length,
          matches: getMatchSummary(parsed.matches),
        });
        const newItemsWithIds = parsed.new_items.map((item) => {
          // The id is derived from the growing `items` list, so each new item
          // must be pushed before the next id is requested.
          const id = nextKnowledgeItemId(items);
          const next = { id, title: item.title, summary: item.summary };
          items.push(next);
          matches.push({ id, ranges: item.ranges, block_ids: item.block_ids });
          return { ...next, ranges: item.ranges, block_ids: item.block_ids };
        });
        matches.push(...parsed.matches);
        discarded.push(...parsed.discarded.map((item) => ({ ...item, source: `recovery_${attemptNumber}` })));
        recoveryAttempts.push({
          attempt: attemptNumber,
          missing_before_count: missingBlocks.length,
          matches: parsed.matches,
          new_items: newItemsWithIds,
          discarded: parsed.discarded,
        });
      }

      const remaining = getMissingBlocks(blocks, matches, discarded, systemDiscarded);
      debugLog(documentId, 'match:remaining-after-recovery', { remaining_block_count: remaining.length });
      if (remaining.length) {
        systemDiscarded.push({
          block_ids: remaining.map((block) => block.id),
          reason: 'system_discarded_after_retry',
        });
      }

      updateDocument(documentId, { status: 'saving', progress: 98, message: '正在回填正文并保存知识条目' }, webContents);
      const finalItems = createFinalItems(items, matches, blocks, document.file_name);
      const report = createReport({
        blocks,
        filteredBlocks,
        candidateItems: items,
        finalItems,
        matches,
        discarded,
        systemDiscarded,
        recoveryAttempts,
        batchSize: normalizedBatchSize,
      });
      const matchResult = {
        candidate_items: items,
        match_batches: matchBatches,
        recovery_attempts: recoveryAttempts,
        final_matches: matches,
        discarded,
        system_discarded_after_retry: systemDiscarded,
        report,
      };
      const candidateItemsPath = fromRelative(baseDir, document.candidate_items_path);
      const matchResultPath = fromRelative(baseDir, document.match_result_path);
      const reportPath = fromRelative(baseDir, document.report_path);
      const itemsPath = fromRelative(baseDir, document.items_path);
      writeJson(candidateItemsPath, items);
      writeJson(matchResultPath, matchResult);
      writeJson(reportPath, report);
      writeJson(itemsPath, finalItems);
      debugLog(documentId, 'match:saved', {
        final_item_count: finalItems.length,
        report,
        items_path: itemsPath,
        report_path: reportPath,
        match_result_path: matchResultPath,
      });
      updateDocument(documentId, {
        status: 'success',
        progress: 100,
        message: `整理完成,共 ${finalItems.length} 条,覆盖率 ${Math.round(report.coverage_rate * 100)}%`,
        item_count: finalItems.length,
        candidate_item_count: items.length,
        discarded_block_count: report.discarded_blocks_count,
        system_discarded_after_retry_count: report.system_discarded_after_retry_count,
      }, webContents);
    } catch (error) {
      // `error` may be any thrown value, including null/undefined.
      const reason = error?.message || '匹配失败';
      debugLog(documentId, 'match:error', {
        message: error?.message || String(error),
        stack: error?.stack,
      });
      try {
        updateDocument(documentId, { status: 'error', progress: 100, message: reason, error: reason }, webContents);
      } catch (updateError) {
        // Callers start this task without awaiting it, so a failure while
        // recording the error state must not turn into an unhandled rejection.
        console.warn('[knowledge-base] 更新文档错误状态失败', updateError);
      }
    } finally {
      activeMatches.delete(documentId);
      debugLog(documentId, 'match:finish');
    }
  }
  1014. return {
  1015. list() {
  1016. return loadIndex();
  1017. },
  1018. createFolder(name) {
  1019. const index = loadIndex();
  1020. const folder = { id: createId('folder'), name: safeName(name), created_at: now(), updated_at: now() };
  1021. saveIndex({ ...index, folders: [...index.folders, folder] });
  1022. return folder;
  1023. },
  1024. renameFolder(folderId, name) {
  1025. const nextName = safeName(name);
  1026. const index = loadIndex();
  1027. const folder = index.folders.find((item) => item.id === folderId);
  1028. if (!folder) throw new Error('知识库文件夹不存在');
  1029. const folders = index.folders.map((item) => (
  1030. item.id === folderId ? { ...item, name: nextName, updated_at: now() } : item
  1031. ));
  1032. saveIndex({ ...index, folders });
  1033. return folders.find((item) => item.id === folderId);
  1034. },
  1035. deleteFolder(folderId) {
  1036. const index = loadIndex();
  1037. const folder = index.folders.find((item) => item.id === folderId);
  1038. if (!folder) throw new Error('知识库文件夹不存在');
  1039. const documentsToDelete = index.documents.filter((document) => document.folder_id === folderId);
  1040. const runningDocument = documentsToDelete.find((document) => activePreparations.has(document.id) || activeMatches.has(document.id));
  1041. if (runningDocument) {
  1042. throw new Error(`文档“${runningDocument.file_name}”正在处理中,请完成后再删除文件夹`);
  1043. }
  1044. for (const document of documentsToDelete) {
  1045. deleteImportedImageBatches(app, `knowledge-${document.id}`);
  1046. fs.rmSync(fromRelative(baseDir, document.document_dir), { recursive: true, force: true });
  1047. fs.rmSync(getDebugLogPath(app, document.id), { force: true });
  1048. }
  1049. fs.rmSync(fromRelative(baseDir, path.join('folders', folderId)), { recursive: true, force: true });
  1050. saveIndex({
  1051. folders: index.folders.filter((item) => item.id !== folderId),
  1052. documents: index.documents.filter((document) => document.folder_id !== folderId),
  1053. });
  1054. return { success: true, message: `已删除文件夹“${folder.name}”及 ${documentsToDelete.length} 个文档` };
  1055. },
  1056. deleteDocument(documentId) {
  1057. const index = loadIndex();
  1058. const document = index.documents.find((item) => item.id === documentId);
  1059. if (!document) throw new Error('知识库文档不存在');
  1060. if (activePreparations.has(documentId) || activeMatches.has(documentId)) {
  1061. throw new Error('该文档正在处理中,请完成后再删除');
  1062. }
  1063. deleteImportedImageBatches(app, `knowledge-${documentId}`);
  1064. fs.rmSync(fromRelative(baseDir, document.document_dir), { recursive: true, force: true });
  1065. fs.rmSync(getDebugLogPath(app, documentId), { force: true });
  1066. saveIndex({ ...index, documents: index.documents.filter((item) => item.id !== documentId) });
  1067. return { success: true, message: `已删除文档“${document.file_name}”` };
  1068. },
  1069. async uploadDocuments(folderId, webContents) {
  1070. const currentIndex = loadIndex();
  1071. const folder = currentIndex.folders.find((item) => item.id === folderId);
  1072. if (!folder) throw new Error('请先选择知识库文件夹');
  1073. const result = await dialog.showOpenDialog({
  1074. title: '选择知识库文档',
  1075. properties: ['openFile', 'multiSelections'],
  1076. filters: [
  1077. { name: '知识库文档', extensions: ['doc', 'docx', 'wps', 'pdf', 'md', 'markdown'] },
  1078. { name: '所有文件', extensions: ['*'] },
  1079. ],
  1080. });
  1081. if (result.canceled || !result.filePaths.length) {
  1082. return { success: false, message: '已取消选择' };
  1083. }
  1084. const created = [];
  1085. let index = loadIndex();
  1086. for (const filePath of result.filePaths) {
  1087. const ext = path.extname(filePath).toLowerCase();
  1088. if (!supportedExtensions.has(ext)) continue;
  1089. const documentId = createId('doc');
  1090. const documentDir = path.join('folders', folderId, 'documents', documentId).replace(/\\/g, '/');
  1091. const sourceName = `source${ext}`;
  1092. const document = normalizeDocument({
  1093. id: documentId,
  1094. folder_id: folderId,
  1095. file_name: path.basename(filePath),
  1096. document_dir: documentDir,
  1097. source_path: path.join(documentDir, sourceName).replace(/\\/g, '/'),
  1098. markdown_path: path.join(documentDir, 'content.md').replace(/\\/g, '/'),
  1099. blocks_path: path.join(documentDir, 'blocks.json').replace(/\\/g, '/'),
  1100. filtered_blocks_path: path.join(documentDir, 'filtered_blocks.json').replace(/\\/g, '/'),
  1101. candidate_items_path: path.join(documentDir, 'candidate_items.json').replace(/\\/g, '/'),
  1102. match_result_path: path.join(documentDir, 'match_result.json').replace(/\\/g, '/'),
  1103. report_path: path.join(documentDir, 'report.json').replace(/\\/g, '/'),
  1104. items_path: path.join(documentDir, 'items.json').replace(/\\/g, '/'),
  1105. status: 'pending',
  1106. progress: 0,
  1107. message: '等待处理',
  1108. item_count: 0,
  1109. block_count: 0,
  1110. filtered_block_count: 0,
  1111. candidate_item_count: 0,
  1112. discarded_block_count: 0,
  1113. system_discarded_after_retry_count: 0,
  1114. created_at: now(),
  1115. updated_at: now(),
  1116. });
  1117. index = saveIndex({ ...index, documents: [...index.documents, document] });
  1118. created.push(document);
  1119. emitProgress(webContents, document);
  1120. prepareDocument(documentId, filePath, webContents);
  1121. }
  1122. return { success: Boolean(created.length), message: created.length ? `已加入 ${created.length} 个文档处理任务` : '未选择支持的文档类型', documents: created };
  1123. },
  1124. startMatching(documentId, batchSize, webContents) {
  1125. const document = getDocument(documentId);
  1126. debugLog(documentId, 'ipc:start-matching', { batch_size: batchSize, current_status: document.status });
  1127. if (activeMatches.has(documentId)) {
  1128. return { success: false, message: '该文档正在匹配中', document };
  1129. }
  1130. if (!['ready_for_matching', 'success', 'error'].includes(document.status)) {
  1131. return { success: false, message: '请等待候选知识条目提取完成', document };
  1132. }
  1133. matchDocument(documentId, batchSize, webContents);
  1134. return { success: true, message: '已开始分批匹配段落', document };
  1135. },
  1136. getOutlineReferences(documentIds) {
  1137. const ids = Array.isArray(documentIds) ? documentIds.map((id) => String(id || '').trim()).filter(Boolean) : [];
  1138. if (!ids.length) {
  1139. return { items: [] };
  1140. }
  1141. const index = loadIndex();
  1142. const seen = new Set();
  1143. const items = [];
  1144. ids.forEach((documentId) => {
  1145. const document = index.documents.find((item) => item.id === documentId);
  1146. if (!document || document.status !== 'success') {
  1147. return;
  1148. }
  1149. let documentItems = [];
  1150. try {
  1151. documentItems = readJson(fromRelative(baseDir, document.items_path), []);
  1152. } catch (error) {
  1153. console.warn('[knowledge-base] 读取知识库目录引用失败', { documentId, message: error.message || String(error) });
  1154. return;
  1155. }
  1156. (Array.isArray(documentItems) ? documentItems : []).forEach((item) => {
  1157. const itemId = String(item?.id || '').trim();
  1158. const title = String(item?.title || '').trim();
  1159. const resume = String(item?.resume || item?.summary || '').trim();
  1160. if (!itemId || !title || !resume) {
  1161. return;
  1162. }
  1163. const referenceId = `${document.id}::${itemId}`;
  1164. if (seen.has(referenceId)) {
  1165. return;
  1166. }
  1167. seen.add(referenceId);
  1168. items.push({ id: referenceId, title, resume });
  1169. });
  1170. });
  1171. return { items };
  1172. },
  1173. readMarkdown(documentId) {
  1174. const document = getDocument(documentId);
  1175. const filePath = fromRelative(baseDir, document.markdown_path);
  1176. return fs.existsSync(filePath) ? fs.readFileSync(filePath, 'utf-8') : '';
  1177. },
  1178. readItems(documentId) {
  1179. const document = getDocument(documentId);
  1180. return readJson(fromRelative(baseDir, document.items_path), []);
  1181. },
  1182. readAnalysis(documentId) {
  1183. const document = getDocument(documentId);
  1184. const markdownPath = fromRelative(baseDir, document.markdown_path);
  1185. const markdown = fs.existsSync(markdownPath) ? fs.readFileSync(markdownPath, 'utf-8') : '';
  1186. const blocks = readJson(fromRelative(baseDir, document.blocks_path), []);
  1187. const filteredBlocks = readJson(fromRelative(baseDir, document.filtered_blocks_path), []);
  1188. const candidateItems = readJson(fromRelative(baseDir, document.candidate_items_path), []);
  1189. const items = readJson(fromRelative(baseDir, document.items_path), []);
  1190. const report = readJson(fromRelative(baseDir, document.report_path), null);
  1191. const matchResult = readJson(fromRelative(baseDir, document.match_result_path), null);
  1192. const markdown_chars = getContentCharCount(markdown);
  1193. const kept_block_chars = sumContentChars(blocks);
  1194. const covered_unique_content_chars = countCoveredUniqueBlockChars(items, blocks);
  1195. return {
  1196. document,
  1197. block_count: blocks.length,
  1198. filtered_blocks_count: filteredBlocks.length,
  1199. markdown_chars,
  1200. kept_block_chars,
  1201. covered_unique_content_chars,
  1202. coverage_rate_vs_markdown: markdown_chars ? Number((covered_unique_content_chars / markdown_chars).toFixed(4)) : 0,
  1203. candidate_items: candidateItems,
  1204. report,
  1205. discarded: matchResult?.discarded || [],
  1206. system_discarded_after_retry: matchResult?.system_discarded_after_retry || [],
  1207. debug_log_path: isDeveloperMode() ? getDebugLogPath(app, documentId) : '',
  1208. };
  1209. },
  1210. };
  1211. }
  // Public entry point of the module.
  module.exports = { createKnowledgeBaseService };

  // Block-processing and normalization helpers exposed under `_internals`.
  // NOTE(review): presumably exported for unit tests — confirm against callers.
  module.exports._internals = {
    createRawBlocks,
    mergeSemanticBlocks,
    filterBlocks,
    renderBlocksForPrompt,
    normalizeCandidateItems,
    normalizeMatchResult,
    normalizeRecoveryResult,
  };