duplicateCheckService.cjs 82 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031
  1. const fs = require('node:fs/promises');
  2. const path = require('node:path');
  3. const crypto = require('node:crypto');
  4. const { pathToFileURL } = require('node:url');
  5. const AdmZip = require('adm-zip');
  6. const CFB = require('cfb');
  7. const cheerio = require('cheerio');
  8. const iconv = require('iconv-lite');
  9. const { PDFParse } = require('pdf-parse');
  10. const { getDuplicateCheckDir, getGeneratedImagesDir, getImportedImagesDir } = require('../utils/paths.cjs');
  11. const { parseDocumentWithConfig } = require('./fileService.cjs');
  /**
   * Chinese display labels for the well-known metadata keys.
   * Keys without an entry here are labelled by getMetadataLabel's prefix rules.
   */
  const metadataLabels = {
    // File-system attributes
    file_name: '文件名',
    extension: '扩展名',
    size: '文件大小',
    created_at: '文件创建时间',
    modified_at: '文件修改时间',
    accessed_at: '文件访问时间',

    // Core document properties
    title: '标题',
    subject: '主题',
    author: '作者',
    last_modified_by: '最后修改人',
    revision: '修订号',
    created: '创建时间',
    modified: '修改时间',
    last_printed: '最后打印时间',
    keywords: '关键词',
    category: '类别',
    description: '描述',
    content_status: '内容状态',
    content_type: '内容类型',
    identifier: '标识符',
    language: '语言',

    // Application / extended properties
    application: '应用程序',
    app_version: '应用程序版本',
    company: '公司',
    manager: '管理者',
    template: '模板',
    presentation_format: '演示格式',

    // Document statistics
    pages: '页数',
    words: '字数',
    characters: '字符数',
    characters_with_spaces: '含空格字符数',
    bytes: '字节数',
    lines: '行数',
    paragraphs: '段落数',
    slides: '幻灯片数',
    notes: '备注数',
    hidden_slides: '隐藏幻灯片数',
    multimedia_clips: '多媒体剪辑数',
    total_time: '编辑时长',

    // Flags and versions
    code_page: '代码页',
    document_version: '文档版本',
    doc_security: '文档安全状态',
    shared_doc: '共享文档',
    links_dirty: '链接已变更',
    hlinks_changed: '超链接已变更',

    // PDF-specific properties
    creator: '创建工具',
    producer: '生成工具',
    pdf_version: 'PDF 版本',
    pdf_permissions: 'PDF 权限',
    fingerprints: 'PDF 指纹',
  };
  /**
   * Metadata keys that isComparableKey accepts by exact name
   * (prefix-based keys are handled there separately).
   */
  const comparableKeys = new Set([
    // Core properties
    'title',
    'subject',
    'author',
    'last_modified_by',
    'revision',
    'created',
    'modified',
    'last_printed',
    'keywords',
    'category',
    'description',
    'content_status',
    'content_type',
    'identifier',
    'language',
    // Application properties
    'application',
    'app_version',
    'company',
    'manager',
    'template',
    'presentation_format',
    // Statistics
    'pages',
    'words',
    'characters',
    'characters_with_spaces',
    'bytes',
    'lines',
    'paragraphs',
    'slides',
    'notes',
    'hidden_slides',
    'multimedia_clips',
    'total_time',
    // PDF properties
    'creator',
    'producer',
    'pdf_version',
    'pdf_permissions',
    'fingerprints',
    // Flags and versions
    'document_version',
    'doc_security',
    'shared_doc',
    'links_dirty',
    'hlinks_changed',
  ]);
  /** Keys that isDateComparableKey treats as dates by exact name. */
  const dateComparableKeys = new Set([
    'created_at',
    'modified_at',
    'accessed_at',
    'created',
    'modified',
    'last_printed',
  ]);

  /** Markdown image syntax: ![alt](target "title"), with named groups alt/target/title. */
  const markdownImagePattern = /!\[(?<alt>[^\]]*)\]\((?<target><[^>]+>|[^)\s]+)(?<title>\s+"[^"]*")?\)/gi;

  /** HTML <img> tag with a quoted src attribute, captured as the named group src. */
  const htmlImageSrcPattern = /<img\b[^>]*?\bsrc=["'](?<src>[^"']+)["'][^>]*>/gi;

  /** Any HTML <img> tag. */
  const htmlImagePattern = /<img\b[^>]*>/gi;

  /** A complete HTML <table>...</table> element (non-greedy). */
  const htmlTablePattern = /<table\b[\s\S]*?<\/table>/gi;

  /** Prefix of the placeholder tokens used for content tables. */
  const contentTableTokenPrefix = 'YIBIAO_CONTENT_TABLE_';
  /**
   * Current time as an ISO-8601 string.
   * @returns {string} Result of Date#toISOString for the present moment.
   */
  function now() {
    const current = new Date();
    return current.toISOString();
  }
  /**
   * Identifier for a file record.
   * @param {object} [file] - Record that may carry id, file_path and file_name.
   * @returns {string} file.id when truthy, otherwise the SHA-1 hex digest of
   *   file_path (or file_name, or the empty string).
   */
  function stableFileId(file) {
    if (file?.id) return file.id;
    const seed = String(file?.file_path || file?.file_name || '');
    const hash = crypto.createHash('sha1');
    hash.update(seed);
    return hash.digest('hex');
  }
  /**
   * SHA-1 signature over the tender file and bid files of a payload.
   * Each present file contributes one "file_path|size|modified_at" line.
   * @param {object} [payload] - May carry tenderFile and a bidFiles array.
   * @returns {string} Hex digest of the newline-joined lines.
   */
  function createSignature(payload = {}) {
    const bidFiles = Array.isArray(payload.bidFiles) ? payload.bidFiles : [];
    const lines = [];
    for (const file of [payload.tenderFile, ...bidFiles]) {
      if (!file) continue;
      lines.push(`${file.file_path}|${file.size}|${file.modified_at}`);
    }
    return crypto.createHash('sha1').update(lines.join('\n')).digest('hex');
  }
  /**
   * Turn any metadata value into a clean single-line string.
   * - null/undefined -> ''
   * - Date -> ISO string
   * - Array -> non-empty normalized items joined with ';'
   * - other objects -> JSON
   * - everything else -> NFKC-normalized text with control/invisible characters
   *   removed, whitespace runs collapsed to one space, and ends trimmed.
   * @param {*} value
   * @returns {string}
   */
  function normalizeValue(value) {
    if (value == null) return '';
    if (value instanceof Date) return value.toISOString();
    if (Array.isArray(value)) {
      const parts = [];
      for (const item of value) {
        const text = normalizeValue(item);
        if (text) parts.push(text);
      }
      return parts.join(';');
    }
    if (typeof value === 'object') return JSON.stringify(value);

    const invisible = /[\u0000-\u001f\u007f-\u009f\u200b-\u200f\u202a-\u202e\ufeff]/g;
    const composed = String(value).normalize('NFKC');
    // Control characters (including tab/newline) are removed before whitespace is collapsed.
    const visible = composed.replace(invisible, '');
    return visible.replace(/\s+/g, ' ').trim();
  }
  /**
   * Canonical form of a value for equality comparison.
   * Placeholder texts yield ''; anything the Date constructor can parse becomes
   * an ISO timestamp; other text is lower-cased with whitespace and invisible
   * characters removed.
   * @param {*} value
   * @returns {string}
   */
  function normalizeComparable(value) {
    const lowered = normalizeValue(value).toLowerCase();
    const placeholders = ['没有提及', '原文未提及', '-', '无', 'null', 'undefined'];
    if (lowered === '' || placeholders.includes(lowered)) return '';

    // Strip a leading "D:" and rewrite a trailing +08'00' offset as +08:00 before parsing.
    const dateCandidate = lowered
      .replace(/^d:/i, '')
      .replace(/([+-]\d{2})'(\d{2})'$/, '$1:$2');
    const parsed = new Date(dateCandidate);
    if (Number.isNaN(parsed.getTime())) {
      return lowered.replace(/[\s \u0000-\u001f\u007f-\u009f\u200b-\u200f\u202a-\u202e\ufeff]+/g, '');
    }
    return parsed.toISOString();
  }
  /**
   * Reduce a date-like value to a YYYY-MM-DD day string.
   *
   * Values the Date constructor can parse are converted through toISOString,
   * so the day is the UTC day of that instant. Otherwise the first
   * year/month/day sequence (separated by -, /, ., 年 or 月) is extracted.
   *
   * The fallback now zero-pads month and day so that it yields the same shape
   * as the ISO branch ("2020-01-02", not "2020-1-2"); previously the two
   * branches could never compare equal for single-digit months or days.
   *
   * @param {*} value - Raw metadata value.
   * @returns {string} YYYY-MM-DD, or '' when no date can be recognised.
   */
  function normalizeDateDay(value) {
    const text = normalizeValue(value);
    if (!text) return '';

    // Strip a leading "D:" and rewrite a trailing +08'00' offset as +08:00 before parsing.
    const date = new Date(text.replace(/^d:/i, '').replace(/([+-]\d{2})'(\d{2})'$/, '$1:$2'));
    if (!Number.isNaN(date.getTime())) return date.toISOString().slice(0, 10);

    const match = text.match(/(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})/);
    if (!match) return '';
    const [, year, month, day] = match;
    return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
  }
  /**
   * Store a normalized value under key, overwriting any existing entry.
   * Values that normalize to '' are ignored.
   * @param {Map<string, string>} fields
   * @param {string} key
   * @param {*} value
   */
  function addField(fields, key, value) {
    const normalized = normalizeValue(value);
    if (normalized) fields.set(key, normalized);
  }
  /**
   * Like addField, but leaves an existing entry for key untouched.
   * @param {Map<string, string>} fields
   * @param {string} key
   * @param {*} value
   */
  function addFieldIfAbsent(fields, key, value) {
    if (!fields.has(key)) addField(fields, key, value);
  }
  /**
   * Append a normalized value to a ';'-separated list stored under key.
   * Empty values and values already present in the list are skipped.
   * @param {Map<string, string>} fields
   * @param {string} key
   * @param {*} value
   */
  function addListField(fields, key, value) {
    const incoming = normalizeValue(value);
    if (!incoming) return;

    const existing = fields.get(key);
    if (!existing) {
      fields.set(key, incoming);
      return;
    }

    const alreadyListed = existing.split(';').some((part) => part.trim() === incoming);
    if (!alreadyListed) fields.set(key, `${existing};${incoming}`);
  }
  /**
   * Build a key-safe slug: lower-cased, runs of characters other than
   * a-z, 0-9 and CJK (U+4E00–U+9FA5) replaced by '_', outer underscores
   * trimmed, at most 80 characters.
   * @param {*} value
   * @returns {string} The slug, or 'field' when nothing is left.
   */
  function safeMetadataKey(value) {
    const lowered = normalizeValue(value).toLowerCase();
    const underscored = lowered.replace(/[^a-z0-9\u4e00-\u9fa5]+/gi, '_');
    const slug = underscored.replace(/^_+|_+$/g, '').slice(0, 80);
    return slug || 'field';
  }
  /**
   * Human-readable form of a metadata key: runs of '_' and ':' become a space.
   * @param {*} key
   * @returns {string} The spaced text, or the raw key string if nothing remains.
   */
  function formatMetadataKey(key) {
    const raw = String(key || '');
    const spaced = raw.replace(/[_:]+/g, ' ').trim();
    return spaced === '' ? raw : spaced;
  }
  /**
   * Display label for a metadata key.
   * Known keys come from metadataLabels; prefixed keys (converted_docx:, custom:,
   * pdf_info:, pdf_xmp:, pdf_raw:, ole_signal:, wps:) and the ':base64_decoded'
   * suffix get a composed label; anything else is formatted from the key itself.
   * @param {string} key
   * @returns {string}
   */
  function getMetadataLabel(key) {
    const known = metadataLabels[key];
    if (known) return known;

    if (key.startsWith('converted_docx:')) {
      const inner = key.slice('converted_docx:'.length);
      return `转换 DOCX:${getMetadataLabel(inner)}`;
    }

    const decodedSuffix = ':base64_decoded';
    const hasDecodedSuffix = key.endsWith(decodedSuffix);

    if (key.startsWith('custom:')) {
      if (hasDecodedSuffix) {
        return `自定义:${key.slice('custom:'.length, -decodedSuffix.length)}(Base64 解码)`;
      }
      return `自定义:${key.slice('custom:'.length)}`;
    }

    if (hasDecodedSuffix) {
      return `${getMetadataLabel(key.slice(0, -decodedSuffix.length))}(Base64 解码)`;
    }

    // Remaining prefixes share one shape: label text + formatted remainder.
    const prefixLabels = [
      ['pdf_info:', 'PDF Info:'],
      ['pdf_xmp:', 'PDF XMP:'],
      ['pdf_raw:', 'PDF 原始记录:'],
      ['ole_signal:', 'OLE 疑似痕迹:'],
      ['wps:', '疑似 WPS 用户/账号:'],
    ];
    for (const [prefix, label] of prefixLabels) {
      if (key.startsWith(prefix)) return `${label}${formatMetadataKey(key.slice(prefix.length))}`;
    }
    return formatMetadataKey(key);
  }
  /**
   * Whether a metadata key holds a date.
   * True for the exact names in dateComparableKeys, and for keys containing a
   * date-like token delimited by ':' or '_' (or the key's ends). Keys that
   * look like "last modified by" are excluded.
   * @param {*} key
   * @returns {boolean}
   */
  function isDateComparableKey(key) {
    if (dateComparableKeys.has(key)) return true;

    const lowered = String(key || '').toLowerCase();
    const looksLikeModifiedBy = /last[_-]?modified[_-]?by|lastmodifiedby/.test(lowered);
    if (looksLikeModifiedBy) return false;

    const dateToken = /(^|[:_])(created|modified|last_printed|creationdate|moddate|createdate|modifydate|metadatadate|lastsaved|lastprinted)([:_]|$)/;
    return dateToken.test(lowered);
  }
  /**
   * Whether a metadata key takes part in comparison: an exact name from
   * comparableKeys, a date key, or a key with one of the comparable prefixes.
   * @param {string} key
   * @returns {boolean}
   */
  function isComparableKey(key) {
    if (comparableKeys.has(key)) return true;
    if (isDateComparableKey(key)) return true;

    const comparablePrefixes = [
      'custom:',
      'converted_docx:',
      'pdf_info:',
      'pdf_xmp:',
      'pdf_raw:',
      'ole_signal:',
      'wps:',
    ];
    return comparablePrefixes.some((prefix) => key.startsWith(prefix));
  }
  /**
   * Decode a value that looks like Base64-encoded text.
   *
   * A candidate must be at least 12 characters long, have a length divisible
   * by 4, and consist only of Base64 alphabet characters with up to two '='
   * pads. The decoded bytes are read as UTF-8, a leading BOM is dropped and
   * the result is trimmed.
   *
   * The decode is rejected when it is empty, equals the input, contains
   * control characters, or contains U+FFFD. Buffer#toString('utf8')
   * substitutes U+FFFD for bytes that are not valid UTF-8, so its presence
   * means the input was not encoded text; such garbage used to be accepted.
   *
   * @param {*} value - Raw metadata value.
   * @returns {string} Compact JSON when the decoded text parses as JSON, the
   *   decoded text otherwise, or '' when the value is not decodable text.
   */
  function tryDecodeBase64Text(value) {
    const text = normalizeValue(value);
    const looksLikeBase64 = text.length >= 12
      && text.length % 4 === 0
      && /^[A-Za-z0-9+/]+={0,2}$/.test(text);
    if (!looksLikeBase64) return '';

    let decoded;
    try {
      decoded = Buffer.from(text, 'base64').toString('utf8').replace(/^\uFEFF/, '').trim();
    } catch {
      // Best effort: a value that cannot be decoded simply has no decoded form.
      return '';
    }

    if (!decoded || decoded === text) return '';
    if (/[\u0000-\u0008\u000b\u000c\u000e-\u001f\ufffd]/.test(decoded)) return '';

    try {
      // Re-serialise so equivalent JSON payloads compare equal regardless of spacing.
      return JSON.stringify(JSON.parse(decoded));
    } catch {
      return decoded;
    }
  }
  /**
   * For every field whose value decodes as Base64 text, add a companion
   * "<key>:base64_decoded" field. Keys that already carry that suffix are
   * skipped. A snapshot is iterated so additions are not revisited.
   * @param {Map<string, string>} fields
   */
  function addDecodedBase64Fields(fields) {
    const snapshot = [...fields];
    snapshot.forEach(([key, value]) => {
      if (key.endsWith(':base64_decoded')) return;
      const decoded = tryDecodeBase64Text(value);
      if (decoded) addField(fields, `${key}:base64_decoded`, decoded);
    });
  }
  /**
   * Text content of the first element named tagName, with or without a
   * namespace prefix, matched case-insensitively and XML-decoded.
   *
   * Fixes over the previous pattern:
   * - the name must be the whole local name, so 'Characters' no longer matches
   *   <CharactersWithSpaces> and 'modified' no longer matches <lastModifiedBy>;
   * - tagName is escaped, so it is always treated as a literal name;
   * - self-closing elements (<Manager/>) are not taken as an opening tag.
   *
   * @param {string} xml - XML source; falsy values are treated as ''.
   * @param {string} tagName - Local element name (literal, not a pattern).
   * @returns {string} Decoded inner text, or '' when no such element exists.
   */
  function xmlText(xml, tagName) {
    const name = String(tagName).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const prefix = '(?:[^\\s:<>/]+:)?';
    const pattern = new RegExp(
      // (?=[\s/>]) ends the name; (?<!/) rejects a self-closing tag.
      `<${prefix}${name}(?=[\\s/>])[^>]*(?<!/)>([\\s\\S]*?)<\\/${prefix}${name}\\s*>`,
      'i',
    );
    const match = String(xml || '').match(pattern);
    return match ? decodeXml(match[1]) : '';
  }
  /**
   * Decode XML text: unwrap CDATA sections, resolve the five predefined
   * entities and numeric character references (&#NNN; / &#xHHH;), then trim.
   *
   * Entities are resolved in a single pass, so already-decoded output is never
   * decoded again ("&amp;lt;" yields "&lt;", "&amp;#65;" yields "&#65;").
   * Numeric references were previously left undecoded.
   *
   * @param {*} value - Raw XML text; falsy values are treated as ''.
   * @returns {string}
   */
  function decodeXml(value) {
    const namedEntities = { lt: '<', gt: '>', quot: '"', apos: "'", amp: '&' };
    return String(value || '')
      .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1')
      .replace(/&(lt|gt|quot|apos|amp|#\d+|#[xX][0-9a-fA-F]+);/g, (entity, body) => {
        if (body[0] !== '#') return namedEntities[body];
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // Leave references outside the Unicode range as written instead of throwing.
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
      })
      .trim();
  }
  /**
   * Read a zip entry as UTF-8 text.
   * @param {object} zip - Archive exposing getEntry(name).
   * @param {string} entryName
   * @returns {string} The entry's data as text, or '' when the entry is missing.
   */
  function readZipText(zip, entryName) {
    const entry = zip.getEntry(entryName);
    if (!entry) return '';
    const data = entry.getData();
    return data.toString('utf8');
  }
  /**
   * Property id -> metadata key for the SummaryInformation property set.
   * `kind` marks values that parsePropertySet formats specially.
   */
  const SUMMARY_PROPERTY_MAP = {
    0x01: { key: 'code_page' },

    // Descriptive text
    0x02: { key: 'title' },
    0x03: { key: 'subject' },
    0x04: { key: 'author' },
    0x05: { key: 'keywords' },
    0x06: { key: 'description' },
    0x07: { key: 'template' },
    0x08: { key: 'last_modified_by' },
    0x09: { key: 'revision' },

    // Times
    0x0a: { key: 'total_time', kind: 'duration_filetime' },
    0x0b: { key: 'last_printed' },
    0x0c: { key: 'created' },
    0x0d: { key: 'modified' },

    // Counts
    0x0e: { key: 'pages' },
    0x0f: { key: 'words' },
    0x10: { key: 'characters' },

    0x12: { key: 'application' },
    0x13: { key: 'doc_security' },
  };
  /**
   * Property id -> metadata key for the DocumentSummaryInformation property set.
   * `kind` marks values that parsePropertySet formats specially.
   */
  const DOC_SUMMARY_PROPERTY_MAP = {
    0x01: { key: 'code_page' },
    0x02: { key: 'category' },
    0x03: { key: 'presentation_format' },

    // Counts
    0x04: { key: 'bytes' },
    0x05: { key: 'lines' },
    0x06: { key: 'paragraphs' },
    0x07: { key: 'slides' },
    0x08: { key: 'notes' },
    0x09: { key: 'hidden_slides' },
    0x0a: { key: 'multimedia_clips' },

    0x0b: { key: 'scale_crop' },
    0x0c: { key: 'heading_pairs' },
    0x0d: { key: 'titles_of_parts' },
    0x0e: { key: 'manager' },
    0x0f: { key: 'company' },
    0x10: { key: 'links_dirty' },
    0x11: { key: 'characters_with_spaces' },
    0x13: { key: 'shared_doc' },
    0x16: { key: 'hlinks_changed' },
    0x17: { key: 'app_version', kind: 'version' },
    0x1a: { key: 'content_type' },
    0x1b: { key: 'content_status' },
    0x1c: { key: 'language' },
    0x1d: { key: 'document_version' },
  };
  /**
   * Round an offset up to the next multiple of 4 (unchanged if already aligned).
   * @param {number} value
   * @returns {number}
   */
  function align4(value) {
    const remainder = value % 4;
    const padding = (4 - remainder) % 4;
    return value + padding;
  }
  /**
   * Read an unsigned 16-bit little-endian integer.
   * @param {Buffer} buffer
   * @param {number} offset
   * @returns {number} The value, or 0 when the read would run past the end.
   */
  function readUInt16LE(buffer, offset) {
    const fits = offset + 2 <= buffer.length;
    if (!fits) return 0;
    return buffer.readUInt16LE(offset);
  }
  /**
   * Read a signed 16-bit little-endian integer.
   * @param {Buffer} buffer
   * @param {number} offset
   * @returns {number} The value, or 0 when the read would run past the end.
   */
  function readInt16LE(buffer, offset) {
    const fits = offset + 2 <= buffer.length;
    if (!fits) return 0;
    return buffer.readInt16LE(offset);
  }
  /**
   * Read an unsigned 32-bit little-endian integer.
   * @param {Buffer} buffer
   * @param {number} offset
   * @returns {number} The value, or 0 when the read would run past the end.
   */
  function readUInt32LE(buffer, offset) {
    const fits = offset + 4 <= buffer.length;
    if (!fits) return 0;
    return buffer.readUInt32LE(offset);
  }
  /**
   * Read a signed 32-bit little-endian integer.
   * @param {Buffer} buffer
   * @param {number} offset
   * @returns {number} The value, or 0 when the read would run past the end.
   */
  function readInt32LE(buffer, offset) {
    const fits = offset + 4 <= buffer.length;
    if (!fits) return 0;
    return buffer.readInt32LE(offset);
  }
  /**
   * Map a Windows code page number to an iconv-lite encoding name.
   *
   * Fixes over the previous mapping:
   * - The code page of an OLE property set is stored as a signed 16-bit value,
   *   so 65001 (UTF-8) arrives as -535 and 54936 (GB18030) as -10600. Negative
   *   inputs are reinterpreted as unsigned 16-bit instead of falling through
   *   to latin1.
   * - 1201 is UTF-16 big-endian and now maps to 'utf16be' rather than 'utf16le'.
   *
   * @param {*} codePage - Code page number; falsy or non-numeric means 1252.
   * @returns {string} Encoding name; 'latin1' for unrecognised code pages.
   */
  function codePageToEncoding(codePage) {
    const parsed = Number(codePage) || 1252;
    const value = parsed < 0 ? parsed & 0xffff : parsed;

    switch (value) {
      case 936:
      case 54936:
        return 'gb18030';
      case 950:
        return 'big5';
      case 932:
        return 'shift_jis';
      case 949:
        return 'euc-kr';
      case 65001:
        return 'utf8';
      case 1200:
        return 'utf16le';
      case 1201:
        return 'utf16be';
      default:
        break;
    }

    if (value >= 1250 && value <= 1258) return `windows${value}`;
    return 'latin1';
  }
  /**
   * Decode bytes using the encoding that corresponds to a code page.
   * @param {Buffer} buffer
   * @param {*} codePage
   * @returns {string} iconv-decoded text; latin1 text if iconv throws.
   */
  function decodeCodePageBuffer(buffer, codePage) {
    const encodingName = codePageToEncoding(codePage);
    let decoded;
    try {
      decoded = iconv.decode(buffer, encodingName);
    } catch {
      decoded = buffer.toString('latin1');
    }
    return decoded;
  }
  /**
   * Remove every NUL character from a string and trim it.
   * @param {*} value - Falsy values are treated as ''.
   * @returns {string}
   */
  function cleanOleString(value) {
    const raw = String(value || '');
    return raw.split('\u0000').join('').trim();
  }
  /**
   * Read a 64-bit FILETIME (100 ns ticks since 1601-01-01) as an ISO string.
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the low 32-bit word.
   * @returns {string} ISO timestamp, or '' for a zero or unrepresentable value.
   */
  function parseFileTimeValue(buffer, offset) {
    const lowWord = readUInt32LE(buffer, offset);
    const highWord = readUInt32LE(buffer, offset + 4);
    if (lowWord === 0 && highWord === 0) return '';

    const ticks = (BigInt(highWord) << 32n) | BigInt(lowWord);
    // 10 000 ticks per millisecond; 11 644 473 600 000 ms between 1601 and 1970.
    const epochMs = Number(ticks / 10000n - 11644473600000n);
    const date = new Date(epochMs);
    return Number.isNaN(date.getTime()) ? '' : date.toISOString();
  }
  /**
   * Read a 64-bit tick count (100 ns units) as a human-readable duration.
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the low 32-bit word.
   * @returns {string} "N 秒", "N 分钟", "H 小时" or "H 小时 M 分钟"; '' when the
   *   duration is zero or shorter than one second.
   */
  function parseFileTimeDuration(buffer, offset) {
    const lowWord = readUInt32LE(buffer, offset);
    const highWord = readUInt32LE(buffer, offset + 4);
    const ticks = (BigInt(highWord) << 32n) | BigInt(lowWord);
    if (ticks === 0n) return '';

    const totalSeconds = Number(ticks / 10000000n);
    if (!(Number.isFinite(totalSeconds) && totalSeconds > 0)) return '';
    if (totalSeconds < 60) return `${totalSeconds} 秒`;

    const totalMinutes = Math.round(totalSeconds / 60);
    if (totalMinutes < 60) return `${totalMinutes} 分钟`;

    const hours = Math.floor(totalMinutes / 60);
    const minutes = totalMinutes % 60;
    if (minutes === 0) return `${hours} 小时`;
    return `${hours} 小时 ${minutes} 分钟`;
  }
  /**
   * Parse a length-prefixed code-page string: a 32-bit byte count followed by
   * the bytes. The count is clamped to the bytes actually available.
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the length field.
   * @param {*} codePage - Code page used to decode the bytes.
   * @param {boolean} [padded=true] - Align nextOffset up to a multiple of 4.
   * @returns {{value: string, nextOffset: number}}
   */
  function parseLpstr(buffer, offset, codePage, padded = true) {
    const declaredLength = readUInt32LE(buffer, offset);
    const dataStart = offset + 4;
    const available = buffer.length - dataStart;
    const size = Math.max(0, Math.min(declaredLength, available));
    const dataEnd = dataStart + size;
    const text = decodeCodePageBuffer(buffer.subarray(dataStart, dataEnd), codePage);
    return {
      value: cleanOleString(text),
      nextOffset: padded ? align4(dataEnd) : dataEnd,
    };
  }
  /**
   * Parse a length-prefixed UTF-16LE string: a 32-bit character count followed
   * by two bytes per character. The size is clamped to the bytes available.
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the length field.
   * @param {boolean} [padded=true] - Align nextOffset up to a multiple of 4.
   * @returns {{value: string, nextOffset: number}}
   */
  function parseLpwstr(buffer, offset, padded = true) {
    const declaredChars = readUInt32LE(buffer, offset);
    const dataStart = offset + 4;
    const available = buffer.length - dataStart;
    const size = Math.max(0, Math.min(declaredChars * 2, available));
    const dataEnd = dataStart + size;
    const text = buffer.subarray(dataStart, dataEnd).toString('utf16le');
    return {
      value: cleanOleString(text),
      nextOffset: padded ? align4(dataEnd) : dataEnd,
    };
  }
  /**
   * Parse a vector of strings: a 32-bit element count followed by the
   * elements. Type 0x101f elements are padded UTF-16 strings; any other type
   * is read as unpadded code-page strings. Empty strings are dropped.
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the element count.
   * @param {number} type - Property type of the vector.
   * @param {*} codePage
   * @returns {{value: string[], nextOffset: number}}
   */
  function parseVectorStringValue(buffer, offset, type, codePage) {
    const total = readUInt32LE(buffer, offset);
    const isUnicode = type === 0x101f;
    const strings = [];
    let position = offset + 4;
    let remaining = total;
    // Stop early when the declared count exceeds what the buffer holds.
    while (remaining > 0 && position < buffer.length) {
      const item = isUnicode
        ? parseLpwstr(buffer, position, true)
        : parseLpstr(buffer, position, codePage, false);
      if (item.value) strings.push(item.value);
      position = item.nextOffset;
      remaining -= 1;
    }
    return { value: strings, nextOffset: position };
  }
  /**
   * Parse a vector of typed values: a 32-bit element count followed by
   * elements that each carry their own type. Elements whose value is '' are
   * dropped.
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the element count.
   * @param {*} codePage
   * @returns {{value: Array, nextOffset: number}}
   */
  function parseVectorVariantValue(buffer, offset, codePage) {
    const total = readUInt32LE(buffer, offset);
    const items = [];
    let position = offset + 4;
    let remaining = total;
    // Stop early when the declared count exceeds what the buffer holds.
    while (remaining > 0 && position < buffer.length) {
      const item = parseTypedPropertyValue(buffer, position, codePage);
      if (item.value !== '') items.push(item.value);
      position = item.nextOffset;
      remaining -= 1;
    }
    return { value: items, nextOffset: position };
  }
  /**
   * Parse one typed property value: a 16-bit type code (in a 4-byte slot)
   * followed by the type-specific payload.
   *
   * Fix: the 8-byte double (0x05) is now bounds-checked. The previous code
   * called buffer.readDoubleLE directly and threw RangeError on truncated or
   * corrupt streams, while every other branch used the safe readers.
   *
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the type code.
   * @param {*} [codePage=1252] - Code page for 8-bit strings.
   * @returns {{type: number, value: *, nextOffset: number}} value is '' for
   *   empty, unreadable or unsupported payloads.
   */
  function parseTypedPropertyValue(buffer, offset, codePage = 1252) {
    const type = readUInt16LE(buffer, offset);
    const valueOffset = offset + 4;
    if (!type || valueOffset > buffer.length) return { type, value: '', nextOffset: valueOffset };

    switch (type) {
      case 0x02: // 16-bit signed integer
        return { type, value: readInt16LE(buffer, valueOffset), nextOffset: align4(valueOffset + 2) };
      case 0x03: // 32-bit signed integer
        return { type, value: readInt32LE(buffer, valueOffset), nextOffset: valueOffset + 4 };
      case 0x05: { // 64-bit float
        const fits = valueOffset + 8 <= buffer.length;
        return { type, value: fits ? buffer.readDoubleLE(valueOffset) : '', nextOffset: valueOffset + 8 };
      }
      case 0x0b: // boolean: any non-zero word is true
        return { type, value: readUInt32LE(buffer, valueOffset) !== 0, nextOffset: valueOffset + 4 };
      case 0x13: // 32-bit unsigned integer
        return { type, value: readUInt32LE(buffer, valueOffset), nextOffset: valueOffset + 4 };
      case 0x1e: // code-page string
        return { type, ...parseLpstr(buffer, valueOffset, codePage, true) };
      case 0x1f: // UTF-16 string
      case 0x50: // read as a padded UTF-16 string
        return { type, ...parseLpwstr(buffer, valueOffset, true) };
      case 0x51: // read as an unpadded UTF-16 string
        return { type, ...parseLpwstr(buffer, valueOffset, false) };
      case 0x40: // FILETIME
        return { type, value: parseFileTimeValue(buffer, valueOffset), nextOffset: valueOffset + 8 };
      case 0x101e: // vector of code-page strings
      case 0x101f: // vector of UTF-16 strings
        return { type, ...parseVectorStringValue(buffer, valueOffset, type, codePage) };
      case 0x100c: // vector of typed values
        return { type, ...parseVectorVariantValue(buffer, valueOffset, codePage) };
      case 0x41: { // blob: only its size is reported
        const size = readUInt32LE(buffer, valueOffset);
        return { type, value: size ? `BLOB ${size} bytes` : '', nextOffset: align4(valueOffset + 4 + size) };
      }
      default:
        return { type, value: '', nextOffset: valueOffset + 4 };
    }
  }
  /**
   * Parse a property-set dictionary (property id -> custom property name).
   *
   * Layout: a 32-bit entry count, then per entry a 32-bit property id, a
   * 32-bit name length and the name. For code page 1200 the length counts
   * UTF-16 characters; otherwise it counts bytes.
   *
   * Fix: per MS-OLEPS only Unicode (code page 1200) names are zero-padded to a
   * multiple of 4 bytes; 8-bit names are not padded. The previous code aligned
   * after every entry, which misread every name following an 8-bit name whose
   * end was not 4-byte aligned.
   *
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the entry count.
   * @param {number} codePage - Code page of the property set.
   * @returns {Map<number, string>} Names with a leading U+0005 replaced by '!'.
   */
  function parsePropertyDictionary(buffer, offset, codePage) {
    const count = readUInt32LE(buffer, offset);
    const isUnicode = codePage === 1200;
    const dictionary = new Map();
    let cursor = offset + 4;
    for (let index = 0; index < count && cursor + 8 <= buffer.length; index += 1) {
      const propertyId = readUInt32LE(buffer, cursor);
      const length = readUInt32LE(buffer, cursor + 4);
      cursor += 8;
      let byteLength = isUnicode ? length * 2 : length;
      // Clamp a name that claims to extend past the end of the buffer.
      if (cursor + byteLength > buffer.length) byteLength = Math.max(0, Math.min(length, buffer.length - cursor));
      const raw = buffer.subarray(cursor, cursor + byteLength);
      const value = isUnicode ? cleanOleString(raw.toString('utf16le')) : cleanOleString(decodeCodePageBuffer(raw, codePage));
      if (value) dictionary.set(propertyId, value.replace(/^\u0005/, '!'));
      cursor = isUnicode ? align4(cursor + byteLength) : cursor + byteLength;
    }
    return dictionary;
  }
  /**
   * Format a packed 32-bit version (major in the high word, minor in the low
   * word) as "major.mmmm" with the minor part zero-padded to four digits.
   *
   * Fix: an empty or null value (no version was parsed) now stays empty
   * instead of being coerced to 0 and reported as "0.0000".
   *
   * @param {*} value - Packed version number.
   * @returns {*} The formatted string, '' for '' or null, or the input
   *   unchanged when it is not a finite number.
   */
  function formatVersionNumber(value) {
    if (value === '' || value === null) return '';
    const number = Number(value);
    if (!Number.isFinite(number)) return value;
    const major = number >>> 16;
    const minor = number & 0xffff;
    return `${major}.${String(minor).padStart(4, '0')}`;
  }
  /**
   * Parse one property set (section) into a Map of metadata fields.
   *
   * The section starts with its size and property count, followed by a
   * directory of (id, relative offset) pairs. Property 0x01 supplies the code
   * page and property 0x00 the dictionary of custom names. Ids found in
   * propertyMap use its key; ids found only in the dictionary become
   * "custom:<name>"; everything else becomes "ole_prop_<id>".
   *
   * @param {Buffer} buffer
   * @param {number} offset - Offset of the section.
   * @param {object} [propertyMap] - id -> { key, kind? }.
   * @returns {Map<string, string>}
   */
  function parsePropertySet(buffer, offset, propertyMap = {}) {
    const declaredSize = readUInt32LE(buffer, offset);
    const declaredCount = readUInt32LE(buffer, offset + 4);

    // Directory of 8-byte (id, offset) pairs, bounded by the buffer.
    const directory = [];
    for (let slot = offset + 8, seen = 0; seen < declaredCount && slot + 8 <= buffer.length; slot += 8, seen += 1) {
      directory.push({
        id: readUInt32LE(buffer, slot),
        offset: offset + readUInt32LE(buffer, slot + 4),
      });
    }

    let codePage = 1252;
    const codePageEntry = directory.find((item) => item.id === 0x01);
    if (codePageEntry) {
      const declaredCodePage = parseTypedPropertyValue(buffer, codePageEntry.offset, codePage).value;
      if (declaredCodePage) codePage = Number(declaredCodePage) || codePage;
    }

    const dictionaryEntry = directory.find((item) => item.id === 0x00);
    const customNames = dictionaryEntry
      ? parsePropertyDictionary(buffer, dictionaryEntry.offset, codePage)
      : new Map();

    const sectionEnd = declaredSize ? offset + declaredSize : buffer.length;
    const fields = new Map();
    for (const item of directory) {
      const isDictionary = item.id === 0x00;
      const outOfRange = item.offset >= sectionEnd || item.offset >= buffer.length;
      if (isDictionary || outOfRange) continue;

      const info = propertyMap[item.id];
      const parsed = parseTypedPropertyValue(buffer, item.offset, codePage);

      let value = parsed.value;
      if (info?.kind === 'duration_filetime' && parsed.type === 0x40) {
        // Same bytes as a FILETIME, but interpreted as an elapsed time.
        value = parseFileTimeDuration(buffer, item.offset + 4);
      }
      if (info?.kind === 'version') value = formatVersionNumber(value);

      const customName = customNames.get(item.id);
      const fallbackName = customName ? `custom:${customName}` : `ole_prop_${item.id}`;
      addField(fields, info?.key || fallbackName, value);
    }
    return fields;
  }
  /**
   * Parse a property-set stream into a Map of metadata fields.
   * The stream must be at least 48 bytes and start with the 0xFFFE marker.
   * propertyMap applies to the first section only; later sections are parsed
   * with an empty map.
   * @param {Buffer|Array|Uint8Array} content
   * @param {object} propertyMap - id -> { key, kind? } for the first section.
   * @returns {Map<string, string>}
   */
  function parsePropertySetStream(content, propertyMap) {
    const buffer = Buffer.from(content || []);
    const fields = new Map();
    const hasValidHeader = buffer.length >= 48 && readUInt16LE(buffer, 0) === 0xfffe;
    if (!hasValidHeader) return fields;

    const sectionCount = readUInt32LE(buffer, 24);
    // Section descriptors are 20 bytes each (16-byte id + 32-bit offset), starting at byte 28.
    for (let section = 0, descriptor = 28; section < sectionCount && descriptor + 20 <= buffer.length; section += 1, descriptor += 20) {
      const sectionOffset = readUInt32LE(buffer, descriptor + 16);
      if (!sectionOffset || sectionOffset >= buffer.length) continue;
      const sectionFields = parsePropertySet(buffer, sectionOffset, section === 0 ? propertyMap : {});
      sectionFields.forEach((value, key) => addField(fields, key, value));
    }
    return fields;
  }
  /**
   * Find a stream in a CFB container, trying the name as given, with a
   * leading '/', and with a leading U+0005 replaced by '!'.
   * @param {object} cfb - Parsed CFB container.
   * @param {string} streamName
   * @returns {object|null} The first candidate entry that has content.
   */
  function findCfbEntry(cfb, streamName) {
    const bangName = streamName.replace(/^\u0005/, '!');
    const lookups = [streamName, `/${streamName}`, bangName, `/${bangName}`];

    let found = null;
    lookups.some((candidate) => {
      const entry = CFB.find(cfb, candidate);
      if (!entry?.content) return false;
      found = entry;
      return true;
    });
    return found;
  }
  /** Keywords hinting at WPS/Kingsoft software or at account/e-mail identifiers. */
  const rawSignalPattern = /(kingsoft|wps office|\bwps\b|\bkso\b|account|e-mail|email|mail|userid|user id|user_id|uid|账号|金山)/ig;

  /**
   * Collect short context snippets around keyword hits in a text.
   * Each snippet spans 40 characters before the hit to 80 after it, with
   * whitespace collapsed. Unreadable and duplicate snippets are skipped.
   * @param {*} value - Text to scan; falsy values are treated as ''.
   * @param {number} [limit=5] - Maximum number of snippets.
   * @returns {string[]}
   */
  function collectSignalSnippets(value, limit = 5) {
    const text = String(value || '')
      .replace(/\u0000/g, '')
      .replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]+/g, ' ');
    const snippets = [];

    // The pattern is global and shared, so restart it for every scan.
    rawSignalPattern.lastIndex = 0;
    for (
      let hit = rawSignalPattern.exec(text);
      hit && snippets.length < limit;
      hit = rawSignalPattern.exec(text)
    ) {
      const hitStart = hit.index || 0;
      const from = Math.max(0, hitStart - 40);
      const to = Math.min(text.length, hitStart + hit[0].length + 80);
      const snippet = text.slice(from, to).replace(/\s+/g, ' ').trim();
      if (isReadableSignalSnippet(snippet) && !snippets.includes(snippet)) snippets.push(snippet);
    }
    return snippets;
  }
  /**
   * Whether a snippet looks like human-readable text: non-empty, free of the
   * replacement character, and at least 75% of its characters are Han,
   * ASCII letters/digits, whitespace or common punctuation.
   * @param {*} value
   * @returns {boolean}
   */
  function isReadableSignalSnippet(value) {
    const text = String(value || '').trim();
    if (text === '' || text.includes('�')) return false;

    const readableChar = /[\p{Script=Han}A-Za-z0-9\s.,:;_@/\\\-()[\]{}"',。:;()【】《》、]/u;
    let total = 0;
    let readable = 0;
    // Iterate by code point so surrogate pairs count as one character.
    for (const char of text) {
      total += 1;
      if (readableChar.test(char)) readable += 1;
    }
    return readable / Math.max(total, 1) >= 0.75;
  }
  /**
   * Scan the first 1 MiB of binary content for keyword snippets, decoding it
   * as UTF-16LE, code page 936 and UTF-8 in turn.
   * @param {Buffer|Array|Uint8Array} content
   * @returns {string[]} Up to 5 distinct snippets (at most 3 per decoding).
   */
  function collectBinarySignalSnippets(content) {
    const head = Buffer.from(content || []).subarray(0, 1024 * 1024);
    const decodings = [
      head.toString('utf16le'),
      decodeCodePageBuffer(head, 936),
      head.toString('utf8'),
    ];

    const collected = [];
    for (const decoded of decodings) {
      const found = collectSignalSnippets(decoded, 3);
      for (const snippet of found) {
        if (!collected.includes(snippet)) collected.push(snippet);
        if (collected.length >= 5) return collected;
      }
    }
    return collected;
  }
  /**
   * Add "ole_signal:<path>" list fields for every CFB entry whose path or
   * content contains keyword snippets. Content of the property-set streams
   * (SummaryInformation / DocumentSummaryInformation) is not scanned.
   * @param {Map<string, string>} fields
   * @param {object} cfb - Parsed CFB container with FileIndex and FullPaths.
   */
  function addOleSignalFields(fields, cfb) {
    cfb.FileIndex.forEach((entry, index) => {
      const fullPath = cfb.FullPaths[index] || entry.name || `stream_${index}`;
      const signalKey = `ole_signal:${safeMetadataKey(fullPath)}`;

      collectSignalSnippets(fullPath, 2).forEach((snippet) => addListField(fields, signalKey, snippet));

      const hasContent = Boolean(entry?.content?.length);
      if (!hasContent || isOlePropertySetStreamName(fullPath)) return;
      collectBinarySignalSnippets(entry.content).forEach((snippet) => addListField(fields, signalKey, snippet));
    });
  }
  /**
   * Whether a stream path names a SummaryInformation or
   * DocumentSummaryInformation stream. Only the last path segment is tested,
   * after removing a leading U+0005 or '!'; the match is case-insensitive.
   * @param {*} value
   * @returns {boolean}
   */
  function isOlePropertySetStreamName(value) {
    const lastSegment = String(value || '').replace(/^.*[\\/]/, '');
    const baseName = lastSegment.replace(/^\u0005|^!/, '');
    return /(?:summaryinformation|documentsummaryinformation)$/i.test(baseName);
  }
  /**
   * Mirror every field whose key or value contains a signal keyword into a
   * "wps:<key>" list field. Existing "wps:" fields are not re-examined, and a
   * snapshot is iterated so additions are not revisited.
   * @param {Map<string, string>} fields
   */
  function addWpsSignalFields(fields) {
    const snapshot = [...fields];
    snapshot.forEach(([key, value]) => {
      if (key.startsWith('wps:')) return;
      const hasSignal = collectSignalSnippets(`${key} ${value}`, 1).length > 0;
      if (hasSignal) addListField(fields, `wps:${safeMetadataKey(key)}`, value);
    });
  }
  /**
   * Extract metadata from a DOCX package: core properties (docProps/core.xml),
   * application properties (docProps/app.xml) and custom properties
   * (docProps/custom.xml, stored as "custom:<name>"), followed by the derived
   * "wps:" signal fields.
   * @param {string} filePath - Path of the .docx file.
   * @returns {Promise<Map<string, string>>}
   */
  async function extractDocxMetadata(filePath) {
    const zip = new AdmZip(filePath);
    const core = readZipText(zip, 'docProps/core.xml');
    const app = readZipText(zip, 'docProps/app.xml');
    const custom = readZipText(zip, 'docProps/custom.xml');
    const fields = new Map();

    // [metadata key, element name] pairs, in insertion order.
    const coreTags = [
      ['title', 'title'],
      ['subject', 'subject'],
      ['author', 'creator'],
      ['last_modified_by', 'lastModifiedBy'],
      ['revision', 'revision'],
      ['created', 'created'],
      ['modified', 'modified'],
      ['keywords', 'keywords'],
      ['category', 'category'],
      ['description', 'description'],
    ];
    for (const [key, tag] of coreTags) addField(fields, key, xmlText(core, tag));

    const appTags = [
      ['application', 'Application'],
      ['app_version', 'AppVersion'],
      ['company', 'Company'],
      ['manager', 'Manager'],
      ['template', 'Template'],
      ['pages', 'Pages'],
      ['words', 'Words'],
      ['characters', 'Characters'],
      ['lines', 'Lines'],
      ['paragraphs', 'Paragraphs'],
      ['total_time', 'TotalTime'],
    ];
    for (const [key, tag] of appTags) addField(fields, key, xmlText(app, tag));

    const propertyPattern = /<property\b[^>]*\bname="([^"]+)"[^>]*>([\s\S]*?)<\/property>/gi;
    for (const [, rawName, body] of custom.matchAll(propertyPattern)) {
      // The value normally sits inside a typed child element; fall back to the raw body.
      const inner = body.match(/<[^>]+>([\s\S]*?)<\/[^>]+>/);
      addField(fields, `custom:${decodeXml(rawName)}`, decodeXml(inner ? inner[1] : body));
    }

    addWpsSignalFields(fields);
    return fields;
  }
  553. async function extractOleMetadata(filePath) {
  554. const buffer = await fs.readFile(filePath);
  555. const cfb = CFB.read(buffer, { type: 'buffer' });
  556. const fields = new Map();
  557. const summary = findCfbEntry(cfb, '\u0005SummaryInformation');
  558. const documentSummary = findCfbEntry(cfb, '\u0005DocumentSummaryInformation');
  559. if (summary) {
  560. for (const [key, value] of parsePropertySetStream(summary.content, SUMMARY_PROPERTY_MAP).entries()) addField(fields, key, value);
  561. }
  562. if (documentSummary) {
  563. for (const [key, value] of parsePropertySetStream(documentSummary.content, DOC_SUMMARY_PROPERTY_MAP).entries()) addField(fields, key, value);
  564. }
  565. addOleSignalFields(fields, cfb);
  566. addWpsSignalFields(fields);
  567. return fields;
  568. }
  569. async function extractConvertedDocxMetadata(filePath) {
  570. const converterUrl = pathToFileURL(path.join(__dirname, 'doc2markdown', 'convert.mjs')).href;
  571. const { withLegacyWordDocxFile } = await import(converterUrl);
  572. return withLegacyWordDocxFile(filePath, (docxPath) => extractDocxMetadata(docxPath));
  573. }
  574. function mergeMetadataFields(target, source, options = {}) {
  575. for (const [key, value] of source.entries()) {
  576. if (options.fillOnlyIfAbsent) addFieldIfAbsent(target, key, value);
  577. else addField(target, key, value);
  578. if (options.prefix) addField(target, `${options.prefix}:${key}`, value);
  579. }
  580. }
  581. async function hasZipHeader(filePath) {
  582. const handle = await fs.open(filePath, 'r');
  583. try {
  584. const buffer = Buffer.alloc(4);
  585. const result = await handle.read(buffer, 0, 4, 0);
  586. return result.bytesRead >= 4 && buffer[0] === 0x50 && buffer[1] === 0x4b && buffer[2] === 0x03 && buffer[3] === 0x04;
  587. } finally {
  588. await handle.close();
  589. }
  590. }
  591. async function extractLegacyWordMetadata(filePath) {
  592. const fields = new Map();
  593. const errors = [];
  594. try {
  595. mergeMetadataFields(fields, await extractOleMetadata(filePath));
  596. } catch (error) {
  597. errors.push(`OLE 元数据读取失败:${error.message || error}`);
  598. }
  599. try {
  600. mergeMetadataFields(fields, await extractConvertedDocxMetadata(filePath), {
  601. fillOnlyIfAbsent: true,
  602. prefix: 'converted_docx',
  603. });
  604. } catch (error) {
  605. errors.push(`转换 DOCX 元数据读取失败:${error.message || error}`);
  606. }
  607. if (errors.length) addListField(fields, 'metadata_error', errors.join(';'));
  608. addWpsSignalFields(fields);
  609. return fields;
  610. }
// Maps key names of the PDF `info` object to the canonical metadata field keys
// that addPdfInfoFields writes (in addition to the `pdf_info:`-prefixed copy).
const PDF_INFO_KEY_MAP = {
  Title: 'title',
  Author: 'author',
  Subject: 'subject',
  Keywords: 'keywords',
  Creator: 'creator',
  Producer: 'producer',
  CreationDate: 'created',
  ModDate: 'modified',
  PDFFormatVersion: 'pdf_version',
};
  622. function getPdfMetadataValue(metadata, ...names) {
  623. if (!metadata) return '';
  624. for (const name of names) {
  625. if (typeof metadata.get === 'function') {
  626. const value = metadata.get(name);
  627. if (normalizeValue(value)) return value;
  628. }
  629. if (Object.prototype.hasOwnProperty.call(metadata, name) && normalizeValue(metadata[name])) return metadata[name];
  630. }
  631. return '';
  632. }
  633. function getPdfMetadataEntries(metadata) {
  634. if (!metadata) return [];
  635. if (typeof metadata[Symbol.iterator] === 'function') return Array.from(metadata);
  636. return Object.entries(metadata).filter(([, value]) => normalizeValue(value));
  637. }
  638. function canonicalPdfXmpKey(rawKey) {
  639. const key = String(rawKey || '').toLowerCase();
  640. if (/(^|:)title$/.test(key)) return 'title';
  641. if (/(^|:)creator$/.test(key)) return 'author';
  642. if (/creatortool$/.test(key)) return 'creator';
  643. if (/producer$/.test(key)) return 'producer';
  644. if (/(^|:)subject$/.test(key)) return 'subject';
  645. if (/keywords$/.test(key)) return 'keywords';
  646. if (/description$/.test(key)) return 'description';
  647. if (/createdate$/.test(key)) return 'created';
  648. if (/(modifydate|metadatadate)$/.test(key)) return 'modified';
  649. return '';
  650. }
  651. function addPdfInfoFields(fields, info) {
  652. for (const [rawKey, value] of Object.entries(info || {})) {
  653. const text = normalizeValue(value);
  654. if (!text) continue;
  655. if (PDF_INFO_KEY_MAP[rawKey]) addField(fields, PDF_INFO_KEY_MAP[rawKey], text);
  656. addField(fields, `pdf_info:${safeMetadataKey(rawKey)}`, text);
  657. }
  658. }
  659. function addPdfXmpFields(fields, metadata) {
  660. for (const [rawKey, value] of getPdfMetadataEntries(metadata)) {
  661. const text = normalizeValue(value);
  662. if (!text) continue;
  663. const canonical = canonicalPdfXmpKey(rawKey);
  664. if (canonical) addFieldIfAbsent(fields, canonical, text);
  665. addField(fields, `pdf_xmp:${safeMetadataKey(rawKey)}`, text);
  666. }
  667. const raw = typeof metadata?.getRaw === 'function' ? metadata.getRaw() : '';
  668. for (const snippet of collectSignalSnippets(raw, 5)) addListField(fields, 'pdf_xmp:raw_signals', snippet);
  669. }
  670. function decodeUtf16Be(buffer) {
  671. const chars = [];
  672. for (let offset = 0; offset + 1 < buffer.length; offset += 2) {
  673. const code = buffer.readUInt16BE(offset);
  674. if (code) chars.push(String.fromCharCode(code));
  675. }
  676. return chars.join('');
  677. }
  678. function decodePdfStringBuffer(buffer) {
  679. if (buffer.length >= 2 && buffer[0] === 0xfe && buffer[1] === 0xff) return decodeUtf16Be(buffer.subarray(2));
  680. if (buffer.length >= 2 && buffer[0] === 0xff && buffer[1] === 0xfe) return buffer.subarray(2).toString('utf16le');
  681. const utf8 = buffer.toString('utf8').trim();
  682. return utf8 || buffer.toString('latin1').trim();
  683. }
  684. function decodePdfLiteralString(value) {
  685. let text = '';
  686. for (let index = 0; index < value.length; index += 1) {
  687. const char = value[index];
  688. if (char !== '\\') {
  689. text += char;
  690. continue;
  691. }
  692. const next = value[index + 1];
  693. if (!next) continue;
  694. index += 1;
  695. if (next === 'n') text += '\n';
  696. else if (next === 'r') text += '\r';
  697. else if (next === 't') text += '\t';
  698. else if (next === 'b') text += '\b';
  699. else if (next === 'f') text += '\f';
  700. else if (/[0-7]/.test(next)) {
  701. let octal = next;
  702. for (let count = 0; count < 2 && /[0-7]/.test(value[index + 1] || ''); count += 1) octal += value[++index];
  703. text += String.fromCharCode(parseInt(octal, 8));
  704. } else {
  705. text += next;
  706. }
  707. }
  708. return decodePdfStringBuffer(Buffer.from(text, 'latin1'));
  709. }
  710. function decodePdfHexString(value) {
  711. const hex = value.replace(/\s+/g, '');
  712. if (!hex || hex.length % 2 !== 0) return '';
  713. try {
  714. return decodePdfStringBuffer(Buffer.from(hex, 'hex'));
  715. } catch {
  716. return '';
  717. }
  718. }
  719. function addPdfRawFields(fields, buffer) {
  720. const text = buffer.toString('latin1');
  721. const pattern = /\/(Title|Author|Subject|Keywords|Creator|Producer|CreationDate|ModDate)\s*(\((?:\\.|[^\\)]){0,1000}\)|<([0-9a-fA-F\s]{2,2000})>)/g;
  722. let match;
  723. while ((match = pattern.exec(text))) {
  724. const rawKey = match[1];
  725. const rawValue = match[3] ? decodePdfHexString(match[3]) : decodePdfLiteralString(match[2].slice(1, -1));
  726. addListField(fields, `pdf_raw:${safeMetadataKey(rawKey)}`, rawValue);
  727. }
  728. for (const snippet of collectBinarySignalSnippets(buffer)) addListField(fields, 'pdf_raw:signals', snippet);
  729. }
  730. async function extractPdfMetadata(filePath) {
  731. const buffer = await fs.readFile(filePath);
  732. const parser = new PDFParse({ data: buffer });
  733. const fields = new Map();
  734. try {
  735. const result = await parser.getInfo();
  736. const info = result.info || {};
  737. const metadata = result.metadata || null;
  738. addPdfInfoFields(fields, info);
  739. addPdfXmpFields(fields, metadata);
  740. addFieldIfAbsent(fields, 'title', getPdfMetadataValue(metadata, 'dc:title', 'Title'));
  741. addFieldIfAbsent(fields, 'author', getPdfMetadataValue(metadata, 'dc:creator', 'Author'));
  742. addFieldIfAbsent(fields, 'subject', getPdfMetadataValue(metadata, 'dc:subject', 'Subject'));
  743. addFieldIfAbsent(fields, 'keywords', getPdfMetadataValue(metadata, 'pdf:Keywords', 'Keywords'));
  744. addFieldIfAbsent(fields, 'creator', getPdfMetadataValue(metadata, 'xmp:CreatorTool', 'Creator'));
  745. addFieldIfAbsent(fields, 'producer', getPdfMetadataValue(metadata, 'pdf:Producer', 'Producer'));
  746. addFieldIfAbsent(fields, 'created', getPdfMetadataValue(metadata, 'xmp:CreateDate', 'CreationDate'));
  747. addFieldIfAbsent(fields, 'modified', getPdfMetadataValue(metadata, 'xmp:ModifyDate', 'xmp:MetadataDate', 'ModDate'));
  748. addField(fields, 'pages', result.total || result.pages || result.numpages);
  749. addField(fields, 'pdf_version', result.version || info.PDFFormatVersion);
  750. addField(fields, 'fingerprints', result.fingerprints);
  751. addField(fields, 'pdf_permissions', result.permission);
  752. addPdfRawFields(fields, buffer);
  753. addWpsSignalFields(fields);
  754. } finally {
  755. await parser.destroy();
  756. }
  757. return fields;
  758. }
  759. async function extractMetadata(file) {
  760. const fields = new Map();
  761. const stats = await fs.stat(file.file_path);
  762. addField(fields, 'file_name', file.file_name);
  763. addField(fields, 'extension', file.extension);
  764. addField(fields, 'size', file.size || stats.size);
  765. addField(fields, 'created_at', stats.birthtime.toISOString());
  766. addField(fields, 'modified_at', stats.mtime.toISOString());
  767. addField(fields, 'accessed_at', stats.atime.toISOString());
  768. try {
  769. const extension = String(file.extension || '').toLowerCase();
  770. if (extension === '.docx' || ((extension === '.doc' || extension === '.wps') && await hasZipHeader(file.file_path))) {
  771. mergeMetadataFields(fields, await extractDocxMetadata(file.file_path));
  772. } else if (extension === '.doc' || extension === '.wps') {
  773. mergeMetadataFields(fields, await extractLegacyWordMetadata(file.file_path));
  774. } else if (extension === '.pdf') {
  775. mergeMetadataFields(fields, await extractPdfMetadata(file.file_path));
  776. }
  777. } catch (error) {
  778. addField(fields, 'metadata_error', error.message || '元数据读取失败');
  779. }
  780. addDecodedBase64Fields(fields);
  781. return Array.from(fields.entries()).map(([key, value]) => ({
  782. key,
  783. label: getMetadataLabel(key),
  784. value,
  785. normalized: normalizeComparable(value),
  786. date_day: normalizeDateDay(value),
  787. comparable: isComparableKey(key),
  788. date_comparable: isDateComparableKey(key),
  789. }));
  790. }
  791. function buildRows(files) {
  792. const keyOrder = [];
  793. const rowsByKey = new Map();
  794. for (const file of files) {
  795. for (const item of file.metadata || []) {
  796. if (!rowsByKey.has(item.key)) {
  797. keyOrder.push(item.key);
  798. rowsByKey.set(item.key, { key: item.key, label: item.label, values: {}, duplicate_file_ids: [], same_day_file_ids: [] });
  799. }
  800. rowsByKey.get(item.key).values[file.file_id] = item.value;
  801. }
  802. }
  803. for (const key of keyOrder) {
  804. const row = rowsByKey.get(key);
  805. const normalizedToFiles = new Map();
  806. const dayToFiles = new Map();
  807. for (const file of files) {
  808. const item = (file.metadata || []).find((entry) => entry.key === key);
  809. if (!item?.comparable || !item.normalized) continue;
  810. if (item.date_comparable) {
  811. if (!item.date_day) continue;
  812. const list = dayToFiles.get(item.date_day) || [];
  813. list.push(file.file_id);
  814. dayToFiles.set(item.date_day, list);
  815. continue;
  816. }
  817. const list = normalizedToFiles.get(item.normalized) || [];
  818. list.push(file.file_id);
  819. normalizedToFiles.set(item.normalized, list);
  820. }
  821. row.duplicate_file_ids = Array.from(new Set(Array.from(normalizedToFiles.values()).filter((ids) => ids.length > 1).flat()));
  822. row.same_day_file_ids = Array.from(new Set(Array.from(dayToFiles.values()).filter((ids) => ids.length > 1).flat()));
  823. }
  824. return keyOrder.map((key) => rowsByKey.get(key));
  825. }
  826. function stripMarkdownForOutline(markdown) {
  827. return String(markdown || '')
  828. .replace(/!\[[^\]]*\]\([^)]*\)/g, ' ')
  829. .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1')
  830. .replace(/<[^>]+>/g, ' ')
  831. .replace(/[`*_~]/g, '')
  832. .replace(/&nbsp;/gi, ' ')
  833. .replace(/&lt;/gi, '<')
  834. .replace(/&gt;/gi, '>')
  835. .replace(/&amp;/gi, '&');
  836. }
  837. function normalizeOutlineTitle(value) {
  838. return normalizeValue(stripMarkdownForOutline(value))
  839. .replace(/^(?:#{1,6}\s*)/, '')
  840. .replace(/^[-*+>]\s*/, '')
  841. .replace(/^(?:第[一二三四五六七八九十百千万\d]+[章节篇部分]|\d+(?:\.\d+)*[.)、.]?|[一二三四五六七八九十]+[、..]|([一二三四五六七八九十\d]+)|[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳])[\s、..-]*/, '')
  842. .replace(/[\s ]+/g, '')
  843. .replace(/[,。!?;:、“”‘’'"《》〈〉()()\[\]【】{}.,!?;:|/\\_-]+/g, '')
  844. .toLowerCase();
  845. }
  846. function cleanOutlineTitle(value) {
  847. return normalizeValue(stripMarkdownForOutline(value))
  848. .replace(/^(?:#{1,6}\s*)/, '')
  849. .replace(/^[-*+>]\s*/, '')
  850. .replace(/(?:\.{2,}|…{2,}|·{2,}|\s{3,})\s*\d+\s*$/g, '')
  851. .replace(/\s+\d{1,4}\s*$/g, '')
  852. .trim();
  853. }
  854. function splitTenderSentences(markdown) {
  855. const text = stripMarkdownForOutline(markdown)
  856. .replace(/\|/g, '\n')
  857. .replace(/\r?\n/g, '\n')
  858. .replace(/[\t ]+/g, ' ');
  859. const parts = text
  860. .split(/[。!?!?;;\n]+/)
  861. .map((item) => cleanOutlineTitle(item))
  862. .filter(Boolean);
  863. const seen = new Set();
  864. const sentences = [];
  865. for (const part of parts) {
  866. const normalized = normalizeOutlineTitle(part);
  867. if (normalized.length < 6 || normalized.length > 160 || seen.has(normalized)) continue;
  868. seen.add(normalized);
  869. sentences.push({ text: part, normalized });
  870. }
  871. return sentences;
  872. }
  873. function matchTenderSentence(title, tenderSentences) {
  874. const normalized = normalizeOutlineTitle(title);
  875. if (normalized.length < 6) return null;
  876. for (const sentence of tenderSentences) {
  877. if (sentence.normalized === normalized) return sentence;
  878. if (normalized.length >= 10 && sentence.normalized.includes(normalized)) return sentence;
  879. if (sentence.normalized.length >= 10 && normalized.includes(sentence.normalized) && sentence.normalized.length / normalized.length >= 0.8) return sentence;
  880. }
  881. return null;
  882. }
  883. function parseOutlineMarker(line) {
  884. const text = cleanOutlineTitle(line);
  885. const patterns = [
  886. { pattern: /^(?<number>\d+(?:\.\d+)*)(?:[.)、.])?\s*(?<title>.+)$/u },
  887. { pattern: /^(?<number>第[一二三四五六七八九十百千万\d]+[章节篇部分])\s*(?<title>.*)$/u },
  888. { pattern: /^(?<number>[一二三四五六七八九十]+[、..])\s*(?<title>.+)$/u },
  889. { pattern: /^(?<number>([一二三四五六七八九十\d]+))\s*(?<title>.+)$/u },
  890. { pattern: /^(?<number>[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳])\s*(?<title>.+)$/u },
  891. ];
  892. for (const { pattern } of patterns) {
  893. const match = text.match(pattern);
  894. if (!match?.groups) continue;
  895. const number = match.groups.number.trim();
  896. const title = cleanOutlineTitle(match.groups.title || number);
  897. if (!title || normalizeOutlineTitle(title).length < 2) continue;
  898. return { number, title, level: inferOutlineLevel(number) };
  899. }
  900. return null;
  901. }
  902. function inferOutlineLevel(number) {
  903. const marker = String(number || '').trim();
  904. if (/^\d+(?:\.\d+)+/.test(marker)) return marker.split('.').filter(Boolean).length;
  905. if (/^\d+/.test(marker) || /^第.+[章节篇部分]$/.test(marker) || /^[一二三四五六七八九十]+[、..]$/.test(marker)) return 1;
  906. if (/^(.+)$/.test(marker) || /^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]$/.test(marker)) return 2;
  907. return 1;
  908. }
  909. function isCatalogTitleLine(line) {
  910. return /^(?:#{1,6}\s*)?(目录|目次|contents)$/i.test(String(line || '').replace(/\s+/g, ''));
  911. }
  912. function parseCatalogLine(line) {
  913. const raw = cleanOutlineTitle(String(line || '').replace(/^\|+|\|+$/g, '').replace(/\|/g, ' '));
  914. if (!raw || /^[-:|\s]+$/.test(raw) || isCatalogTitleLine(raw)) return null;
  915. const hasPageTrail = /(?:\.{2,}|…{2,}|·{2,}|\s{3,})\s*\d+\s*$/.test(raw) || /\s\d{1,4}$/.test(raw);
  916. const marker = parseOutlineMarker(raw);
  917. if (marker) return marker;
  918. if (!hasPageTrail) return null;
  919. const title = cleanOutlineTitle(raw.replace(/(?:\.{2,}|…{2,}|·{2,}|\s{3,})\s*\d+\s*$/g, '').replace(/\s+\d{1,4}\s*$/g, ''));
  920. return title && normalizeOutlineTitle(title).length >= 2 ? { title, level: 1 } : null;
  921. }
  922. function extractCatalogOutline(markdown) {
  923. const lines = String(markdown || '').split(/\r?\n/);
  924. const start = lines.findIndex(isCatalogTitleLine);
  925. if (start < 0) return [];
  926. const items = [];
  927. let misses = 0;
  928. for (let index = start + 1; index < Math.min(lines.length, start + 180); index += 1) {
  929. const parsed = parseCatalogLine(lines[index]);
  930. if (!parsed) {
  931. if (items.length) misses += 1;
  932. if (misses >= 10) break;
  933. continue;
  934. }
  935. misses = 0;
  936. items.push({ ...parsed, source: 'catalog', confidence: 0.92 });
  937. }
  938. return items;
  939. }
  940. function extractHeadingOutline(markdown) {
  941. const items = [];
  942. const lines = String(markdown || '').split(/\r?\n/);
  943. for (const line of lines) {
  944. const match = line.match(/^(#{1,6})\s+(.+)$/);
  945. if (!match) continue;
  946. const title = cleanOutlineTitle(match[2]);
  947. if (!title || isCatalogTitleLine(title)) continue;
  948. const marker = parseOutlineMarker(title);
  949. items.push({ number: marker?.number, title: marker?.title || title, level: Math.min(match[1].length, 6), source: 'heading', confidence: 0.82 });
  950. }
  951. return items;
  952. }
  953. function extractSemanticOutline(markdown) {
  954. const items = [];
  955. const lines = String(markdown || '').split(/\r?\n/);
  956. for (const line of lines) {
  957. const text = cleanOutlineTitle(line);
  958. if (!text || text.length > 90 || /[。!?;;]$/.test(text) || /^\|/.test(text) || isCatalogTitleLine(text)) continue;
  959. const marker = parseOutlineMarker(text);
  960. const bold = /^\s*\*\*.+\*\*\s*$/.test(line);
  961. if (!marker && !bold) continue;
  962. items.push({ number: marker?.number, title: marker?.title || text, level: marker?.level || 2, source: 'semantic', confidence: marker ? 0.68 : 0.55 });
  963. }
  964. return items.slice(0, 260);
  965. }
  966. function buildOutlineItems(markdown, tenderSentences = []) {
  967. const candidates = [extractCatalogOutline(markdown), extractHeadingOutline(markdown), extractSemanticOutline(markdown)];
  968. const selected = candidates.find((items) => items.length >= 3) || candidates.find((items) => items.length) || [];
  969. const stack = [];
  970. const items = [];
  971. const seen = new Set();
  972. for (const candidate of selected) {
  973. let level = Math.max(1, Math.min(Number(candidate.level) || 1, 6));
  974. if (level > stack.length + 1) level = stack.length + 1;
  975. const title = cleanOutlineTitle(candidate.title);
  976. const normalized = normalizeOutlineTitle(title);
  977. if (!title || normalized.length < 2) continue;
  978. const key = `${level}:${normalized}`;
  979. if (seen.has(key)) continue;
  980. seen.add(key);
  981. stack.splice(level - 1);
  982. const parent = stack[level - 2] || null;
  983. const pathTitles = [...(parent?.path_titles || []), title];
  984. const matched = matchTenderSentence(title, tenderSentences);
  985. const item = {
  986. id: `O${String(items.length + 1).padStart(5, '0')}`,
  987. level,
  988. number: candidate.number,
  989. title,
  990. normalized_title: normalized,
  991. path_titles: pathTitles,
  992. normalized_path: pathTitles.map(normalizeOutlineTitle).filter(Boolean).join('>'),
  993. source: candidate.source,
  994. confidence: candidate.confidence,
  995. order: items.length,
  996. parent_id: parent?.id,
  997. from_tender: Boolean(matched),
  998. matched_tender_sentence: matched?.text,
  999. duplicate_group_ids: [],
  1000. similar_group_ids: [],
  1001. };
  1002. items.push(item);
  1003. stack[level - 1] = item;
  1004. }
  1005. return { items, source: selected[0]?.source, confidence: selected.length ? Number((selected.reduce((sum, item) => sum + item.confidence, 0) / selected.length).toFixed(2)) : 0 };
  1006. }
  1007. function intersectSize(a, b) {
  1008. let count = 0;
  1009. for (const item of a) if (b.has(item)) count += 1;
  1010. return count;
  1011. }
  1012. function bigramSimilarity(a, b) {
  1013. const left = String(a || '');
  1014. const right = String(b || '');
  1015. if (!left || !right) return 0;
  1016. if (left === right) return 1;
  1017. const toBigrams = (value) => {
  1018. const chars = Array.from(value);
  1019. if (chars.length <= 1) return new Set(chars);
  1020. return new Set(chars.slice(0, -1).map((char, index) => `${char}${chars[index + 1]}`));
  1021. };
  1022. const leftSet = toBigrams(left);
  1023. const rightSet = toBigrams(right);
  1024. const shared = intersectSize(leftSet, rightSet);
  1025. return (2 * shared) / (leftSet.size + rightSet.size || 1);
  1026. }
  1027. function lcsSimilarity(left, right) {
  1028. if (!left.length || !right.length) return 0;
  1029. const dp = Array.from({ length: left.length + 1 }, () => Array(right.length + 1).fill(0));
  1030. for (let i = 1; i <= left.length; i += 1) {
  1031. for (let j = 1; j <= right.length; j += 1) {
  1032. dp[i][j] = left[i - 1] === right[j - 1] ? dp[i - 1][j - 1] + 1 : Math.max(dp[i - 1][j], dp[i][j - 1]);
  1033. }
  1034. }
  1035. return dp[left.length][right.length] / Math.max(left.length, right.length);
  1036. }
  1037. function riskFromScore(score) {
  1038. if (score >= 0.75) return 'high';
  1039. if (score >= 0.55) return 'medium';
  1040. if (score >= 0.35) return 'low';
  1041. return 'none';
  1042. }
  1043. function buildOutlineComparison(files) {
  1044. const groups = [];
  1045. const byTitle = new Map();
  1046. const byPath = new Map();
  1047. const successful = files.filter((file) => file.status === 'success');
  1048. for (const file of successful) {
  1049. for (const item of file.items || []) {
  1050. if (item.from_tender) continue;
  1051. const titleList = byTitle.get(item.normalized_title) || [];
  1052. titleList.push({ file, item });
  1053. byTitle.set(item.normalized_title, titleList);
  1054. const pathList = byPath.get(item.normalized_path) || [];
  1055. pathList.push({ file, item });
  1056. byPath.set(item.normalized_path, pathList);
  1057. }
  1058. }
  1059. function addGroup(type, entries, title, score) {
  1060. const fileIds = Array.from(new Set(entries.map((entry) => entry.file.file_id)));
  1061. if (fileIds.length < 2) return null;
  1062. const id = `G${String(groups.length + 1).padStart(4, '0')}`;
  1063. const group = { id, type, title, score, file_ids: fileIds, item_ids: {}, paths: {} };
  1064. for (const entry of entries) {
  1065. group.item_ids[entry.file.file_id] = [...(group.item_ids[entry.file.file_id] || []), entry.item.id];
  1066. group.paths[entry.file.file_id] = [...(group.paths[entry.file.file_id] || []), entry.item.path_titles.join(' > ')];
  1067. if (type === 'duplicate') entry.item.duplicate_group_ids.push(id);
  1068. else entry.item.similar_group_ids.push(id);
  1069. }
  1070. groups.push(group);
  1071. return group;
  1072. }
  1073. for (const entries of byPath.values()) addGroup('duplicate', entries, entries[0]?.item.path_titles.join(' > ') || entries[0]?.item.title || '', 1);
  1074. for (const entries of byTitle.values()) {
  1075. const alreadyGrouped = entries.every((entry) => entry.item.duplicate_group_ids.length);
  1076. if (!alreadyGrouped) addGroup('duplicate', entries, entries[0]?.item.title || '', 0.95);
  1077. }
  1078. const seenSimilar = new Set();
  1079. for (let i = 0; i < successful.length; i += 1) {
  1080. for (let j = i + 1; j < successful.length; j += 1) {
  1081. for (const left of successful[i].items.filter((item) => !item.from_tender && !item.duplicate_group_ids.length)) {
  1082. for (const right of successful[j].items.filter((item) => !item.from_tender && !item.duplicate_group_ids.length && Math.abs(item.level - left.level) <= 1)) {
  1083. const score = bigramSimilarity(left.normalized_title, right.normalized_title);
  1084. if (score < 0.86) continue;
  1085. const key = [successful[i].file_id, left.id, successful[j].file_id, right.id].join(':');
  1086. if (seenSimilar.has(key)) continue;
  1087. seenSimilar.add(key);
  1088. addGroup('similar', [{ file: successful[i], item: left }, { file: successful[j], item: right }], left.title, Number(score.toFixed(2)));
  1089. }
  1090. }
  1091. }
  1092. }
  1093. const pairwiseSimilarities = [];
  1094. for (let i = 0; i < successful.length; i += 1) {
  1095. for (let j = i + 1; j < successful.length; j += 1) {
  1096. const leftItems = successful[i].items.filter((item) => !item.from_tender);
  1097. const rightItems = successful[j].items.filter((item) => !item.from_tender);
  1098. const leftTitles = new Set(leftItems.map((item) => item.normalized_title));
  1099. const rightTitles = new Set(rightItems.map((item) => item.normalized_title));
  1100. const leftPaths = new Set(leftItems.map((item) => item.normalized_path));
  1101. const rightPaths = new Set(rightItems.map((item) => item.normalized_path));
  1102. const titleShared = intersectSize(leftTitles, rightTitles);
  1103. const pathShared = intersectSize(leftPaths, rightPaths);
  1104. const titleOverlap = titleShared / Math.max(Math.min(leftTitles.size, rightTitles.size), 1);
  1105. const pathOverlap = pathShared / Math.max(Math.min(leftPaths.size, rightPaths.size), 1);
  1106. const orderSimilarity = lcsSimilarity(leftItems.map((item) => item.normalized_title), rightItems.map((item) => item.normalized_title));
  1107. const score = Number((pathOverlap * 0.45 + titleOverlap * 0.35 + orderSimilarity * 0.2).toFixed(2));
  1108. pairwiseSimilarities.push({
  1109. file_a_id: successful[i].file_id,
  1110. file_b_id: successful[j].file_id,
  1111. score,
  1112. title_overlap: Number(titleOverlap.toFixed(2)),
  1113. path_overlap: Number(pathOverlap.toFixed(2)),
  1114. order_similarity: Number(orderSimilarity.toFixed(2)),
  1115. shared_count: Math.max(titleShared, pathShared),
  1116. risk: riskFromScore(score),
  1117. });
  1118. }
  1119. }
  1120. return { duplicateGroups: groups.sort((a, b) => b.score - a.score || b.file_ids.length - a.file_ids.length), pairwiseSimilarities };
  1121. }
  1122. function stripImagesFromMarkdown(markdown) {
  1123. return String(markdown || '')
  1124. .replace(markdownImagePattern, ' ')
  1125. .replace(htmlImageSrcPattern, ' ')
  1126. .replace(htmlImagePattern, ' ');
  1127. }
  1128. function codePointToString(value, fallback) {
  1129. try {
  1130. const codePoint = Number.parseInt(value, 10);
  1131. return Number.isFinite(codePoint) ? String.fromCodePoint(codePoint) : fallback;
  1132. } catch {
  1133. return fallback;
  1134. }
  1135. }
  1136. function hexCodePointToString(value, fallback) {
  1137. try {
  1138. const codePoint = Number.parseInt(value, 16);
  1139. return Number.isFinite(codePoint) ? String.fromCodePoint(codePoint) : fallback;
  1140. } catch {
  1141. return fallback;
  1142. }
  1143. }
  1144. function decodeBasicHtmlEntities(value) {
  1145. return String(value || '')
  1146. .replace(/&nbsp;/gi, ' ')
  1147. .replace(/&lt;/gi, '<')
  1148. .replace(/&gt;/gi, '>')
  1149. .replace(/&quot;/gi, '"')
  1150. .replace(/&apos;/gi, "'")
  1151. .replace(/&amp;/gi, '&')
  1152. .replace(/&#x([0-9a-f]+);/gi, (match, hex) => hexCodePointToString(hex, match))
  1153. .replace(/&#(\d+);/g, (match, code) => codePointToString(code, match));
  1154. }
  1155. function normalizeContentLineBreaks(value) {
  1156. return String(value || '').replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  1157. }
  1158. function addContentTextBlock(blocks, value) {
  1159. const text = cleanContentSentence(decodeBasicHtmlEntities(value));
  1160. for (const line of normalizeContentLineBreaks(text).split(/\n+/)) {
  1161. const cleaned = cleanContentSentence(line);
  1162. if (cleaned) blocks.push(cleaned);
  1163. }
  1164. }
  1165. function extractHtmlCellTextBlocks($, cell) {
  1166. const blocks = [];
  1167. const node = $(cell).clone();
  1168. node.find('img').remove();
  1169. node.find('br').replaceWith('\n');
  1170. node.find('p, li, h1, h2, h3, h4, h5, h6, blockquote, div').each((_, element) => {
  1171. const block = $(element).clone();
  1172. block.find('img').remove();
  1173. block.find('br').replaceWith('\n');
  1174. addContentTextBlock(blocks, block.text());
  1175. $(element).remove();
  1176. });
  1177. addContentTextBlock(blocks, node.text());
  1178. return blocks;
  1179. }
  1180. function extractHtmlTableTextBlocks(tableHtml) {
  1181. const $ = cheerio.load(tableHtml, { decodeEntities: false });
  1182. const blocks = [];
  1183. $('tr').each((_, row) => {
  1184. $(row).children('th, td').each((__, cell) => {
  1185. for (const block of extractHtmlCellTextBlocks($, cell)) {
  1186. addContentTextBlock(blocks, block);
  1187. }
  1188. });
  1189. });
  1190. if (!blocks.length) addContentTextBlock(blocks, $.root().text());
  1191. return blocks;
  1192. }
  /**
   * Splits one markdown table line into trimmed cell strings.
   *
   * One leading `|` and one trailing unescaped `|` are dropped first. A `|` preceded by
   * a backslash does not separate cells and is returned as a plain `|`.
   *
   * @param {*} line - Raw line; falsy values are treated as an empty string.
   * @returns {string[]} The cells; always at least one element.
   */
  function splitMarkdownTableRow(line) {
    let rowText = String(line || '').trim();
    if (rowText.startsWith('|')) rowText = rowText.slice(1);
    const closesWithBarePipe = rowText.endsWith('|') && !rowText.endsWith('\\|');
    if (closesWithBarePipe) rowText = rowText.slice(0, -1);

    const finishCell = (raw) => raw.replace(/\\\|/g, '|').trim();
    const cells = [];
    let buffer = '';
    let afterBackslash = false;
    for (const ch of rowText) {
      if (!afterBackslash && ch === '\\') {
        // Keep the backslash for now; finishCell removes it when it escapes a pipe.
        afterBackslash = true;
        buffer += ch;
      } else if (!afterBackslash && ch === '|') {
        cells.push(finishCell(buffer));
        buffer = '';
      } else {
        buffer += ch;
        afterBackslash = false;
      }
    }
    cells.push(finishCell(buffer));
    return cells;
  }
  /**
   * Tells whether a line is a markdown table separator row: more than one cell, and
   * every cell (ignoring whitespace) is two or more dashes with an optional colon on
   * either side.
   *
   * @param {*} line - Raw line.
   * @returns {boolean} True when the line is a separator row.
   */
  function isMarkdownTableSeparator(line) {
    const cells = splitMarkdownTableRow(line);
    if (cells.length <= 1) return false;
    for (const cell of cells) {
      const compact = cell.replace(/\s+/g, '');
      if (!/^:?-{2,}:?$/.test(compact)) return false;
    }
    return true;
  }
  /**
   * Tells whether a line splits into more than one table cell.
   *
   * @param {*} line - Raw line.
   * @returns {boolean} True when splitMarkdownTableRow yields at least two cells.
   */
  function isMarkdownTableRow(line) {
    const { length: cellCount } = splitMarkdownTableRow(line);
    return cellCount > 1;
  }
  /**
   * Reduces a markdown/HTML fragment to plain text and decodes HTML entities last.
   *
   * The rewrites run strictly in the listed order: matches of the three module-level
   * image patterns become a space, inline code and links keep only their inner text,
   * <br> and the listed closing block tags become newlines, any remaining tag becomes
   * a space, and `**`, `__` and `~~` wrappers are removed.
   *
   * @param {*} value - Raw fragment; falsy values are treated as an empty string.
   * @returns {string} Whatever decodeBasicHtmlEntities returns for the rewritten text.
   */
  function cleanMarkdownInlineText(value) {
    const rewrites = [
      [markdownImagePattern, ' '],
      [htmlImageSrcPattern, ' '],
      [htmlImagePattern, ' '],
      [/`([^`]+)`/g, '$1'],
      [/\[([^\]]+)\]\([^)]*\)/g, '$1'],
      // Line-breaking tags must become "\n" before the generic tag stripper sees them.
      [/<br\s*\/?\s*>/gi, '\n'],
      [/<\/(?:p|div|li|h[1-6]|blockquote|section|article)>/gi, '\n'],
      [/<[^>]+>/g, ' '],
      [/\*\*([^*\n]+)\*\*/g, '$1'],
      [/__([^_\n]+)__/g, '$1'],
      [/~~([^~\n]+)~~/g, '$1'],
    ];
    let text = String(value || '');
    for (const [pattern, replacement] of rewrites) {
      text = text.replace(pattern, replacement);
    }
    return decodeBasicHtmlEntities(text);
  }
  /**
   * Cleans a single markdown line: inline markup is removed via cleanMarkdownInlineText,
   * then a leading heading marker (`#` to `######`) and a leading list or quote marker
   * (`-`, `*`, `+`, `>`) are dropped, runs of tabs/spaces collapse to one space and the
   * result is trimmed.
   *
   * @param {*} value - Raw markdown line.
   * @returns {string} The cleaned line, possibly empty.
   */
  function cleanMarkdownLine(value) {
    const inline = cleanMarkdownInlineText(value);
    const withoutHeading = inline.replace(/^\s{0,3}#{1,6}\s+/, '');
    const withoutMarker = withoutHeading.replace(/^\s*(?:[-*+]|>)\s+/, '');
    return withoutMarker.replace(/[\t ]+/g, ' ').trim();
  }
  /**
   * Turns markdown into an ordered list of text blocks.
   *
   * Fenced code (``` ... ```) is removed first. A line that splits into table cells and
   * is followed by a separator line starts a table: every cell of every non-separator
   * row becomes its own block. Other lines are cleaned; a line that looks like a
   * heading, list/quote item, numbered item, short "label:" field, or that ends with
   * sentence punctuation is emitted on its own, while all remaining lines are joined
   * with spaces into a paragraph that is flushed at a blank line, a table, a standalone
   * line, or the end of input.
   *
   * @param {*} markdown - Markdown source; falsy values are treated as an empty string.
   * @returns {string[]} Text blocks in document order.
   */
  function extractMarkdownTextBlocks(markdown) {
    const withoutFences = String(markdown || '').replace(/```[\s\S]*?```/g, '\n');
    const lines = normalizeContentLineBreaks(withoutFences).split('\n');
    const blocks = [];
    const pending = [];

    const flushPending = () => {
      if (pending.length === 0) return;
      addContentTextBlock(blocks, pending.join(' '));
      pending.length = 0;
    };

    let cursor = 0;
    while (cursor < lines.length) {
      const rawLine = lines[cursor];
      const startsTable = cursor + 1 < lines.length
        && isMarkdownTableRow(rawLine)
        && isMarkdownTableSeparator(lines[cursor + 1]);

      if (startsTable) {
        flushPending();
        const tableRows = [splitMarkdownTableRow(rawLine)];
        // Skip the header and its separator, then consume rows until the table ends.
        cursor += 2;
        while (cursor < lines.length && isMarkdownTableRow(lines[cursor])) {
          if (!isMarkdownTableSeparator(lines[cursor])) tableRows.push(splitMarkdownTableRow(lines[cursor]));
          cursor += 1;
        }
        tableRows.flat().forEach((cell) => {
          addContentTextBlock(blocks, cleanMarkdownInlineText(cell));
        });
        continue;
      }

      cursor += 1;
      const cleaned = cleanMarkdownLine(rawLine);
      if (!cleaned) {
        flushPending();
        continue;
      }

      const standalone = /^\s{0,3}#{1,6}\s+/.test(rawLine)
        || /^\s*(?:[-*+]|>)\s+/.test(rawLine)
        || /^\s*(?:\d+(?:\.\d+)*[.)、.]|[一二三四五六七八九十]+[、..]|([一二三四五六七八九十\d]+)|[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳])\s+/.test(rawLine);
      const fieldLine = /^[^::\s]{1,18}[::]/.test(cleaned);
      const sentenceLine = /[。!?!?;;]$/.test(cleaned);

      if (!(standalone || fieldLine || sentenceLine)) {
        pending.push(cleaned);
        continue;
      }
      flushPending();
      addContentTextBlock(blocks, cleaned);
    }

    flushPending();
    return blocks;
  }
  /**
   * Extracts all text blocks of a document, handling embedded HTML tables separately.
   *
   * After stripImagesFromMarkdown, every match of `htmlTablePattern` is replaced by a
   * numbered placeholder token and its text blocks are stored aside. The remaining text
   * is split on the tokens: token chunks yield the stored table blocks, every other
   * chunk goes through extractMarkdownTextBlocks.
   *
   * @param {*} markdown - Document markdown.
   * @returns {string[]} Text blocks in document order.
   */
  function extractContentTextBlocks(markdown) {
    const tableBlocks = [];
    const withMarkers = stripImagesFromMarkdown(markdown).replace(htmlTablePattern, (tableHtml) => {
      // push returns the new length, so the slot is the index just written.
      const slot = tableBlocks.push(extractHtmlTableTextBlocks(tableHtml)) - 1;
      return `\n\n${contentTableTokenPrefix}${slot}\n\n`;
    });

    // The capture group makes split() keep the tokens as chunks of their own.
    const splitter = new RegExp(`(${contentTableTokenPrefix}\\d+)`, 'g');
    const wholeToken = new RegExp(`^${contentTableTokenPrefix}(\\d+)$`);
    return withMarkers.split(splitter).flatMap((chunk) => {
      const tokenMatch = chunk.match(wholeToken);
      if (!tokenMatch) return extractMarkdownTextBlocks(chunk);
      return tableBlocks[Number(tokenMatch[1])] || [];
    });
  }
  /**
   * Builds the comparison form of a sentence: leading numbering is removed via
   * stripLeadingContentSequence, a leading BOM and the listed control / zero-width /
   * bidi characters are deleted, whitespace runs collapse to one space, and the result
   * is trimmed.
   *
   * @param {*} value - Sentence text; falsy values are treated as an empty string.
   * @returns {string} The normalised sentence, possibly empty.
   */
  function normalizeContentSentence(value) {
    const unnumbered = stripLeadingContentSequence(String(value || ''));
    const visibleOnly = unnumbered
      .replace(/^\uFEFF/, '')
      .replace(/[\u0000-\u001f\u007f-\u009f\u200b-\u200f\u202a-\u202e\ufeff]/g, '');
    return visibleOnly.replace(/[\s ]+/g, ' ').trim();
  }
  /**
   * Removes list/section numbering from the start of a sentence.
   *
   * The patterns are tried in order; as soon as one changes the text, the remainder is
   * left-trimmed and matching restarts from the first pattern, so stacked prefixes are
   * all removed. The loop ends when no pattern changes the text.
   *
   * NOTE(review): several character classes list the same character twice (`[..]`,
   * `[))、]`, `[((]`) and the fifth pattern wraps its alternatives in a bare capture
   * group. This looks like full-width punctuation that was normalised to ASCII
   * somewhere — confirm against the canonical copy of this file.
   *
   * @param {*} value - Sentence text; falsy values are treated as an empty string.
   * @returns {string} The text without leading numbering.
   */
  function stripLeadingContentSequence(value) {
    const prefixPatterns = [
      /^\s*[\d0-9]+(?:\\?[..][\d0-9]+)*\s*(?:\\?[..]|[))、])\s*/u,
      /^\s*[\d0-9]+(?:\\?[..][\d0-9]+)*\s+(?=[A-Za-z\u4e00-\u9fff((])/u,
      /^\s*\((?:[\d0-9]+(?:\\?[..][\d0-9]+)*|[一二三四五六七八九十百千万]+)\)\s*(?:\\?[..]|[、])?\s*/u,
      /^\s*[一二三四五六七八九十百千万]+\s*(?:\\?[..]|[、))])\s*/u,
      /^\s*((?:[一二三四五六七八九十百千万]+|[\d0-9]+(?:\\?[..][\d0-9]+)*))\s*(?:\\?[..]|[、])?\s*/u,
      /^\s*[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]\s*(?:\\?[..]|[、])?\s*/u,
      /^\s*第(?:[\d0-9]+|[一二三四五六七八九十百千万]+)[章节篇部分卷]\s*/u,
    ];

    let remaining = String(value || '').trim();
    let patternIndex = 0;
    while (patternIndex < prefixPatterns.length) {
      const stripped = remaining.replace(prefixPatterns[patternIndex], '');
      if (stripped === remaining) {
        patternIndex += 1;
      } else {
        remaining = stripped.trimStart();
        // Something was removed: start over so earlier patterns get another chance.
        patternIndex = 0;
      }
    }
    return remaining;
  }
  /**
   * Produces the display form of a sentence: a leading BOM and the listed control /
   * zero-width / bidi characters are removed, runs of tabs and spaces collapse to a
   * single space, and the result is trimmed.
   *
   * @param {*} value - Sentence text; falsy values are treated as an empty string.
   * @returns {string} The cleaned sentence, possibly empty.
   */
  function cleanContentSentence(value) {
    return String(value || '')
      .replace(/^\uFEFF/, '')
      // Tabs fall inside the control range removed on the next line, so they have to
      // become spaces first; otherwise words separated only by a tab are glued
      // together and the tab in the `[\t ]+` collapse below can never match.
      .replace(/\t+/g, ' ')
      .replace(/[\u0000-\u001f\u007f-\u009f\u200b-\u200f\u202a-\u202e\ufeff]/g, '')
      .replace(/[\t ]+/g, ' ')
      .replace(/[ ]+/g, ' ')
      .trim();
  }
  /**
   * Splits one text block into sentence-like parts.
   *
   * A part always ends at a strong terminator (`。`, `!`, `?`). A semicolon ends a part
   * only when the part so far holds at least 20 non-whitespace characters. Terminators
   * stay attached to the part they close; trailing text without a terminator becomes
   * the last part.
   *
   * @param {*} block - Block text; it is passed through cleanContentSentence first.
   * @returns {string[]} The parts in order; empty when the cleaned block is empty.
   */
  function splitContentBlockSentences(block) {
    const text = cleanContentSentence(block);
    if (!text) return [];
    const parts = [];
    let start = 0;
    // Running count of non-whitespace characters in the current part. Re-slicing the
    // part on every character made this loop quadratic on long unterminated blocks.
    let visibleLength = 0;
    for (let index = 0; index < text.length; index += 1) {
      const char = text[index];
      if (!/\s/.test(char)) visibleLength += 1;
      const strongBoundary = /[。!?!?]/.test(char);
      const clauseBoundary = /[;;]/.test(char) && visibleLength >= 20;
      if (strongBoundary || clauseBoundary) {
        parts.push(text.slice(start, index + 1));
        start = index + 1;
        visibleLength = 0;
      }
    }
    if (start < text.length) parts.push(text.slice(start));
    return parts;
  }
  /**
   * Decides whether a sentence carries enough content to be worth comparing.
   *
   * Whitespace is ignored. Rejected: empty or digits-only text, and text with fewer
   * than four letters/digits/CJK ideographs. Accepted: anything of 12+ characters.
   * Text of 6-11 characters is accepted only if it contains a colon plus two
   * consecutive letters/ideographs, or an ideograph plus one of the listed
   * unit/measure tokens.
   *
   * @param {*} sentence - Sentence text.
   * @returns {boolean} True when the sentence should be kept.
   */
  function isInformativeContentSentence(sentence) {
    const compact = String(sentence || '').replace(/\s+/g, '');
    if (compact === '' || /^\d+$/.test(compact)) return false;

    const contentChars = compact.match(/[A-Za-z0-9\u4e00-\u9fff]/g) || [];
    if (contentChars.length < 4) return false;
    if (compact.length >= 12) return true;
    if (compact.length < 6) return false;

    const looksLikeField = /[::]/.test(compact) && /[A-Za-z\u4e00-\u9fff]{2,}/.test(compact);
    if (looksLikeField) return true;

    return /[\u4e00-\u9fff]/.test(compact)
      && /(?:日历天|个月|万元|GHz|MHz|GB|MB|kg|mm|cm|天|年|元|%|%)/i.test(compact);
  }
  /**
   * Splits a document into comparable sentences.
   *
   * Each text block is cut into parts; every part is cleaned for display and normalised
   * for comparison. Parts whose normalised form is empty or not informative are
   * dropped. The display text is cut to 600 characters plus "..." when longer; the
   * normalised text is kept in full.
   *
   * @param {*} markdown - Document markdown.
   * @returns {{sentence: string, normalized: string}[]} Sentences in document order.
   */
  function splitContentSentences(markdown) {
    const collected = [];
    extractContentTextBlocks(markdown).forEach((block) => {
      splitContentBlockSentences(block).forEach((part) => {
        const sentence = cleanContentSentence(part);
        const normalized = normalizeContentSentence(sentence);
        if (!normalized || !isInformativeContentSentence(normalized)) return;
        const display = sentence.length > 600 ? `${sentence.slice(0, 600)}...` : sentence;
        collected.push({ sentence: display, normalized });
      });
    });
    return collected;
  }
  /**
   * Builds the ordered list of sentences that occur in more than one file.
   *
   * Order: more files first, then longer sentence text, then lower `first_order`. Each
   * returned entry is a shallow copy with an `id` of the form `S000001`.
   *
   * @param {Map<string, object>} globalSentences - Entries with `file_ids`, `sentence`
   *   and `first_order`.
   * @returns {object[]} Entries shared by at least two files, with ids assigned.
   */
  function buildDuplicateSentences(globalSentences) {
    const shared = [];
    for (const entry of globalSentences.values()) {
      if (entry.file_ids.length > 1) shared.push(entry);
    }
    shared.sort((left, right) => {
      const byFileCount = right.file_ids.length - left.file_ids.length;
      if (byFileCount) return byFileCount;
      const bySentenceLength = right.sentence.length - left.sentence.length;
      if (bySentenceLength) return bySentenceLength;
      return left.first_order - right.first_order;
    });
    return shared.map((entry, position) => ({
      ...entry,
      id: `S${String(position + 1).padStart(6, '0')}`,
    }));
  }
  /**
   * Lists the image targets referenced in a document: first every `target` group
   * matched by `markdownImagePattern` (trimmed, with a leading `<` and trailing `>`
   * removed), then every `src` group matched by `htmlImageSrcPattern` (trimmed). Empty
   * targets are skipped.
   *
   * @param {*} markdown - Document markdown; falsy values are treated as empty.
   * @returns {string[]} Targets, markdown images first, each group in match order.
   */
  function extractImageTargets(markdown) {
    const text = String(markdown || '');
    const markdownTargets = Array.from(
      text.matchAll(markdownImagePattern),
      (match) => String(match.groups?.target || '').trim().replace(/^<|>$/g, ''),
    );
    const htmlTargets = Array.from(
      text.matchAll(htmlImageSrcPattern),
      (match) => String(match.groups?.src || '').trim(),
    );
    return [...markdownTargets, ...htmlTargets].filter(Boolean);
  }
  /**
   * Tells whether `targetPath` is `baseDir` itself or lies somewhere below it.
   *
   * @param {string} baseDir - Directory to test against.
   * @param {string} targetPath - Path to classify.
   * @returns {boolean} True when the target does not leave the base directory.
   */
  function isPathInsideDirectory(baseDir, targetPath) {
    const relative = path.relative(baseDir, targetPath);
    if (relative === '') return true;
    // Only a real parent hop ("..", or ".." followed by a separator) leaves the base.
    // A plain startsWith('..') would also reject entries whose name merely begins with
    // two dots, such as "..cache/file.png".
    const escapesBase = relative === '..' || relative.startsWith(`..${path.sep}`);
    // path.relative can return an absolute path (e.g. another drive on Windows).
    return !escapesBase && !path.isAbsolute(relative);
  }
  /**
   * Maps an asset URL to a file path inside one of the two known image directories.
   *
   * The URL host selects the root (`generated-images` or `imported-images`); the
   * percent-decoded URL path is resolved against that root. Unknown hosts, empty
   * paths, the root itself and anything resolving outside the root yield `''`.
   *
   * @param {*} app - Passed through to the directory getters.
   * @param {string} value - Asset URL.
   * @returns {string} Absolute file path, or `''` when the URL cannot be mapped.
   * @throws {TypeError} When `value` is not a parseable URL.
   * @throws {URIError} When the URL path contains malformed percent-encoding.
   */
  function resolveAssetPath(app, value) {
    const url = new URL(value);
    const roots = {
      'generated-images': getGeneratedImagesDir(app),
      'imported-images': getImportedImagesDir(app),
    };
    // Own-property check: a host such as "constructor" or "__proto__" must not pick up
    // a value inherited from Object.prototype and be treated as a root directory.
    if (!Object.prototype.hasOwnProperty.call(roots, url.hostname)) return '';
    const rootDir = roots[url.hostname];
    if (!rootDir) return '';
    const relativePath = decodeURIComponent(url.pathname.replace(/^\/+/, ''));
    if (!relativePath) return '';
    const baseDir = path.resolve(rootDir);
    const filePath = path.resolve(baseDir, relativePath);
    // Reject traversal out of the root as well as the root directory itself.
    return isPathInsideDirectory(baseDir, filePath) && filePath !== baseDir ? filePath : '';
  }
  /**
   * Loads the bytes behind an image target.
   *
   * Supported targets: base64 `data:image/...` URLs (decoded in memory),
   * `yibiao-asset://` URLs (resolved through resolveAssetPath and read from disk) and
   * `file://` URLs (read from disk). Anything else, including an empty target or an
   * asset URL that cannot be resolved, yields `null`.
   *
   * @param {*} app - Passed through to resolveAssetPath.
   * @param {*} target - Image target as found in the document.
   * @returns {Promise<Buffer|null>} The image bytes, or `null` when unsupported.
   */
  async function readImageTargetBuffer(app, target) {
    const value = String(target || '').trim();
    if (!value) return null;

    const inlineData = value
      .match(/^data:image\/[^;]+;base64,(?<data>[A-Za-z0-9+/=\s]+)$/i)
      ?.groups?.data;
    // Whitespace is allowed inside the payload and removed before decoding.
    if (inlineData) return Buffer.from(inlineData.replace(/\s+/g, ''), 'base64');

    const isAppAsset = /^yibiao-asset:\/\//i.test(value);
    if (isAppAsset) {
      const filePath = resolveAssetPath(app, value);
      if (!filePath) return null;
      return fs.readFile(filePath);
    }

    return /^file:\/\//i.test(value) ? fs.readFile(new URL(value)) : null;
  }
  /**
   * Builds the ordered list of images that occur in more than one file.
   *
   * Order: more files first, then higher total occurrence count. Each returned entry
   * is a shallow copy with an `id` of the form `I000001`.
   *
   * @param {Map<string, object>} globalImages - Entries with `file_ids` and an
   *   `occurrences` object holding per-file counts.
   * @returns {object[]} Entries shared by at least two files, with ids assigned.
   */
  function buildDuplicateImages(globalImages) {
    const totalOccurrences = (item) => (
      Object.values(item.occurrences).reduce((sum, count) => sum + count, 0)
    );
    const shared = [...globalImages.values()].filter((item) => item.file_ids.length > 1);
    shared.sort((left, right) => {
      const byFileCount = right.file_ids.length - left.file_ids.length;
      return byFileCount || totalOccurrences(right) - totalOccurrences(left);
    });
    return shared.map((item, position) => ({
      ...item,
      id: `I${String(position + 1).padStart(6, '0')}`,
    }));
  }
  /**
   * Creates the starting state of the metadata analysis for a new run.
   *
   * Content extraction starts as `running` with a total of 0; metadata extraction is
   * `running` when there are bid files and `success` otherwise.
   *
   * @param {string} signature - Signature of the file set being analysed.
   * @param {Array} bidFiles - Bid files of the run.
   * @returns {object} Fresh metadata-analysis state.
   */
  function createInitialAnalysis(signature, bidFiles) {
    const { length: total } = bidFiles;
    const startedAt = now();
    const updatedAt = now();
    const metadataStatus = total ? 'running' : 'success';
    return {
      status: 'running',
      progress: 0,
      message: '正在启动元数据分析',
      signature,
      started_at: startedAt,
      updated_at: updatedAt,
      contentExtraction: { status: 'running', completed: 0, total: 0 },
      metadataExtraction: { status: metadataStatus, completed: 0, total },
      files: [],
      rows: [],
      contentFiles: [],
      logs: [],
    };
  }
  /**
   * Creates the starting state of the outline analysis for a new run.
   *
   * The analysis starts as `pending`; its extraction step is `pending` when there are
   * bid files and `success` otherwise.
   *
   * @param {string} signature - Signature of the file set being analysed.
   * @param {Array} bidFiles - Bid files of the run.
   * @returns {object} Fresh outline-analysis state.
   */
  function createInitialOutlineAnalysis(signature, bidFiles) {
    const startedAt = now();
    const updatedAt = now();
    const total = bidFiles.length;
    const extraction = { status: total ? 'pending' : 'success', completed: 0, total };
    return {
      status: 'pending',
      progress: 0,
      message: '等待元数据提取完成后开始目录分析',
      signature,
      started_at: startedAt,
      updated_at: updatedAt,
      tenderSentenceCount: 0,
      tenderMatchedItemCount: 0,
      extraction,
      files: [],
      duplicateGroups: [],
      pairwiseSimilarities: [],
    };
  }
  /**
   * Creates the starting state of the body-text comparison for a new run.
   *
   * The analysis starts as `pending`; its extraction step is `pending` when there are
   * bid files and `success` otherwise. All counters start at zero.
   *
   * @param {string} signature - Signature of the file set being analysed.
   * @param {Array} bidFiles - Bid files of the run.
   * @returns {object} Fresh content-analysis state.
   */
  function createInitialContentAnalysis(signature, bidFiles) {
    const startedAt = now();
    const updatedAt = now();
    const total = bidFiles.length;
    const extraction = { status: total ? 'pending' : 'success', completed: 0, total };
    return {
      status: 'pending',
      progress: 0,
      message: '等待正文内容提取完成后开始正文比对',
      signature,
      started_at: startedAt,
      updated_at: updatedAt,
      tenderSentenceCount: 0,
      tenderMatchedSentenceCount: 0,
      totalSentenceCount: 0,
      extraction,
      duplicateSentences: [],
    };
  }
  /**
   * Creates the starting state of the image comparison for a new run.
   *
   * The analysis starts as `pending`; its extraction step is `pending` when there are
   * bid files and `success` otherwise.
   *
   * @param {string} signature - Signature of the file set being analysed.
   * @param {Array} bidFiles - Bid files of the run.
   * @returns {object} Fresh image-analysis state.
   */
  function createInitialImageAnalysis(signature, bidFiles) {
    const startedAt = now();
    const updatedAt = now();
    const total = bidFiles.length;
    const extraction = { status: total ? 'pending' : 'success', completed: 0, total };
    return {
      status: 'pending',
      progress: 0,
      message: '等待正文内容提取完成后开始图片比对',
      signature,
      started_at: startedAt,
      updated_at: updatedAt,
      extraction,
      totalImageCount: 0,
      files: [],
      duplicateImages: [],
    };
  }
  1512. function createDuplicateCheckService({ app, configStore, workspaceStore } = {}) {
  1513. const running = new Map();
  /**
   * Sends the current duplicate-check state on the `duplicate-check:event` channel.
   * Does nothing when no target is given or the target reports itself destroyed.
   *
   * @param {*} webContents - Object exposing `isDestroyed()` and `send()`; may be nullish.
   * @param {*} state - State to publish as `duplicateCheck`.
   * @returns {void}
   */
  function emit(webContents, state) {
    if (!webContents || webContents.isDestroyed()) return;
    webContents.send('duplicate-check:event', { duplicateCheck: state });
  }
  /**
   * Tells whether `signature` still matches the files currently stored in the
   * workspace. An empty signature is always treated as current.
   *
   * @param {string} signature - Signature captured when a run started.
   * @returns {boolean} True when the stored tender/bid files produce the same signature.
   */
  function isCurrentDuplicateCheckSignature(signature) {
    if (!signature) return true;
    const { tenderFile, bidFiles } = workspaceStore.loadDuplicateCheck() || {};
    const storedSignature = createSignature({
      tenderFile: tenderFile || null,
      bidFiles: Array.isArray(bidFiles) ? bidFiles : [],
    });
    return storedSignature === signature;
  }
  /**
   * Merges `partial` into the stored `metadataAnalysis`, stamps `updated_at`, persists
   * it and publishes the resulting state. Skipped when the run's signature no longer
   * matches the workspace.
   *
   * @param {object} partial - Fields to overwrite.
   * @param {*} webContents - Target for the state event.
   * @param {string} signature - Signature of the run issuing the update.
   * @returns {object|null} The state returned by the store, or `null` when skipped.
   */
  function updateAnalysis(partial, webContents, signature) {
    if (!isCurrentDuplicateCheckSignature(signature)) return null;
    const stored = workspaceStore.loadDuplicateCheck() || {};
    const merged = { ...(stored.metadataAnalysis || {}), ...partial, updated_at: now() };
    const next = workspaceStore.updateDuplicateCheck({ metadataAnalysis: merged });
    emit(webContents, next);
    return next;
  }
  /**
   * Merges `partial` into the stored `outlineAnalysis`, stamps `updated_at`, persists
   * it and publishes the resulting state. Skipped when the run's signature no longer
   * matches the workspace.
   *
   * @param {object} partial - Fields to overwrite.
   * @param {*} webContents - Target for the state event.
   * @param {string} signature - Signature of the run issuing the update.
   * @returns {object|null} The state returned by the store, or `null` when skipped.
   */
  function updateOutlineAnalysis(partial, webContents, signature) {
    if (!isCurrentDuplicateCheckSignature(signature)) return null;
    const stored = workspaceStore.loadDuplicateCheck() || {};
    const merged = { ...(stored.outlineAnalysis || {}), ...partial, updated_at: now() };
    const next = workspaceStore.updateDuplicateCheck({ outlineAnalysis: merged });
    emit(webContents, next);
    return next;
  }
  /**
   * Merges `partial` into the stored `contentAnalysis`, stamps `updated_at`, persists
   * it and publishes the resulting state. Skipped when the run's signature no longer
   * matches the workspace.
   *
   * @param {object} partial - Fields to overwrite.
   * @param {*} webContents - Target for the state event.
   * @param {string} signature - Signature of the run issuing the update.
   * @returns {object|null} The state returned by the store, or `null` when skipped.
   */
  function updateContentAnalysis(partial, webContents, signature) {
    if (!isCurrentDuplicateCheckSignature(signature)) return null;
    const stored = workspaceStore.loadDuplicateCheck() || {};
    const merged = { ...(stored.contentAnalysis || {}), ...partial, updated_at: now() };
    const next = workspaceStore.updateDuplicateCheck({ contentAnalysis: merged });
    emit(webContents, next);
    return next;
  }
  /**
   * Merges `partial` into the stored `imageAnalysis`, stamps `updated_at`, persists it
   * and publishes the resulting state. Skipped when the run's signature no longer
   * matches the workspace.
   *
   * @param {object} partial - Fields to overwrite.
   * @param {*} webContents - Target for the state event.
   * @param {string} signature - Signature of the run issuing the update.
   * @returns {object|null} The state returned by the store, or `null` when skipped.
   */
  function updateImageAnalysis(partial, webContents, signature) {
    if (!isCurrentDuplicateCheckSignature(signature)) return null;
    const stored = workspaceStore.loadDuplicateCheck() || {};
    const merged = { ...(stored.imageAnalysis || {}), ...partial, updated_at: now() };
    const next = workspaceStore.updateDuplicateCheck({ imageAnalysis: merged });
    emit(webContents, next);
    return next;
  }
  /**
   * Parses every file to markdown, one after another, and stores each result as
   * `<fileId>.md` in the `contents` folder of the duplicate-check directory.
   *
   * A failing file is recorded with status `error` and does not stop the loop.
   * Progress is published through updateAnalysis before the loop, after each file and
   * once at the end, where the step is `error` if any file failed.
   *
   * @param {Array} allFiles - Files to parse (`file_path`, `file_name`).
   * @param {*} webContents - Target for progress events.
   * @param {string} signature - Signature of the current run.
   * @returns {Promise<object[]>} One result entry per file, in input order.
   */
  async function runContentExtraction(allFiles, webContents, signature) {
    const config = configStore ? configStore.load() : { file_parser: { provider: 'local' } };
    const dir = path.join(getDuplicateCheckDir(app), 'contents');
    await fs.mkdir(dir, { recursive: true });

    const results = [];
    const publish = (status, extra) => updateAnalysis(
      { contentExtraction: { status, completed: results.length, total: allFiles.length }, ...extra },
      webContents,
      signature,
    );

    publish('running', { message: '正在提取正文内容' });
    for (const file of allFiles) {
      const fileId = stableFileId(file);
      try {
        const parsed = await parseDocumentWithConfig(app, file.file_path, config, {
          assetScope: `duplicate-check-content-${fileId}`,
          preserveImages: true,
        });
        const markdown = parsed.trim();
        const contentPath = path.join(dir, `${fileId}.md`);
        await fs.writeFile(contentPath, markdown, 'utf-8');
        results.push({
          file_id: fileId,
          file_name: file.file_name,
          status: 'success',
          content_path: contentPath,
          content_length: markdown.length,
        });
      } catch (error) {
        results.push({
          file_id: fileId,
          file_name: file.file_name,
          status: 'error',
          error: error.message || '正文提取失败',
        });
      }
      publish('running', { contentFiles: results, message: `正文内容提取 ${results.length}/${allFiles.length}` });
    }

    const finalStatus = results.some((item) => item.status === 'error') ? 'error' : 'success';
    publish(finalStatus, { contentFiles: results });
    return results;
  }
  /**
   * Extracts metadata from every bid file, one after another.
   *
   * A failing file is recorded with status `error` and an empty metadata list. After
   * each file, and once more at the end, the results and the rows built from them are
   * published through updateAnalysis; the final step is `error` if any file failed.
   *
   * @param {Array} bidFiles - Bid files to inspect.
   * @param {*} webContents - Target for progress events.
   * @param {string} signature - Signature of the current run.
   * @returns {Promise<object[]>} One result entry per file, in input order.
   */
  async function runMetadataExtraction(bidFiles, webContents, signature) {
    const results = [];
    const publish = (status, extra) => updateAnalysis(
      { metadataExtraction: { status, completed: results.length, total: bidFiles.length }, ...extra },
      webContents,
      signature,
    );

    publish('running', { message: '正在提取投标文件元数据' });
    for (const file of bidFiles) {
      const fileId = stableFileId(file);
      try {
        const metadata = await extractMetadata(file);
        results.push({ file_id: fileId, file_name: file.file_name, status: 'success', metadata });
      } catch (error) {
        results.push({
          file_id: fileId,
          file_name: file.file_name,
          status: 'error',
          error: error.message || '元数据提取失败',
          metadata: [],
        });
      }
      const progressRows = buildRows(results);
      publish('running', { files: results, rows: progressRows, message: `元数据提取 ${results.length}/${bidFiles.length}` });
    }

    const rows = buildRows(results);
    const finalStatus = results.some((item) => item.status === 'error') ? 'error' : 'success';
    publish(finalStatus, { files: results, rows });
    return results;
  }
  /**
   * Reads the extracted markdown of `file` from disk.
   *
   * The file is looked up by its stable id among the extraction results; only an entry
   * with status `success` and a `content_path` qualifies.
   *
   * @param {object[]} contentFiles - Results of the content extraction step.
   * @param {object} file - File whose markdown is wanted.
   * @returns {Promise<string>} The markdown text (UTF-8).
   * @throws {Error} When no successful extraction result exists for the file.
   */
  async function readContentMarkdown(contentFiles, file) {
    const wantedId = stableFileId(file);
    const entry = contentFiles.find((candidate) => (
      candidate.file_id === wantedId && candidate.status === 'success' && candidate.content_path
    ));
    if (!entry) throw new Error('正文内容尚未成功提取,无法进行目录分析');
    return fs.readFile(entry.content_path, 'utf-8');
  }
  /**
   * Extracts an outline from every bid file and compares the outlines.
   *
   * When a tender file is given, its sentences are loaded first and handed to
   * buildOutlineItems; a failure there is only reported as a message. A failing bid
   * file is recorded with status `error` and does not stop the loop. Progress reaches
   * 80 after the last file and 100 once the comparison is stored; the final status is
   * `error` if any file failed.
   *
   * @param {object|null} tenderFile - Tender file, if any.
   * @param {Array} bidFiles - Bid files to analyse.
   * @param {object[]} contentFiles - Results of the content extraction step.
   * @param {string} signature - Signature of the current run.
   * @param {*} webContents - Target for progress events.
   * @returns {Promise<object[]>} One result entry per bid file, in input order.
   */
  async function runOutlineAnalysis(tenderFile, bidFiles, contentFiles, signature, webContents) {
    const publish = (partial) => updateOutlineAnalysis(partial, webContents, signature);
    const countTenderMatches = (entries) => (
      entries.reduce((sum, item) => sum + (item.tender_matched_count || 0), 0)
    );

    publish({
      status: 'running',
      progress: 5,
      extraction: { status: 'running', completed: 0, total: bidFiles.length },
      message: '正在准备目录分析',
    });

    const results = [];
    let tenderSentences = [];
    if (tenderFile) {
      try {
        tenderSentences = splitTenderSentences(await readContentMarkdown(contentFiles, tenderFile));
      } catch (error) {
        publish({ message: `招标文件句子白名单生成失败,继续对比投标文件目录:${error.message || error}` });
      }
    }
    publish({ tenderSentenceCount: tenderSentences.length, message: '正在提取投标文件目录' });

    for (const file of bidFiles) {
      const fileId = stableFileId(file);
      let entry;
      try {
        const markdown = await readContentMarkdown(contentFiles, file);
        const extracted = buildOutlineItems(markdown, tenderSentences);
        const tenderMatchedCount = extracted.items.filter((item) => item.from_tender).length;
        entry = {
          file_id: fileId,
          file_name: file.file_name,
          status: 'success',
          source: extracted.source,
          confidence: extracted.confidence,
          item_count: extracted.items.length,
          tender_matched_count: tenderMatchedCount,
          items: extracted.items,
        };
      } catch (error) {
        entry = {
          file_id: fileId,
          file_name: file.file_name,
          status: 'error',
          item_count: 0,
          tender_matched_count: 0,
          items: [],
          error: error.message || '目录提取失败',
        };
      }
      results.push(entry);
      publish({
        status: 'running',
        progress: bidFiles.length ? Math.round((results.length / bidFiles.length) * 80) : 80,
        extraction: { status: 'running', completed: results.length, total: bidFiles.length },
        files: results,
        tenderSentenceCount: tenderSentences.length,
        tenderMatchedItemCount: countTenderMatches(results),
        message: `目录提取 ${results.length}/${bidFiles.length}`,
      });
    }

    const comparison = buildOutlineComparison(results);
    const failed = results.some((item) => item.status === 'error');
    const finalStatus = failed ? 'error' : 'success';
    publish({
      status: finalStatus,
      progress: 100,
      message: failed ? '部分文件目录分析失败' : '目录分析完成',
      signature,
      extraction: { status: finalStatus, completed: results.length, total: bidFiles.length },
      files: results,
      tenderSentenceCount: tenderSentences.length,
      tenderMatchedItemCount: countTenderMatches(results),
      duplicateGroups: comparison.duplicateGroups,
      pairwiseSimilarities: comparison.pairwiseSimilarities,
    });
    return results;
  }
  /**
   * Finds sentences that occur in more than one bid file.
   *
   * Sentences also present in the tender file (when one is given and readable) are
   * counted as tender matches and excluded. For the rest, the files they occur in and
   * the per-file occurrence counts are collected. A bid file that cannot be processed
   * is reported and skipped; it makes the overall result `error`, in line with the
   * outline and image analyses, so the caller's status check can see the failure.
   *
   * @param {object|null} tenderFile - Tender file, if any.
   * @param {Array} bidFiles - Bid files to compare.
   * @param {object[]} contentFiles - Results of the content extraction step.
   * @param {string} signature - Signature of the current run.
   * @param {*} webContents - Target for progress events.
   * @returns {Promise<{status: string, duplicateSentences: object[]}>} Overall status
   *   (`success` or `error`) and the duplicate sentence list.
   */
  async function runContentDuplicateAnalysis(tenderFile, bidFiles, contentFiles, signature, webContents) {
    updateContentAnalysis({ status: 'running', progress: 5, extraction: { status: 'running', completed: 0, total: bidFiles.length }, message: '正在准备正文比对' }, webContents, signature);
    let tenderSentenceSet = new Set();
    if (tenderFile) {
      try {
        const tenderMarkdown = await readContentMarkdown(contentFiles, tenderFile);
        tenderSentenceSet = new Set(splitContentSentences(tenderMarkdown).map((item) => item.normalized));
      } catch (error) {
        // A missing whitelist is not fatal: the bid files are still compared.
        updateContentAnalysis({ message: `招标文件句子白名单生成失败,继续比对投标正文:${error.message || error}` }, webContents, signature);
      }
    }
    const globalSentences = new Map();
    let totalSentenceCount = 0;
    let tenderMatchedSentenceCount = 0;
    let firstOrder = 0;
    // Running counter instead of bidFiles.indexOf(file): indexOf is a linear scan per
    // use and reports the first position again when the same entry is listed twice.
    let processedCount = 0;
    let failedCount = 0;
    for (const file of bidFiles) {
      const fileId = stableFileId(file);
      try {
        const markdown = await readContentMarkdown(contentFiles, file);
        const sentences = splitContentSentences(markdown);
        totalSentenceCount += sentences.length;
        const local = new Map();
        for (const sentence of sentences) {
          if (tenderSentenceSet.has(sentence.normalized)) {
            tenderMatchedSentenceCount += 1;
            continue;
          }
          const current = local.get(sentence.normalized) || { sentence: sentence.sentence, count: 0, order: firstOrder++ };
          current.count += 1;
          local.set(sentence.normalized, current);
        }
        for (const [normalized, item] of local.entries()) {
          const global = globalSentences.get(normalized) || { sentence: item.sentence, normalized, file_ids: [], occurrences: {}, first_order: item.order };
          if (!global.file_ids.includes(fileId)) global.file_ids.push(fileId);
          global.occurrences[fileId] = item.count;
          globalSentences.set(normalized, global);
        }
      } catch (error) {
        failedCount += 1;
        updateContentAnalysis({ message: `${file.file_name} 正文比对失败:${error.message || error}` }, webContents, signature);
      }
      processedCount += 1;
      updateContentAnalysis({
        status: 'running',
        progress: bidFiles.length ? Math.round((globalSentences.size ? 10 : 5) + (processedCount / bidFiles.length) * 80) : 85,
        tenderSentenceCount: tenderSentenceSet.size,
        tenderMatchedSentenceCount,
        totalSentenceCount,
        extraction: { status: 'running', completed: processedCount, total: bidFiles.length },
        message: `正文比对 ${processedCount}/${bidFiles.length}`,
      }, webContents, signature);
    }
    const duplicateSentences = buildDuplicateSentences(globalSentences);
    const failed = failedCount > 0;
    const status = failed ? 'error' : 'success';
    updateContentAnalysis({
      status,
      progress: 100,
      message: failed ? '部分文件正文比对失败' : '正文比对完成',
      signature,
      tenderSentenceCount: tenderSentenceSet.size,
      tenderMatchedSentenceCount,
      totalSentenceCount,
      extraction: { status, completed: bidFiles.length, total: bidFiles.length },
      duplicateSentences,
    }, webContents, signature);
    return { status, duplicateSentences };
  }
  /**
   * Finds images that occur in more than one bid file by comparing SHA-256 hashes of
   * their bytes.
   *
   * Every image target of a file counts towards `totalImageCount`; targets that cannot
   * be read or are empty are skipped for hashing. A bid file whose markdown cannot be
   * read is recorded with status `error`, which makes the overall result `error`.
   *
   * @param {Array} bidFiles - Bid files to compare.
   * @param {object[]} contentFiles - Results of the content extraction step.
   * @param {string} signature - Signature of the current run.
   * @param {*} webContents - Target for progress events.
   * @returns {Promise<{status: string, duplicateImages: object[]}>} Overall status and
   *   the duplicate image list.
   */
  async function runImageDuplicateAnalysis(bidFiles, contentFiles, signature, webContents) {
    const publish = (partial) => updateImageAnalysis(partial, webContents, signature);
    publish({
      status: 'running',
      progress: 5,
      extraction: { status: 'running', completed: 0, total: bidFiles.length },
      message: '正在准备图片比对',
    });

    const results = [];
    const globalImages = new Map();
    let totalImageCount = 0;

    for (const file of bidFiles) {
      const fileId = stableFileId(file);
      try {
        const markdown = await readContentMarkdown(contentFiles, file);
        const targets = extractImageTargets(markdown);
        totalImageCount += targets.length;

        // hash -> { count, preview_url } for this file only
        const perFile = new Map();
        for (const target of targets) {
          try {
            const buffer = await readImageTargetBuffer(app, target);
            if (!buffer?.length) continue;
            const hash = crypto.createHash('sha256').update(buffer).digest('hex');
            const seen = perFile.get(hash);
            if (seen) {
              seen.count += 1;
            } else {
              // The first target that produced this hash serves as the preview.
              perFile.set(hash, { count: 1, preview_url: target });
            }
          } catch {
            // Ignore individual unreadable images; other images in the same file can still be compared.
          }
        }

        perFile.forEach((item, hash) => {
          let shared = globalImages.get(hash);
          if (!shared) {
            shared = { hash, preview_url: item.preview_url, file_ids: [], occurrences: {} };
            globalImages.set(hash, shared);
          }
          if (!shared.file_ids.includes(fileId)) shared.file_ids.push(fileId);
          shared.occurrences[fileId] = item.count;
        });

        results.push({
          file_id: fileId,
          file_name: file.file_name,
          status: 'success',
          image_count: targets.length,
          unique_image_count: perFile.size,
        });
      } catch (error) {
        results.push({
          file_id: fileId,
          file_name: file.file_name,
          status: 'error',
          image_count: 0,
          unique_image_count: 0,
          error: error.message || '图片比对失败',
        });
      }

      publish({
        status: 'running',
        progress: bidFiles.length ? Math.round((results.length / bidFiles.length) * 85) : 85,
        extraction: { status: 'running', completed: results.length, total: bidFiles.length },
        files: results,
        totalImageCount,
        message: `图片比对 ${results.length}/${bidFiles.length}`,
      });
    }

    const duplicateImages = buildDuplicateImages(globalImages);
    const failed = results.some((item) => item.status === 'error');
    const finalStatus = failed ? 'error' : 'success';
    publish({
      status: finalStatus,
      progress: 100,
      message: failed ? '部分文件图片比对失败' : '图片比对完成',
      signature,
      extraction: { status: finalStatus, completed: results.length, total: bidFiles.length },
      files: results,
      totalImageCount,
      duplicateImages,
    });
    return { status: finalStatus, duplicateImages };
  }
  /**
   * Executes one complete duplicate-check run.
   *
   * Content extraction and metadata extraction run concurrently. Once metadata is
   * done, the outline, content and image analyses are marked as waiting for content.
   * When both extractions have finished, the three analyses run in parallel and the
   * overall result is written to the metadata analysis. Any exception is reported as
   * an `error` state; the run is removed from `running` in every case.
   *
   * @param {string} signature - Signature of the file set; key in `running`.
   * @param {object} payload - Holds `tenderFile` (optional) and `bidFiles`.
   * @param {*} webContents - Target for progress events.
   * @returns {Promise<void>}
   */
  async function run(signature, payload, webContents) {
    const tenderFile = payload.tenderFile || null;
    const bidFiles = Array.isArray(payload.bidFiles) ? payload.bidFiles : [];
    const allFiles = [tenderFile, ...bidFiles].filter(Boolean);
    try {
      const contentPromise = runContentExtraction(allFiles, webContents, signature);
      const metadataPromise = (async () => {
        const files = await runMetadataExtraction(bidFiles, webContents, signature);
        updateOutlineAnalysis({ status: 'running', progress: 1, message: '元数据提取完成,等待正文内容用于目录分析', extraction: { status: 'running', completed: 0, total: bidFiles.length } }, webContents, signature);
        updateContentAnalysis({ status: 'running', progress: 1, message: '元数据提取完成,等待正文内容用于正文比对', extraction: { status: 'running', completed: 0, total: bidFiles.length } }, webContents, signature);
        updateImageAnalysis({ status: 'running', progress: 1, message: '元数据提取完成,等待正文内容用于图片比对', extraction: { status: 'running', completed: 0, total: bidFiles.length } }, webContents, signature);
        return files;
      })();
      // Observe both promises together. Awaiting metadata first would leave the
      // content promise without a handler, so a rejection in that window would be an
      // unhandled rejection. allSettled also ensures neither step is still working
      // once the run is declared finished.
      const [contentOutcome, metadataOutcome] = await Promise.allSettled([contentPromise, metadataPromise]);
      if (metadataOutcome.status === 'rejected') throw metadataOutcome.reason;
      if (contentOutcome.status === 'rejected') throw contentOutcome.reason;
      const contentFiles = contentOutcome.value;
      const metadataFiles = metadataOutcome.value;
      const [outlineFiles, contentResult, imageResult] = await Promise.all([
        runOutlineAnalysis(tenderFile, bidFiles, contentFiles, signature, webContents),
        runContentDuplicateAnalysis(tenderFile, bidFiles, contentFiles, signature, webContents),
        runImageDuplicateAnalysis(bidFiles, contentFiles, signature, webContents),
      ]);
      const failed = contentFiles.some((item) => item.status === 'error')
        || metadataFiles.some((item) => item.status === 'error')
        || outlineFiles.some((item) => item.status === 'error')
        || contentResult.status === 'error'
        || imageResult.status === 'error';
      updateAnalysis({ status: failed ? 'error' : 'success', progress: 100, message: failed ? '部分文件分析失败' : '元数据分析完成' }, webContents, signature);
    } catch (error) {
      updateAnalysis({ status: 'error', progress: 100, message: error.message || '元数据分析失败' }, webContents, signature);
    } finally {
      running.delete(signature);
    }
  }
  1820. return {
  1821. startMetadataAnalysis(payload = {}, webContents) {
  1822. const signature = createSignature(payload);
  1823. const force = payload.force === true;
  1824. const current = workspaceStore.loadDuplicateCheck() || {};
  1825. if (!force
  1826. && current.metadataAnalysis?.signature === signature && current.metadataAnalysis?.status === 'success'
  1827. && current.outlineAnalysis?.signature === signature && current.outlineAnalysis?.status === 'success'
  1828. && current.contentAnalysis?.signature === signature && current.contentAnalysis?.status === 'success'
  1829. && current.imageAnalysis?.signature === signature && current.imageAnalysis?.status === 'success') {
  1830. emit(webContents, current);
  1831. return current.metadataAnalysis;
  1832. }
  1833. if (!force && running.has(signature)) {
  1834. emit(webContents, current);
  1835. return current.metadataAnalysis || { status: 'running', signature };
  1836. }
  1837. const bidFiles = Array.isArray(payload.bidFiles) ? payload.bidFiles : [];
  1838. const metadataAnalysis = createInitialAnalysis(signature, bidFiles);
  1839. const outlineAnalysis = createInitialOutlineAnalysis(signature, bidFiles);
  1840. const contentAnalysis = createInitialContentAnalysis(signature, bidFiles);
  1841. const imageAnalysis = createInitialImageAnalysis(signature, bidFiles);
  1842. const next = workspaceStore.updateDuplicateCheck({ tenderFile: payload.tenderFile || null, bidFiles, metadataAnalysis, outlineAnalysis, contentAnalysis, imageAnalysis });
  1843. emit(webContents, next);
  1844. const promise = run(signature, payload, webContents);
  1845. running.set(signature, promise);
  1846. return metadataAnalysis;
  1847. },
  1848. };
  1849. }
  1850. module.exports = { createDuplicateCheckService };