| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579 |
- const fs = require('node:fs/promises');
- const path = require('node:path');
- const crypto = require('node:crypto');
- const { dialog } = require('electron');
- const AdmZip = require('adm-zip');
- const { getImportedImagesDir } = require('../utils/paths.cjs');
/** Human-readable label for each parser provider id (used in dialogs and results). */
const parserLabels = {
  local: '本地解析',
  'mineru-accurate-api': 'MinerU 精准解析 API',
  'mineru-agent-api': 'MinerU-Agent 轻量解析 API',
};

/** File extensions the local parser accepts. */
const localSupportedExtensions = new Set([
  '.txt',
  '.md',
  '.markdown',
  '.docx',
  '.pdf',
  '.doc',
  '.wps',
]);

/** File extensions accepted when the provider is 'mineru-agent-api'. */
const mineruAgentSupportedExtensions = new Set([
  '.pdf',
  '.doc',
  '.docx',
  '.ppt',
  '.pptx',
  '.png',
  '.jpg',
  '.jpeg',
  '.jp2',
  '.webp',
  '.gif',
  '.bmp',
  '.xls',
  '.xlsx',
]);

/** File extensions accepted when the provider is 'mineru-accurate-api'. */
const mineruAccurateSupportedExtensions = new Set([
  '.pdf',
  '.doc',
  '.docx',
  '.ppt',
  '.pptx',
  '.png',
  '.jpg',
  '.jpeg',
  '.jp2',
  '.webp',
  '.gif',
  '.bmp',
  '.html',
]);

/** File extensions selectable in the duplicate-check file picker. */
const duplicateCheckSupportedExtensions = new Set([
  '.doc',
  '.docx',
  '.wps',
  '.pdf',
  '.md',
  '.markdown',
]);

/** Abort a remote image download after this many milliseconds. */
const remoteImageTimeoutMs = 10000;

/** Matches Markdown images: ![alt](target "optional title"); target may be wrapped in <>. */
const markdownImagePattern = /!\[(?<alt>[^\]]*)\]\((?<target><[^>]+>|[^)\s]+)(?<title>\s+"[^"]*")?\)/gi;

/** Matches an HTML <img> tag, capturing the text before, inside and after its src attribute value. */
const htmlImageSrcPattern = /(<img\b[^>]*?\bsrc=["'])(?<src>[^"']+)(["'][^>]*>)/gi;
/**
 * Wait for a given delay.
 * @param {number} ms - Delay in milliseconds passed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Look up the extension set a parser provider can handle.
 * @param {string} provider - Parser provider id.
 * @returns {Set<string>} The provider's extension set; any unrecognised provider gets the local set.
 */
function getSupportedExtensions(provider) {
  switch (provider) {
    case 'mineru-agent-api':
      return mineruAgentSupportedExtensions;
    case 'mineru-accurate-api':
      return mineruAccurateSupportedExtensions;
    default:
      return localSupportedExtensions;
  }
}
/**
 * Extensions offered in the file picker for a provider.
 * Non-local providers also offer every locally supported extension.
 * @param {string} provider - Parser provider id.
 * @returns {Set<string>} The shared local set for 'local', otherwise a new merged set.
 */
function getSelectableExtensions(provider) {
  if (provider !== 'local') {
    // Provider extensions first, then the local ones, so iteration order is stable.
    const merged = new Set(getSupportedExtensions(provider));
    for (const extension of localSupportedExtensions) {
      merged.add(extension);
    }
    return merged;
  }
  return localSupportedExtensions;
}
/**
 * Decide which parser will handle a file.
 * @param {object} config - Configuration; reads `file_parser.provider` (defaults to 'local').
 * @param {string} filePath - Path whose extension is inspected (case-insensitively).
 * @returns {{provider: string, requestedProvider: string, ext: string, supported: boolean, fallbackToLocal: boolean}}
 *   `fallbackToLocal` is true when a non-local provider cannot handle the extension but the local parser can.
 */
function resolveFileParser(config, filePath) {
  const requestedProvider = config.file_parser?.provider || 'local';
  const ext = path.extname(filePath).toLowerCase();
  const describe = (provider, supported, fallbackToLocal) => ({
    provider,
    requestedProvider,
    ext,
    supported,
    fallbackToLocal,
  });

  if (getSupportedExtensions(requestedProvider).has(ext)) {
    return describe(requestedProvider, true, false);
  }
  const canFallBack = requestedProvider !== 'local' && localSupportedExtensions.has(ext);
  return canFallBack
    ? describe('local', true, true)
    : describe(requestedProvider, false, false);
}
/**
 * Parse a document with the local parser.
 * `.txt` files are read as UTF-8 text; every other extension is passed to
 * `convertPathToMarkdown` from the lazily imported './doc2markdown/convert.mjs'.
 * @param {string} filePath - Path of the document.
 * @param {{preserveImages?: boolean, imageResolver?: Function}} [options] - Forwarded to the converter
 *   as `includeImages` / `imageResolver`.
 * @returns {Promise<string>} File text or the converter's result.
 */
async function parseLocalDocument(filePath, options = {}) {
  const isPlainText = path.extname(filePath).toLowerCase() === '.txt';
  if (isPlainText) {
    return fs.readFile(filePath, 'utf-8');
  }

  // Imported on demand so the converter is only loaded when a non-text file is parsed.
  const { convertPathToMarkdown } = await import('./doc2markdown/convert.mjs');
  const converterOptions = {
    includeImages: options.preserveImages,
    imageResolver: options.imageResolver,
  };
  return convertPathToMarkdown(filePath, converterOptions);
}
/**
 * Turn a parsing failure into a user-facing message.
 * @param {unknown} error - Thrown value; an Error's message is used, anything else is stringified.
 * @returns {string} A dedicated hint for zip-structure errors (invalid DOCX), otherwise the
 *   raw message behind a fixed prefix.
 */
function formatImportError(error) {
  let detail;
  if (error instanceof Error) {
    detail = error.message;
  } else {
    detail = String(error || '未知错误');
  }

  // These phrases are treated as a sign that the file is not a real zip-based DOCX.
  const looksLikeBrokenZip = /Can't find end of central directory|is this a zip file/i.test(detail);
  if (looksLikeBrokenZip) {
    return '文件解析失败:该文件不是有效的 DOCX 文档,请用 Word/WPS 另存为标准 DOCX 后重试';
  }
  return `文件解析失败:${detail || '未知错误'}`;
}
/**
 * Parse a file through the MinerU-Agent API: request an upload URL, upload the
 * file, poll the task until it finishes, then download the resulting Markdown.
 * @param {string} filePath - Local path of the file to parse.
 * @param {{preserveImages?: boolean, assets?: object}} [options] - When `preserveImages` is truthy,
 *   images are rewritten via `rewriteMarkdownImages` (relative to the Markdown URL); otherwise they are stripped.
 * @returns {Promise<string>} The Markdown text.
 * @throws {Error} When the create request fails, the response lacks task_id/file_url,
 *   or the finished task has no markdown_url (plus anything the helpers throw).
 */
async function parseWithMineruAgent(filePath, options = {}) {
  const fileName = path.basename(filePath);
  const createResponse = await fetch('https://mineru.net/api/v1/agent/parse/file', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      file_name: fileName,
      language: 'ch',
      enable_table: true,
      is_ocr: true,
      enable_formula: true,
    }),
  });
  // An error response may not carry a JSON body; fall back to null so the
  // HTTP status is reported instead of a JSON SyntaxError.
  const createResult = await createResponse.json().catch(() => null);
  if (!createResponse.ok || createResult?.code !== 0) {
    throw new Error(`申请 MinerU-Agent 上传链接失败:HTTP ${createResponse.status},${JSON.stringify(createResult)}`);
  }
  const taskId = createResult.data?.task_id;
  const fileUrl = createResult.data?.file_url;
  if (!taskId || !fileUrl) {
    throw new Error(`MinerU-Agent 响应缺少 task_id/file_url:${JSON.stringify(createResult)}`);
  }
  await uploadFile(fileUrl, filePath);
  const finalResult = await pollMineruAgent(taskId, fileName);
  const markdownUrl = finalResult.data.markdown_url;
  if (!markdownUrl) {
    throw new Error('MinerU-Agent 解析完成但未返回 markdown_url');
  }
  const markdown = await downloadText(markdownUrl, '下载 MinerU-Agent Markdown 失败');
  return options.preserveImages
    ? rewriteMarkdownImages(markdown, options.assets, { baseUrl: markdownUrl })
    : stripMarkdownImages(markdown);
}
/**
 * Poll a MinerU-Agent task every 3 seconds until it is done, fails, or 5 minutes elapse.
 * @param {string} taskId - Task id interpolated into the status URL.
 * @param {string} fileName - Used only in the progress log line.
 * @returns {Promise<{raw: object, data: object}>} The full response and its `data` once state is 'done'.
 * @throws {Error} On a failed status request, a 'failed' task state, or timeout.
 */
async function pollMineruAgent(taskId, fileName) {
  const startedAt = Date.now();
  const timeoutMs = 300000;
  const intervalMs = 3000;
  while (Date.now() - startedAt < timeoutMs) {
    const response = await fetch(`https://mineru.net/api/v1/agent/parse/${taskId}`);
    // An error response may not carry a JSON body; fall back to null so the
    // HTTP status is reported instead of a JSON SyntaxError.
    const result = await response.json().catch(() => null);
    if (!response.ok || result?.code !== 0) {
      throw new Error(`查询 MinerU-Agent 任务失败:HTTP ${response.status},${JSON.stringify(result)}`);
    }
    const data = result.data || {};
    if (data.state === 'done') {
      return { raw: result, data };
    }
    if (data.state === 'failed') {
      throw new Error(`MinerU-Agent 解析失败:${data.err_msg || '未知错误'}${data.err_code ? ` (${data.err_code})` : ''}`);
    }
    console.log(`WAIT ${fileName}: ${data.state || 'unknown'}`);
    await sleep(intervalMs);
  }
  throw new Error(`MinerU-Agent 轮询超时,请稍后重试,task_id: ${taskId}`);
}
/**
 * Parse a file through the MinerU accurate-parsing API: request a batch upload URL,
 * upload the file, poll the batch, download the result zip and extract its Markdown.
 * @param {string} filePath - Local path of the file to parse.
 * @param {string} token - MinerU token sent as a Bearer credential; required.
 * @param {{preserveImages?: boolean, assets?: object}} [options] - Forwarded to `extractMarkdownFromZip`.
 * @returns {Promise<string>} The Markdown text.
 * @throws {Error} When the token is missing, the create request fails, the response lacks
 *   batch_id/file_url, or the finished item has no full_zip_url (plus anything the helpers throw).
 */
async function parseWithMineruAccurate(filePath, token, options = {}) {
  if (!token) {
    throw new Error('请先在设置中填写 MinerU Token');
  }
  const fileName = path.basename(filePath);
  const createResponse = await fetch('https://mineru.net/api/v4/file-urls/batch', {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${token}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      files: [{ name: fileName, data_id: makeDataId(fileName), is_ocr: true }],
      model_version: 'vlm',
      language: 'ch',
      enable_table: true,
      enable_formula: true,
    }),
  });
  // An error response may not carry a JSON body; fall back to null so the
  // HTTP status is reported instead of a JSON SyntaxError.
  const createResult = await createResponse.json().catch(() => null);
  if (!createResponse.ok || createResult?.code !== 0) {
    throw new Error(`申请 MinerU 精准解析上传链接失败:HTTP ${createResponse.status},${JSON.stringify(createResult)}`);
  }
  const batchId = createResult.data?.batch_id;
  const fileUrl = createResult.data?.file_urls?.[0];
  if (!batchId || !fileUrl) {
    throw new Error(`MinerU 精准解析响应缺少 batch_id/file_url:${JSON.stringify(createResult)}`);
  }
  await uploadFile(fileUrl, filePath);
  const finalResult = await pollMineruAccurate(token, batchId, fileName);
  const fullZipUrl = finalResult.item.full_zip_url;
  if (!fullZipUrl) {
    throw new Error('MinerU 精准解析完成但未返回 full_zip_url');
  }
  const zipBuffer = await downloadBuffer(fullZipUrl);
  return extractMarkdownFromZip(zipBuffer, options);
}
/**
 * Poll a MinerU accurate-parsing batch every 5 seconds until the file's item is done,
 * fails, or 10 minutes elapse.
 * @param {string} token - MinerU token sent as a Bearer credential.
 * @param {string} batchId - Batch id interpolated into the status URL.
 * @param {string} fileName - Used to pick the matching result item (falls back to the first item) and for logging.
 * @returns {Promise<{raw: object, item: object}>} The full response and the finished item.
 * @throws {Error} On a failed status request, a 'failed' item state, or timeout.
 */
async function pollMineruAccurate(token, batchId, fileName) {
  const startedAt = Date.now();
  const timeoutMs = 600000;
  const intervalMs = 5000;
  while (Date.now() - startedAt < timeoutMs) {
    const response = await fetch(`https://mineru.net/api/v4/extract-results/batch/${batchId}`, {
      headers: { Authorization: `Bearer ${token}`, Accept: '*/*' },
    });
    // An error response may not carry a JSON body; fall back to null so the
    // HTTP status is reported instead of a JSON SyntaxError.
    const result = await response.json().catch(() => null);
    if (!response.ok || result?.code !== 0) {
      throw new Error(`查询 MinerU 精准解析任务失败:HTTP ${response.status},${JSON.stringify(result)}`);
    }
    const items = result.data?.extract_result || [];
    const item = items.find((candidate) => candidate.file_name === fileName) || items[0];
    if (item?.state === 'done') {
      return { raw: result, item };
    }
    if (item?.state === 'failed') {
      throw new Error(`MinerU 精准解析失败:${item.err_msg || '未知错误'}`);
    }
    console.log(`WAIT ${fileName}: ${item?.state || 'unknown'}`);
    await sleep(intervalMs);
  }
  throw new Error(`MinerU 精准解析轮询超时,请稍后重试,batch_id: ${batchId}`);
}
/**
 * Upload a local file to a URL with an HTTP PUT.
 * @param {string} fileUrl - Destination URL.
 * @param {string} filePath - Local file whose whole content is sent as the body.
 * @returns {Promise<void>}
 * @throws {Error} When the response status is not OK; the message includes status and response text.
 */
async function uploadFile(fileUrl, filePath) {
  const body = await fs.readFile(filePath);
  const response = await fetch(fileUrl, { method: 'PUT', body });
  if (response.ok) {
    return;
  }
  const detail = await response.text();
  throw new Error(`文件上传失败:HTTP ${response.status},${detail}`);
}
/**
 * Fetch a URL and return its body as text.
 * @param {string} url - URL to download.
 * @param {string} fallbackMessage - Prefix of the error message on a non-OK response.
 * @returns {Promise<string>} Response body text.
 * @throws {Error} When the response status is not OK.
 */
async function downloadText(url, fallbackMessage) {
  const response = await fetch(url);
  if (response.ok) {
    return response.text();
  }
  throw new Error(`${fallbackMessage}:HTTP ${response.status}`);
}
/**
 * Fetch a URL and return its body as a Buffer.
 * @param {string} url - URL to download.
 * @returns {Promise<Buffer>} Response body bytes.
 * @throws {Error} When the response status is not OK.
 */
async function downloadBuffer(url) {
  const response = await fetch(url);
  if (response.ok) {
    const bytes = await response.arrayBuffer();
    return Buffer.from(bytes);
  }
  throw new Error(`下载 MinerU 精准解析结果失败:HTTP ${response.status}`);
}
/**
 * Pull the Markdown document out of a result zip.
 * An entry named `full.md` (at any depth, case-insensitive) wins; otherwise the first `.md` entry is used.
 * @param {Buffer} zipBuffer - Zip archive content handed to AdmZip.
 * @param {{preserveImages?: boolean, assets?: object}} [options] - When `preserveImages` is truthy,
 *   images are rewritten against the zip entries; otherwise they are stripped.
 * @returns {Promise<string>} The processed Markdown.
 * @throws {Error} When the archive contains no Markdown entry.
 */
async function extractMarkdownFromZip(zipBuffer, options = {}) {
  const entries = new AdmZip(zipBuffer).getEntries();
  const isFullMd = (entry) => /(^|[/\\])full\.md$/i.test(entry.entryName);
  const isAnyMd = (entry) => entry.entryName.toLowerCase().endsWith('.md');

  const target = entries.find(isFullMd) || entries.find(isAnyMd);
  if (!target) {
    throw new Error('MinerU 精准解析结果 zip 中未找到 Markdown 文件');
  }

  const markdown = target.getData().toString('utf8');
  if (options.preserveImages) {
    return rewriteMarkdownImages(markdown, options.assets, {
      zipEntries: entries,
      markdownEntryName: target.entryName,
    });
  }
  return stripMarkdownImages(markdown);
}
/**
 * Derive a data id from a file name: every run of characters outside
 * `A-Z a-z 0-9 _ . -` becomes one underscore, and the result is cut to 96 characters.
 * @param {string} fileName - Source file name.
 * @returns {string} The sanitised id, or 'document' when nothing is left.
 */
function makeDataId(fileName) {
  const sanitized = fileName.replace(/[^A-Za-z0-9_.-]+/g, '_');
  const truncated = sanitized.slice(0, 96);
  return truncated === '' ? 'document' : truncated;
}
/**
 * Describe a selected local file.
 * @param {string} filePath - Path of the file; it is stat'ed, so it must exist.
 * @returns {Promise<{id: string, file_name: string, file_path: string, extension: string, size: number, modified_at: string}>}
 *   `id` is the SHA-1 hex digest of the path string; `extension` is lower-cased;
 *   `modified_at` is the mtime in ISO format.
 */
async function createLocalFileSelection(filePath) {
  const { size, mtime } = await fs.stat(filePath);
  const pathDigest = crypto.createHash('sha1').update(filePath).digest('hex');
  return {
    id: pathDigest,
    file_name: path.basename(filePath),
    file_path: filePath,
    extension: path.extname(filePath).toLowerCase(),
    size,
    modified_at: mtime.toISOString(),
  };
}
/**
 * Remove every Markdown image and HTML <img> tag from a text, then collapse
 * runs of three or more newlines down to two.
 * @param {unknown} text - Input; falsy values are treated as an empty string.
 * @returns {string} Text without images.
 */
function stripMarkdownImages(text) {
  const source = String(text || '');
  const withoutMarkdownImages = source.replace(markdownImagePattern, '');
  const withoutHtmlImages = withoutMarkdownImages.replace(/<img\b[^>]*>/gi, '');
  // Removing images can leave stacks of blank lines behind.
  return withoutHtmlImages.replace(/\n{3,}/g, '\n\n');
}
/**
 * Create the bookkeeping object for one batch of imported images.
 * @param {object} app - Object expected to expose `getPath`; without it no context is created.
 * @param {string} [scope='documents'] - Label used in the batch id; characters outside
 *   `A-Z a-z 0-9 . _ -` are replaced with underscores.
 * @returns {{baseDir: string, urlPrefix: string, index: number} | null}
 *   A batch directory under the imported-images dir, the matching `yibiao-asset://` URL prefix,
 *   and an image counter starting at 0; null when `app.getPath` is missing.
 */
function createAssetContext(app, scope = 'documents') {
  if (!app?.getPath) {
    return null;
  }
  const rawScope = String(scope || 'documents');
  const safeScope = rawScope.replace(/[^A-Za-z0-9._-]+/g, '_') || 'documents';
  // Timestamp plus a random suffix keeps batches apart.
  const uniqueSuffix = crypto.randomUUID().slice(0, 8);
  const batchId = `${safeScope}-${Date.now()}-${uniqueSuffix}`;
  const baseDir = path.join(getImportedImagesDir(app), batchId);
  const urlPrefix = `yibiao-asset://imported-images/${encodeURIComponent(batchId)}`;
  return { baseDir, urlPrefix, index: 0 };
}
/**
 * Remove the directory of an image batch, including its contents.
 * @param {{baseDir?: string} | null | undefined} assets - Asset context; nothing happens without a `baseDir`.
 * @returns {Promise<void>}
 */
async function deleteImportedImageAssets(assets) {
  const batchDir = assets?.baseDir;
  if (batchDir) {
    // `force` makes a missing directory a no-op instead of an error.
    await fs.rm(batchDir, { recursive: true, force: true });
  }
}
/**
 * Map a MIME type string to an image file extension by substring match.
 * @param {unknown} mime - MIME type; matched case-insensitively, falsy values are treated as empty.
 * @returns {string} '.jpg', '.png', '.gif', '.bmp' or '.webp'; '' when nothing matches.
 */
function imageExtensionFromMime(mime) {
  const normalized = String(mime || '').toLowerCase();
  // Checked in order; the first rule with a matching keyword wins.
  const rules = [
    [['jpeg', 'jpg'], '.jpg'],
    [['png'], '.png'],
    [['gif'], '.gif'],
    [['bmp'], '.bmp'],
    [['webp'], '.webp'],
  ];
  const matched = rules.find(([keywords]) => keywords.some((keyword) => normalized.includes(keyword)));
  return matched ? matched[1] : '';
}
/**
 * Derive an image extension from a path or URL, ignoring any query string or fragment.
 * @param {unknown} value - Path or URL; falsy values are treated as empty.
 * @returns {string} Lower-cased extension for png/jpg/jpeg/gif/bmp/webp ('.jpeg' becomes '.jpg'); '' otherwise.
 */
function imageExtensionFromPath(value) {
  const [withoutQuery] = String(value || '').split(/[?#]/);
  const ext = path.extname(withoutQuery).toLowerCase();
  if (ext === '.jpeg') {
    return '.jpg';
  }
  const recognised = ['.png', '.jpg', '.gif', '.bmp', '.webp'];
  return recognised.includes(ext) ? ext : '';
}
/**
 * Write an image into the batch directory and return its asset URL.
 * Files are named `image-NNNN<ext>` using the context's running counter, which is incremented.
 * @param {{baseDir: string, urlPrefix: string, index: number} | null} assets - Asset context (mutated: `index`).
 * @param {Buffer} buffer - Image bytes; an empty or missing buffer is ignored.
 * @param {string} sourceName - Original name/URL, used as a fallback source for the extension.
 * @param {string} mime - MIME type, preferred source for the extension; '.png' is the last resort.
 * @returns {Promise<string | null>} The asset URL, or null when there is no context or no data.
 */
async function saveImportedImage(assets, buffer, sourceName, mime) {
  const hasPayload = Boolean(assets) && Boolean(buffer?.length);
  if (!hasPayload) {
    return null;
  }

  const ext = imageExtensionFromMime(mime) || imageExtensionFromPath(sourceName) || '.png';
  assets.index += 1;
  const sequence = String(assets.index).padStart(4, '0');
  const fileName = `image-${sequence}${ext}`;

  await fs.mkdir(assets.baseDir, { recursive: true });
  const destination = path.join(assets.baseDir, fileName);
  await fs.writeFile(destination, buffer);
  return `${assets.urlPrefix}/${encodeURIComponent(fileName)}`;
}
/**
 * Build the callback that stores an extracted image in the given asset context.
 * @param {object | null} assets - Asset context; without one no resolver is created.
 * @returns {((image: {buffer: Buffer | ArrayLike<number>, mime: string, sourceName: string}) => Promise<string | null>) | null}
 *   A function forwarding to `saveImportedImage` (non-Buffer data is converted first), or null.
 */
function createImageResolver(assets) {
  if (!assets) {
    return null;
  }
  return ({ buffer, mime, sourceName }) => {
    const bytes = Buffer.isBuffer(buffer) ? buffer : Buffer.from(buffer);
    return saveImportedImage(assets, bytes, sourceName, mime);
  };
}
/**
 * Normalise a Markdown image target: trim whitespace and unwrap a `<...>` wrapper.
 * @param {unknown} target - Raw target text; falsy values are treated as empty.
 * @returns {string} The cleaned target.
 */
function cleanMarkdownImageTarget(target) {
  const trimmed = String(target || '').trim();
  const isAngleWrapped = trimmed.startsWith('<') && trimmed.endsWith('>');
  if (isAngleWrapped) {
    return trimmed.slice(1, -1);
  }
  return trimmed;
}
/**
 * Decode a base64 `data:` URL of the form `data:<mime>;base64,<payload>`.
 * @param {unknown} value - Candidate URL; falsy values are treated as empty.
 * @returns {{mime: string, buffer: Buffer} | null} MIME type and decoded bytes, or null when the form does not match.
 */
function parseDataUrl(value) {
  const parsed = String(value || '').match(/^data:([^;,]+);base64,(.+)$/i);
  if (parsed === null) {
    return null;
  }
  const [, mime, payload] = parsed;
  return { mime, buffer: Buffer.from(payload, 'base64') };
}
/**
 * Download an image, aborting the request after `remoteImageTimeoutMs`.
 * @param {string} url - Image URL.
 * @returns {Promise<{buffer: Buffer, mime: string} | null>} Bytes and content type; null for a non-OK
 *   response or a content type that is present but not `image/*`.
 * @throws Whatever `fetch` rejects with, including the abort triggered by the timeout.
 */
async function loadRemoteImage(url) {
  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), remoteImageTimeoutMs);
  try {
    const response = await fetch(url, { signal: aborter.signal });
    if (!response.ok) {
      return null;
    }
    const mime = response.headers.get('content-type') || '';
    // A missing content type is tolerated; an explicit non-image one is not.
    const isDeclaredNonImage = Boolean(mime) && !/^image\//i.test(mime);
    if (isDeclaredNonImage) {
      return null;
    }
    const bytes = await response.arrayBuffer();
    return { buffer: Buffer.from(bytes), mime };
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Locate the zip entry an image reference points to.
 * The reference is URL-decoded when possible and compared case-insensitively, first as an exact
 * path (as written, and relative to the Markdown file's directory), then by base name only.
 * @param {Array<{entryName: string}>} zipEntries - Entries of the archive.
 * @param {string} imagePath - Image reference taken from the Markdown.
 * @param {string} markdownEntryName - Entry name of the Markdown file inside the archive.
 * @returns {object | undefined} The matching entry, if any.
 */
function findZipEntryImage(zipEntries, imagePath, markdownEntryName) {
  const toPosix = (value) => value.replace(/\\/g, '/');
  const canonical = (value) => value.replace(/^\/+/, '').toLowerCase();

  let decodedPath;
  try {
    decodedPath = decodeURIComponent(imagePath);
  } catch {
    // Malformed escape sequences: fall back to the reference as written.
    decodedPath = imagePath;
  }

  const normalized = toPosix(decodedPath).replace(/^\.\//, '');
  const markdownDir = path.posix.dirname(toPosix(String(markdownEntryName || '')));
  const relativeToMarkdown = path.posix.normalize(
    path.posix.join(markdownDir === '.' ? '' : markdownDir, normalized),
  );
  const candidates = [canonical(normalized), canonical(relativeToMarkdown)];

  const exact = zipEntries.find((entry) => candidates.includes(canonical(toPosix(entry.entryName))));
  if (exact) {
    return exact;
  }

  // Last resort: match on file name alone, wherever it lives in the archive.
  const wantedName = path.posix.basename(normalized).toLowerCase();
  return zipEntries.find(
    (entry) => path.posix.basename(toPosix(entry.entryName)).toLowerCase() === wantedName,
  );
}
/**
 * Check whether a path is the given directory itself or lies inside it.
 * Only a leading `..` path segment counts as escaping the directory, so entries
 * whose name merely begins with two dots (e.g. `..images/a.png`) are accepted.
 * @param {string} baseDir - Directory that must contain the target.
 * @param {string} targetPath - Path to test.
 * @returns {boolean} True when `targetPath` equals `baseDir` or is located beneath it.
 */
function isPathInsideDirectory(baseDir, targetPath) {
  const relative = path.relative(baseDir, targetPath);
  if (relative === '') {
    return true;
  }
  // An absolute result means there is no relative route between the two paths.
  if (path.isAbsolute(relative)) {
    return false;
  }
  return relative !== '..' && !relative.startsWith(`..${path.sep}`);
}
/**
 * Import one image reference into the asset store and return its asset URL.
 * Sources are tried in this order: an existing `yibiao-asset://` URL (returned as is), a base64
 * data URL, a remote download (absolute http(s) URL, or any reference when `context.baseUrl` is set),
 * an entry of `context.zipEntries`, and finally a file below `context.localBaseDir`.
 * @param {string} source - Raw image reference.
 * @param {object | null} assets - Asset context passed on to `saveImportedImage`.
 * @param {{baseUrl?: string, zipEntries?: Array<object>, markdownEntryName?: string, localBaseDir?: string}} [context]
 * @returns {Promise<string | null>} The asset URL, or null when the image cannot be resolved.
 */
async function resolveImageToAssetUrl(source, assets, context = {}) {
  const target = cleanMarkdownImageTarget(source);
  if (!target) {
    return null;
  }
  if (/^yibiao-asset:\/\//i.test(target)) {
    return target;
  }

  const inline = parseDataUrl(target);
  if (inline) {
    return saveImportedImage(assets, inline.buffer, 'data-image', inline.mime);
  }

  const isHttpUrl = /^https?:\/\//i.test(target);
  if (isHttpUrl || context.baseUrl) {
    try {
      const remoteUrl = isHttpUrl ? target : new URL(target, context.baseUrl).toString();
      const fetched = await loadRemoteImage(remoteUrl);
      if (fetched) {
        return saveImportedImage(assets, fetched.buffer, remoteUrl, fetched.mime);
      }
      // An unusable download falls through to the zip/local lookups below.
    } catch {
      return null;
    }
  }

  if (context.zipEntries) {
    const zipEntry = findZipEntryImage(context.zipEntries, target, context.markdownEntryName);
    if (zipEntry && !zipEntry.isDirectory) {
      return saveImportedImage(assets, zipEntry.getData(), zipEntry.entryName, '');
    }
  }

  // Local lookup only applies to scheme-less references.
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(target);
  if (!context.localBaseDir || hasScheme) {
    return null;
  }
  try {
    let relativeTarget;
    try {
      relativeTarget = decodeURIComponent(target);
    } catch {
      relativeTarget = target;
    }
    if (path.isAbsolute(relativeTarget)) {
      return null;
    }
    const rootDir = path.resolve(context.localBaseDir);
    const candidatePath = path.resolve(rootDir, relativeTarget);
    // Refuse references that climb out of the document's directory.
    if (!isPathInsideDirectory(rootDir, candidatePath)) {
      return null;
    }
    const bytes = await fs.readFile(candidatePath);
    return saveImportedImage(assets, bytes, candidatePath, '');
  } catch {
    return null;
  }
}
/**
 * Re-point every image in a Markdown text at an imported asset URL.
 * Markdown images keep their alt text and optional title; HTML <img> tags keep everything
 * except the `src` value. Images that cannot be resolved are removed from the text.
 * @param {unknown} markdown - Markdown source; falsy values are treated as empty.
 * @param {object | null} assets - Asset context passed on to `resolveImageToAssetUrl`.
 * @param {object} [context] - Lookup context (base URL, zip entries, local base dir) passed on unchanged.
 * @returns {Promise<string>} The rewritten Markdown.
 */
async function rewriteMarkdownImages(markdown, assets, context = {}) {
  let result = await replaceMatchesAsync(String(markdown || ''), markdownImagePattern, async (match) => {
    const nextUrl = await resolveImageToAssetUrl(match.groups?.target || '', assets, context);
    if (!nextUrl) {
      return '';
    }
    const alt = match.groups?.alt || '';
    // The captured title still carries its leading whitespace and quotes.
    const title = match.groups?.title || '';
    return `![${alt}](${nextUrl}${title})`;
  });
  result = await replaceMatchesAsync(result, htmlImageSrcPattern, async (match) => {
    const nextUrl = await resolveImageToAssetUrl(match.groups?.src || '', assets, context);
    return nextUrl ? `${match[1]}${nextUrl}${match[3]}` : '';
  });
  return result;
}
/**
 * Asynchronous counterpart of `String.prototype.replace` with a callback:
 * each match of `pattern` is replaced by the awaited result of `createReplacement`.
 * Replacements are produced one after another, in match order.
 * @param {string} text - Text to process.
 * @param {RegExp} pattern - Global pattern (as required by `matchAll`).
 * @param {(match: RegExpMatchArray) => Promise<string> | string} createReplacement - Produces the replacement text.
 * @returns {Promise<string>} The new text; `text` itself is returned when nothing matches.
 */
async function replaceMatchesAsync(text, pattern, createReplacement) {
  const found = Array.from(String(text || '').matchAll(pattern));
  if (found.length === 0) {
    return text;
  }

  const segments = [];
  let cursor = 0;
  for (const hit of found) {
    const start = hit.index ?? 0;
    // Untouched text before the match, then the replacement for the match.
    segments.push(text.slice(cursor, start), await createReplacement(hit));
    cursor = start + hit[0].length;
  }
  segments.push(text.slice(cursor));
  return segments.join('');
}
/**
 * Parse a document to Markdown with the parser selected by the configuration.
 * @param {object} app - Passed to `createAssetContext` when images are preserved.
 * @param {string} filePath - Document to parse.
 * @param {object} config - Configuration; reads `file_parser.provider` (via `resolveFileParser`)
 *   and `file_parser.mineru_token`.
 * @param {{preserveImages?: boolean, assetScope?: string}} [options] - Images are kept only when
 *   `preserveImages` is exactly `true`; `assetScope` labels the image batch (default 'documents').
 * @returns {Promise<string>} Markdown, with images stripped unless they are preserved.
 * @throws {Error} When the format is unsupported, or when parsing fails (imported images of the
 *   failed run are deleted before the error is rethrown).
 */
async function parseDocumentWithConfig(app, filePath, config, options = {}) {
  const { supported, provider, requestedProvider } = resolveFileParser(config, filePath);
  if (!supported) {
    throw new Error(`当前${parserLabels[requestedProvider] || '解析方式'}不支持该文件格式`);
  }

  const preserveImages = options.preserveImages === true;
  const assets = preserveImages ? createAssetContext(app, options.assetScope || 'documents') : null;
  const parseOptions = { preserveImages, assets, imageResolver: createImageResolver(assets) };

  let markdown = '';
  try {
    switch (provider) {
      case 'mineru-agent-api':
        markdown = await parseWithMineruAgent(filePath, parseOptions);
        break;
      case 'mineru-accurate-api':
        markdown = await parseWithMineruAccurate(filePath, config.file_parser?.mineru_token || '', parseOptions);
        break;
      default: {
        const localMarkdown = await parseLocalDocument(filePath, parseOptions);
        if (preserveImages) {
          // Relative image references are looked up next to the source document.
          markdown = await rewriteMarkdownImages(localMarkdown, assets, { localBaseDir: path.dirname(filePath) });
        } else {
          markdown = stripMarkdownImages(localMarkdown);
        }
      }
    }
  } catch (error) {
    // Best-effort cleanup: a failed delete must not mask the parsing error.
    await deleteImportedImageAssets(assets).catch(() => undefined);
    throw error;
  }

  if (preserveImages) {
    return markdown;
  }
  return stripMarkdownImages(markdown);
}
/**
 * Build the file service exposing the document-import and duplicate-check file pickers.
 * @param {{app?: object, configStore?: {load: Function}}} [deps] - `app` is passed to the parser;
 *   `configStore.load()` supplies the configuration (a local-parser config is used without a store).
 * @returns {{importDocument: Function, selectDuplicateCheckFiles: Function}} The service object.
 */
function createFileService({ app, configStore } = {}) {
  // Dialog filters expect extensions without the leading dot.
  const toDialogExtensions = (extensions) => Array.from(extensions, (item) => item.slice(1));
  const allFilesFilter = () => ({ name: '所有文件', extensions: ['*'] });

  /**
   * Let the user pick one document and parse it to Markdown (images are not preserved).
   * @returns {Promise<object>} `{ success, message, ... }`; on success also `file_content`,
   *   `file_name`, `parser_provider` and `parser_label`.
   */
  async function importDocument() {
    const config = configStore ? configStore.load() : { file_parser: { provider: 'local' } };
    const provider = config.file_parser?.provider || 'local';
    const supportedExtensions = getSelectableExtensions(provider);

    const picked = await dialog.showOpenDialog({
      title: '选择招标文件',
      properties: ['openFile'],
      filters: [
        { name: parserLabels[provider] || '招标文件', extensions: toDialogExtensions(supportedExtensions) },
        allFilesFilter(),
      ],
    });
    if (picked.canceled || picked.filePaths.length === 0) {
      return { success: false, message: '已取消选择' };
    }

    const [filePath] = picked.filePaths;
    const ext = path.extname(filePath).toLowerCase();
    const parser = resolveFileParser(config, filePath);
    // The "all files" filter lets unsupported formats through, so check again.
    if (!supportedExtensions.has(ext)) {
      return { success: false, message: `当前${parserLabels[provider] || '解析方式'}不支持该文件格式` };
    }

    const fileName = path.basename(filePath);
    const parserInfo = {
      parser_provider: parser.provider,
      parser_label: parserLabels[parser.provider] || '本地解析',
    };

    let fileContent = '';
    try {
      const parsed = await parseDocumentWithConfig(app, filePath, config, {
        assetScope: 'technical-plan',
        preserveImages: false,
      });
      fileContent = parsed.trim();
    } catch (error) {
      return {
        success: false,
        message: formatImportError(error),
        file_name: fileName,
        ...parserInfo,
      };
    }

    if (!fileContent) {
      return { success: false, message: '未提取到有效 Markdown 内容,请检查文件内容' };
    }
    return {
      success: true,
      message: parser.fallbackToLocal ? '文件解析完成,当前格式已自动使用本地解析' : '文件解析完成',
      file_content: fileContent,
      file_name: fileName,
      ...parserInfo,
    };
  }

  /**
   * Let the user pick files for the duplicate check.
   * @param {{multiple?: boolean}} [options] - Multi-selection is on unless `multiple` is exactly `false`.
   * @returns {Promise<{success: boolean, message: string, files: Array<object>}>}
   *   Descriptions of the selected files that have a supported extension.
   */
  async function selectDuplicateCheckFiles(options = {}) {
    const multiple = options?.multiple !== false;
    const picked = await dialog.showOpenDialog({
      title: multiple ? '选择投标文件' : '选择招标文件',
      properties: multiple ? ['openFile', 'multiSelections'] : ['openFile'],
      filters: [
        { name: '标书文档', extensions: toDialogExtensions(duplicateCheckSupportedExtensions) },
        allFilesFilter(),
      ],
    });
    if (picked.canceled || picked.filePaths.length === 0) {
      return { success: false, message: '已取消选择', files: [] };
    }

    const isSupported = (filePath) => duplicateCheckSupportedExtensions.has(path.extname(filePath).toLowerCase());
    const supportedPaths = picked.filePaths.filter(isSupported);
    if (supportedPaths.length === 0) {
      return { success: false, message: '未选择支持的文件类型', files: [] };
    }

    const files = await Promise.all(supportedPaths.map(createLocalFileSelection));
    return {
      success: true,
      message: `已选择 ${files.length} 个文件`,
      files,
    };
  }

  return { importDocument, selectDuplicateCheckFiles };
}
- module.exports = {
- createFileService,
- parseDocumentWithConfig,
- resolveFileParser,
- };
|