convert.mjs 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036
import { spawn } from 'node:child_process';
import { existsSync } from 'node:fs';
import { copyFile, mkdtemp, open, readFile, readdir, rm } from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { pathToFileURL } from 'node:url';
import chardet from 'chardet';
import * as cheerio from 'cheerio';
import iconv from 'iconv-lite';
import mammoth from 'mammoth';
import { lookup as lookupMimeType } from 'mime-types';
import { PDFParse } from 'pdf-parse';
import { getDocument, OPS } from 'pdfjs-dist/legacy/build/pdf.mjs';
import TurndownService from 'turndown';
import turndownPluginGfm from 'turndown-plugin-gfm';
  16. const MARKDOWN_SUFFIXES = new Set(['.md', '.markdown']);
  17. const DOCX_SUFFIXES = new Set(['.docx']);
  18. const PDF_SUFFIXES = new Set(['.pdf']);
  19. const LEGACY_WORD_SUFFIXES = new Set(['.doc', '.wps']);
  20. const PDF_HEADER = Buffer.from('%PDF-');
  21. const ZIP_LOCAL_FILE_HEADER = Buffer.from([0x50, 0x4b, 0x03, 0x04]);
  22. const OLE_COMPOUND_HEADER = Buffer.from([0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1]);
  23. const MARKDOWN_IMAGE_PATTERN = /!\[(?<alt>[^\]]*)\]\((?<target><[^>]+>|[^)\s]+)(?<title>\s+"[^"]*")?\)/gi;
  24. const PDF_POINT_TOLERANCE = 2.5;
  25. const PDF_GRID_MIN_LINE_LENGTH = 12;
  26. const PDF_GRID_MIN_WIDTH = 40;
  27. const PDF_GRID_MIN_HEIGHT = 12;
  28. const PDF_TEXT_DUPLICATE_TOLERANCE = 1;
  29. const PDF_TEXT_LINE_TOLERANCE = 3;
  30. const { gfm } = turndownPluginGfm;
  31. const PDF_OP_NAMES = Object.fromEntries(Object.entries(OPS).map(([name, value]) => [value, name]));
  32. export class ConversionError extends Error {
  33. constructor(code, message, details = {}) {
  34. super(message);
  35. this.name = 'ConversionError';
  36. this.code = code;
  37. this.details = details;
  38. }
  39. }
  40. export async function convertPathToMarkdown(inputPath, options = {}) {
  41. const resolvedPath = path.resolve(inputPath);
  42. const includeImages = Boolean(options.includeImages);
  43. const imageResolver = typeof options.imageResolver === 'function' ? options.imageResolver : null;
  44. const format = await detectFileFormat(resolvedPath);
  45. if (format === 'markdown') {
  46. return convertMarkdownFile(resolvedPath, includeImages, imageResolver);
  47. }
  48. if (format === 'docx') {
  49. return convertDocxFile(resolvedPath, includeImages, imageResolver);
  50. }
  51. if (format === 'pdf') {
  52. return convertPdfFile(resolvedPath, includeImages, imageResolver);
  53. }
  54. if (format === 'legacy_word') {
  55. return convertLegacyWordFile(resolvedPath, includeImages, imageResolver);
  56. }
  57. throw new ConversionError('unsupported_format', '不支持的文件格式', {
  58. inputPath: resolvedPath,
  59. format,
  60. });
  61. }
  62. export async function detectFileFormat(inputPath) {
  63. const suffix = path.extname(inputPath).toLowerCase();
  64. const header = await readFileHeader(inputPath, 8);
  65. if (isPdfHeader(header)) {
  66. return 'pdf';
  67. }
  68. if (isZipHeader(header)) {
  69. return 'docx';
  70. }
  71. if (isOleCompoundHeader(header)) {
  72. return 'legacy_word';
  73. }
  74. if (MARKDOWN_SUFFIXES.has(suffix)) {
  75. return 'markdown';
  76. }
  77. if (PDF_SUFFIXES.has(suffix)) {
  78. return 'pdf';
  79. }
  80. if (DOCX_SUFFIXES.has(suffix)) {
  81. return 'docx';
  82. }
  83. if (LEGACY_WORD_SUFFIXES.has(suffix)) {
  84. return 'legacy_word';
  85. }
  86. return 'unknown';
  87. }
  88. function isPdfHeader(header) {
  89. return header.subarray(0, PDF_HEADER.length).equals(PDF_HEADER);
  90. }
  91. function isZipHeader(header) {
  92. return header.subarray(0, ZIP_LOCAL_FILE_HEADER.length).equals(ZIP_LOCAL_FILE_HEADER);
  93. }
  94. function isOleCompoundHeader(header) {
  95. return header.subarray(0, OLE_COMPOUND_HEADER.length).equals(OLE_COMPOUND_HEADER);
  96. }
  97. async function readFileHeader(inputPath, bytes) {
  98. const buffer = await readFile(inputPath);
  99. return buffer.subarray(0, bytes);
  100. }
  101. async function convertMarkdownFile(inputPath, includeImages, imageResolver) {
  102. const raw = await readFile(inputPath);
  103. const detected = chardet.detect(raw) || 'UTF-8';
  104. let text = iconv.decode(raw, normalizeEncoding(detected));
  105. text = text.replace(/^\uFEFF/, '');
  106. text = normalizeNewlinesOnly(text);
  107. if (includeImages) {
  108. text = await inlineLocalMarkdownImages(text, path.dirname(inputPath), imageResolver);
  109. } else {
  110. text = stripMarkdownImages(text);
  111. }
  112. return ensureTrailingNewline(text.trimEnd());
  113. }
  114. function normalizeEncoding(value) {
  115. const normalized = String(value).toLowerCase();
  116. if (normalized === 'utf-8' || normalized === 'utf8') {
  117. return 'utf8';
  118. }
  119. if (normalized === 'gb18030' || normalized === 'gbk' || normalized === 'gb2312') {
  120. return 'gb18030';
  121. }
  122. return value;
  123. }
  124. async function convertDocxFile(inputPath, includeImages, imageResolver) {
  125. const result = await mammoth.convertToHtml(
  126. { path: inputPath },
  127. { convertImage: buildMammothImageConverter(includeImages, imageResolver) }
  128. );
  129. const html = cleanHtml(result.value, includeImages);
  130. const { html: htmlWithoutTables, placeholders } = preserveTables(html);
  131. let markdown = htmlToMarkdown(htmlWithoutTables);
  132. markdown = restoreTables(markdown, placeholders);
  133. return normalizeGeneratedMarkdown(includeImages ? markdown : stripMarkdownImages(markdown));
  134. }
  135. function buildMammothImageConverter(includeImages, imageResolver) {
  136. return mammoth.images.imgElement(async (image) => {
  137. if (!includeImages) {
  138. return { src: '' };
  139. }
  140. if (imageResolver) {
  141. const buffer = await image.readAsBuffer();
  142. const src = await imageResolver({ buffer, mime: image.contentType, sourceName: 'docx-image' });
  143. return { src: src || '' };
  144. }
  145. const base64 = await image.readAsBase64String();
  146. return { src: `data:${image.contentType};base64,${base64}` };
  147. });
  148. }
  149. function cleanHtml(html, includeImages) {
  150. const $ = cheerio.load(html, { decodeEntities: false });
  151. $('a').each((_, element) => {
  152. const anchor = $(element);
  153. const href = anchor.attr('href') || '';
  154. if (!anchor.text().trim() && anchor.find('img').length === 0) {
  155. anchor.remove();
  156. return;
  157. }
  158. if (href.startsWith('#')) {
  159. anchor.replaceWith(anchor.contents());
  160. }
  161. });
  162. $('img').each((_, element) => {
  163. const image = $(element);
  164. if (!image.attr('src')) {
  165. image.remove();
  166. }
  167. });
  168. if (!includeImages) {
  169. $('img').remove();
  170. }
  171. return $.root().html() || '';
  172. }
  173. function htmlToMarkdown(html) {
  174. const turndownService = new TurndownService({
  175. bulletListMarker: '-',
  176. codeBlockStyle: 'fenced',
  177. headingStyle: 'atx',
  178. });
  179. turndownService.use(gfm);
  180. return turndownService.turndown(html);
  181. }
  182. function preserveTables(html) {
  183. const $ = cheerio.load(html, { decodeEntities: false });
  184. const placeholders = new Map();
  185. $('table').each((index, element) => {
  186. const placeholder = `TABLEPLACEHOLDER${String(index + 1).padStart(4, '0')}`;
  187. placeholders.set(placeholder, $.html(element));
  188. $(element).replaceWith(placeholder);
  189. });
  190. return { html: $.root().html() || '', placeholders };
  191. }
  192. function restoreTables(markdown, placeholders) {
  193. let restored = markdown;
  194. for (const [placeholder, tableHtml] of placeholders.entries()) {
  195. restored = restored.replaceAll(placeholder, `\n\n${tableHtml}\n\n`);
  196. }
  197. return restored;
  198. }
  199. async function convertPdfFile(inputPath, includeImages, imageResolver) {
  200. const buffer = await readFile(inputPath);
  201. const parser = new PDFParse({ data: buffer });
  202. try {
  203. const textResult = await parser.getText({ parseHyperlinks: true });
  204. const tableResult = await safePdfCall(() => parser.getTable());
  205. const pdfJsTableResult = await safePdfCall(() => extractPdfJsTables(buffer));
  206. const imageResult = includeImages ? await safePdfCall(() => parser.getImage()) : null;
  207. const markdown = await renderPdfMarkdown(textResult, tableResult, pdfJsTableResult, imageResult, includeImages, imageResolver);
  208. if (!hasInformativeText(markdown)) {
  209. throw new ConversionError('pdf_text_layer_missing', 'PDF 未检测到可选中文字层', {
  210. inputPath,
  211. });
  212. }
  213. return normalizeGeneratedMarkdown(markdown);
  214. } finally {
  215. await parser.destroy();
  216. }
  217. }
  218. async function safePdfCall(callback) {
  219. try {
  220. return await callback();
  221. } catch {
  222. return null;
  223. }
  224. }
  225. async function extractPdfJsTables(buffer) {
  226. const loadingTask = getDocument({
  227. data: new Uint8Array(buffer),
  228. disableWorker: true,
  229. });
  230. const document = await loadingTask.promise;
  231. try {
  232. const pages = [];
  233. for (let pageNumber = 1; pageNumber <= document.numPages; pageNumber += 1) {
  234. const page = await document.getPage(pageNumber);
  235. const [textContent, operatorList] = await Promise.all([
  236. page.getTextContent(),
  237. page.getOperatorList(),
  238. ]);
  239. const textItems = normalizePdfJsTextItems(textContent.items || []);
  240. const rectangles = extractPdfJsRectangles(operatorList);
  241. pages.push({ tables: buildPdfJsTables(rectangles, textItems) });
  242. }
  243. return { pages };
  244. } finally {
  245. await document.destroy();
  246. }
  247. }
  248. function normalizePdfJsTextItems(items) {
  249. const normalized = items
  250. .map((item) => {
  251. const text = collapsePdfWhitespace(item.str || '');
  252. if (!text) {
  253. return null;
  254. }
  255. const transform = Array.isArray(item.transform) ? item.transform : [];
  256. const x = Number(transform[4] || 0);
  257. const y = Number(transform[5] || 0);
  258. const width = Number(item.width || 0);
  259. const height = Number(item.height || Math.abs(transform[3] || 0));
  260. return {
  261. text,
  262. x,
  263. y,
  264. width,
  265. height,
  266. centerX: x + width / 2,
  267. centerY: y + height / 2,
  268. };
  269. })
  270. .filter(Boolean)
  271. .sort((left, right) => right.y - left.y || left.x - right.x);
  272. const kept = [];
  273. for (const item of normalized) {
  274. const duplicate = kept.some((previous) => previous.text === item.text
  275. && Math.abs(previous.x - item.x) <= PDF_TEXT_DUPLICATE_TOLERANCE
  276. && Math.abs(previous.y - item.y) <= PDF_TEXT_DUPLICATE_TOLERANCE);
  277. if (!duplicate) {
  278. kept.push(item);
  279. }
  280. }
  281. return kept;
  282. }
  283. function extractPdfJsRectangles(operatorList) {
  284. const rectangles = [];
  285. const stack = [];
  286. let matrix = [1, 0, 0, 1, 0, 0];
  287. for (let index = 0; index < operatorList.fnArray.length; index += 1) {
  288. const name = PDF_OP_NAMES[operatorList.fnArray[index]];
  289. const args = operatorList.argsArray[index];
  290. if (name === 'save') {
  291. stack.push(matrix.slice());
  292. } else if (name === 'restore') {
  293. matrix = stack.pop() || [1, 0, 0, 1, 0, 0];
  294. } else if (name === 'transform') {
  295. matrix = multiplyPdfMatrix(matrix, args || [1, 0, 0, 1, 0, 0]);
  296. } else if (name === 'constructPath') {
  297. rectangles.push(...extractPdfJsPathRectangles(args?.[1] || [], matrix));
  298. }
  299. }
  300. return rectangles;
  301. }
  302. function multiplyPdfMatrix(left, right) {
  303. return [
  304. left[0] * right[0] + left[2] * right[1],
  305. left[1] * right[0] + left[3] * right[1],
  306. left[0] * right[2] + left[2] * right[3],
  307. left[1] * right[2] + left[3] * right[3],
  308. left[0] * right[4] + left[2] * right[5] + left[4],
  309. left[1] * right[4] + left[3] * right[5] + left[5],
  310. ];
  311. }
  312. function transformPdfPoint(matrix, x, y) {
  313. return {
  314. x: matrix[0] * x + matrix[2] * y + matrix[4],
  315. y: matrix[1] * x + matrix[3] * y + matrix[5],
  316. };
  317. }
  318. function extractPdfJsPathRectangles(paths, matrix) {
  319. const rectangles = [];
  320. for (const pathChunk of paths) {
  321. const points = [];
  322. for (let index = 0; index < pathChunk.length;) {
  323. const op = pathChunk[index];
  324. index += 1;
  325. if (op === 0 || op === 1) {
  326. points.push(transformPdfPoint(matrix, pathChunk[index], pathChunk[index + 1]));
  327. index += 2;
  328. } else if (op === 3) {
  329. // closePath has no coordinates.
  330. } else {
  331. break;
  332. }
  333. }
  334. if (points.length < 2) {
  335. continue;
  336. }
  337. const xValues = points.map((point) => point.x);
  338. const yValues = points.map((point) => point.y);
  339. const x1 = Math.min(...xValues);
  340. const x2 = Math.max(...xValues);
  341. const y1 = Math.min(...yValues);
  342. const y2 = Math.max(...yValues);
  343. rectangles.push({
  344. x1,
  345. x2,
  346. y1,
  347. y2,
  348. width: x2 - x1,
  349. height: y2 - y1,
  350. });
  351. }
  352. return rectangles;
  353. }
  354. function buildPdfJsTables(rectangles, textItems) {
  355. const horizontalLines = [];
  356. const verticalLines = [];
  357. for (const rectangle of rectangles) {
  358. if (rectangle.width >= PDF_GRID_MIN_LINE_LENGTH && rectangle.height <= PDF_POINT_TOLERANCE) {
  359. horizontalLines.push({
  360. x1: rectangle.x1,
  361. x2: rectangle.x2,
  362. y: (rectangle.y1 + rectangle.y2) / 2,
  363. });
  364. }
  365. if (rectangle.height >= PDF_GRID_MIN_LINE_LENGTH && rectangle.width <= PDF_POINT_TOLERANCE) {
  366. verticalLines.push({
  367. x: (rectangle.x1 + rectangle.x2) / 2,
  368. y1: rectangle.y1,
  369. y2: rectangle.y2,
  370. });
  371. }
  372. }
  373. return buildPdfJsLineComponents(horizontalLines, verticalLines)
  374. .map((component) => buildPdfJsTableFromComponent(component, textItems))
  375. .filter(Boolean)
  376. .sort((left, right) => right.bounds.y2 - left.bounds.y2 || left.bounds.x1 - right.bounds.x1)
  377. .map((table) => table.rows);
  378. }
  379. function buildPdfJsLineComponents(horizontalLines, verticalLines) {
  380. const lines = [
  381. ...horizontalLines.map((line) => ({ kind: 'h', line })),
  382. ...verticalLines.map((line) => ({ kind: 'v', line })),
  383. ];
  384. const parent = lines.map((_, index) => index);
  385. for (let hIndex = 0; hIndex < horizontalLines.length; hIndex += 1) {
  386. for (let vIndex = 0; vIndex < verticalLines.length; vIndex += 1) {
  387. if (pdfJsLinesIntersect(horizontalLines[hIndex], verticalLines[vIndex])) {
  388. union(parent, hIndex, horizontalLines.length + vIndex);
  389. }
  390. }
  391. }
  392. const groups = new Map();
  393. for (let index = 0; index < lines.length; index += 1) {
  394. const root = find(parent, index);
  395. if (!groups.has(root)) {
  396. groups.set(root, { horizontalLines: [], verticalLines: [] });
  397. }
  398. const group = groups.get(root);
  399. if (lines[index].kind === 'h') {
  400. group.horizontalLines.push(lines[index].line);
  401. } else {
  402. group.verticalLines.push(lines[index].line);
  403. }
  404. }
  405. return [...groups.values()];
  406. }
  407. function pdfJsLinesIntersect(horizontalLine, verticalLine) {
  408. return verticalLine.x >= horizontalLine.x1 - PDF_POINT_TOLERANCE
  409. && verticalLine.x <= horizontalLine.x2 + PDF_POINT_TOLERANCE
  410. && horizontalLine.y >= verticalLine.y1 - PDF_POINT_TOLERANCE
  411. && horizontalLine.y <= verticalLine.y2 + PDF_POINT_TOLERANCE;
  412. }
  413. function find(parent, index) {
  414. if (parent[index] !== index) {
  415. parent[index] = find(parent, parent[index]);
  416. }
  417. return parent[index];
  418. }
  419. function union(parent, left, right) {
  420. const leftRoot = find(parent, left);
  421. const rightRoot = find(parent, right);
  422. if (leftRoot !== rightRoot) {
  423. parent[rightRoot] = leftRoot;
  424. }
  425. }
  426. function buildPdfJsTableFromComponent(component, textItems) {
  427. const xCoordinates = clusterPdfCoordinates(component.verticalLines.map((line) => line.x)).sort((left, right) => left - right);
  428. const yCoordinates = clusterPdfCoordinates(component.horizontalLines.map((line) => line.y)).sort((left, right) => right - left);
  429. if (xCoordinates.length < 2 || yCoordinates.length < 2) {
  430. return null;
  431. }
  432. const bounds = {
  433. x1: xCoordinates[0],
  434. x2: xCoordinates[xCoordinates.length - 1],
  435. y1: yCoordinates[yCoordinates.length - 1],
  436. y2: yCoordinates[0],
  437. };
  438. if (bounds.x2 - bounds.x1 < PDF_GRID_MIN_WIDTH || bounds.y2 - bounds.y1 < PDF_GRID_MIN_HEIGHT) {
  439. return null;
  440. }
  441. const rows = [];
  442. for (let rowIndex = 0; rowIndex < yCoordinates.length - 1; rowIndex += 1) {
  443. const top = yCoordinates[rowIndex];
  444. const bottom = yCoordinates[rowIndex + 1];
  445. const row = [];
  446. for (let columnIndex = 0; columnIndex < xCoordinates.length - 1; columnIndex += 1) {
  447. const left = xCoordinates[columnIndex];
  448. const right = xCoordinates[columnIndex + 1];
  449. const cellItems = textItems.filter((item) => item.centerX >= left - PDF_POINT_TOLERANCE
  450. && item.centerX <= right + PDF_POINT_TOLERANCE
  451. && item.centerY <= top + PDF_POINT_TOLERANCE
  452. && item.centerY >= bottom - PDF_POINT_TOLERANCE);
  453. row.push(renderPdfJsCellText(cellItems));
  454. }
  455. rows.push(row);
  456. }
  457. const nonEmptyCells = rows.flat().filter(Boolean).length;
  458. if (nonEmptyCells < 2) {
  459. return null;
  460. }
  461. return { bounds, rows };
  462. }
  463. function clusterPdfCoordinates(values) {
  464. const sortedValues = [...values].sort((left, right) => left - right);
  465. const clusters = [];
  466. for (const value of sortedValues) {
  467. const previous = clusters[clusters.length - 1];
  468. if (previous && Math.abs(previous.values[previous.values.length - 1] - value) <= PDF_POINT_TOLERANCE) {
  469. previous.values.push(value);
  470. } else {
  471. clusters.push({ values: [value] });
  472. }
  473. }
  474. return clusters.map((cluster) => cluster.values.reduce((sum, value) => sum + value, 0) / cluster.values.length);
  475. }
  476. function renderPdfJsCellText(items) {
  477. if (items.length === 0) {
  478. return '';
  479. }
  480. const lines = [];
  481. for (const item of [...items].sort((left, right) => right.y - left.y || left.x - right.x)) {
  482. let line = lines.find((candidate) => Math.abs(candidate.y - item.y) <= PDF_TEXT_LINE_TOLERANCE);
  483. if (!line) {
  484. line = { y: item.y, items: [] };
  485. lines.push(line);
  486. }
  487. line.items.push(item);
  488. }
  489. return lines
  490. .sort((left, right) => right.y - left.y)
  491. .map((line) => collapsePdfWhitespace(line.items
  492. .sort((left, right) => left.x - right.x)
  493. .map((item) => item.text)
  494. .join('')))
  495. .filter(Boolean)
  496. .map((line) => line.replace(/\|/g, '\\|'))
  497. .join('<br>');
  498. }
  499. async function renderPdfMarkdown(textResult, tableResult, pdfJsTableResult, imageResult, includeImages, imageResolver) {
  500. const textPages = Array.isArray(textResult?.pages) ? textResult.pages : [];
  501. const tablePages = Array.isArray(tableResult?.pages) ? tableResult.pages : [];
  502. const pdfJsTablePages = Array.isArray(pdfJsTableResult?.pages) ? pdfJsTableResult.pages : [];
  503. const imagePages = Array.isArray(imageResult?.pages) ? imageResult.pages : [];
  504. const pageCount = Math.max(textPages.length, tablePages.length, pdfJsTablePages.length, imagePages.length, 1);
  505. const parts = [];
  506. for (let index = 0; index < pageCount; index += 1) {
  507. const pageParts = [];
  508. const pageText = normalizePdfPlainText(textPages[index]?.text || '');
  509. const tables = [
  510. ...(tablePages[index]?.tables || []),
  511. ...(pdfJsTablePages[index]?.tables || []),
  512. ];
  513. const tableMarkdownList = [];
  514. for (const table of tables) {
  515. const tableMarkdown = renderMarkdownTable(table);
  516. if (tableMarkdown && !hasSimilarPdfTable(tableMarkdownList, tableMarkdown)) {
  517. tableMarkdownList.push(tableMarkdown);
  518. }
  519. }
  520. const dedupedText = removePdfTableDuplicateText(pageText, tableMarkdownList);
  521. if (dedupedText) {
  522. pageParts.push(dedupedText);
  523. }
  524. pageParts.push(...tableMarkdownList);
  525. if (includeImages) {
  526. const images = imagePages[index]?.images || [];
  527. for (let imageIndex = 0; imageIndex < images.length; imageIndex += 1) {
  528. const dataUrl = images[imageIndex]?.dataUrl;
  529. if (dataUrl) {
  530. const assetUrl = imageResolver ? await resolveDataUrlImage(dataUrl, imageResolver, `pdf-page-${index + 1}-image-${imageIndex + 1}`) : null;
  531. pageParts.push(`![Page ${index + 1} Image ${imageIndex + 1}](${assetUrl || dataUrl})`);
  532. }
  533. }
  534. }
  535. if (pageParts.length > 0) {
  536. parts.push(pageParts.join('\n\n'));
  537. }
  538. }
  539. if (parts.length === 0 && textResult?.text) {
  540. return normalizePdfPlainText(textResult.text);
  541. }
  542. return parts.join('\n\n');
  543. }
  544. function removePdfTableDuplicateText(pageText, tableMarkdownList) {
  545. if (!pageText || tableMarkdownList.length === 0) {
  546. return pageText;
  547. }
  548. const tableProbe = compactForDedup(tableMarkdownList.join('\n'));
  549. const keptLines = [];
  550. for (const line of pageText.split('\n')) {
  551. const trimmed = collapsePdfWhitespace(line);
  552. if (!trimmed) {
  553. continue;
  554. }
  555. const probe = compactForDedup(trimmed);
  556. if (!probe) {
  557. continue;
  558. }
  559. const collapsedProbe = collapseRepeatedPdfProbe(probe);
  560. if (isPdfLineCoveredByTables(probe, tableProbe)
  561. || (collapsedProbe !== probe && isPdfLineCoveredByTables(collapsedProbe, tableProbe))) {
  562. continue;
  563. }
  564. keptLines.push(trimmed);
  565. }
  566. return keptLines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
  567. }
  568. function hasSimilarPdfTable(existingTables, nextTable) {
  569. const nextProbe = compactForDedup(nextTable);
  570. if (!nextProbe) {
  571. return true;
  572. }
  573. return existingTables.some((table) => {
  574. const probe = compactForDedup(table);
  575. if (!probe) {
  576. return false;
  577. }
  578. return probe.includes(nextProbe) || nextProbe.includes(probe);
  579. });
  580. }
  581. function isPdfLineCoveredByTables(lineProbe, tableProbe) {
  582. if (lineProbe.length <= 2) {
  583. return tableProbe.includes(lineProbe);
  584. }
  585. if (tableProbe.includes(lineProbe)) {
  586. return true;
  587. }
  588. if (lineProbe.length < 8) {
  589. return false;
  590. }
  591. const chunkSize = Math.min(12, Math.max(6, Math.floor(lineProbe.length / 2)));
  592. let covered = 0;
  593. let total = 0;
  594. for (let index = 0; index < lineProbe.length; index += chunkSize) {
  595. const chunk = lineProbe.slice(index, index + chunkSize);
  596. if (chunk.length < 4) {
  597. continue;
  598. }
  599. total += 1;
  600. if (tableProbe.includes(chunk)) {
  601. covered += 1;
  602. }
  603. }
  604. return total > 0 && covered / total >= 0.75;
  605. }
  606. function compactForDedup(text) {
  607. return String(text || '')
  608. .replace(/<br\s*\/?\s*>/gi, '')
  609. .replace(/[|\\`*_#\-\s,。;:!?、,.!:;?%()()\[\]{}《》<>]/g, '')
  610. .toLowerCase();
  611. }
  612. function renderMarkdownTable(table) {
  613. const rows = normalizePdfTableRows(table);
  614. if (rows.length === 0) {
  615. return '';
  616. }
  617. const width = Math.max(...rows.map((row) => row.length));
  618. const normalizedRows = rows.map((row) => row.concat(Array(Math.max(0, width - row.length)).fill('')));
  619. const header = normalizedRows[0];
  620. const body = normalizedRows.slice(1);
  621. const lines = [renderMarkdownTableRow(header), renderMarkdownTableRow(Array(width).fill('---'))];
  622. for (const row of body) {
  623. lines.push(renderMarkdownTableRow(row));
  624. }
  625. return lines.join('\n');
  626. }
  627. function normalizePdfTableRows(table) {
  628. if (!Array.isArray(table)) {
  629. return [];
  630. }
  631. return table
  632. .map((row) => (Array.isArray(row) ? row : []))
  633. .map((row) => row.map((cell) => normalizeTableCell(cell)))
  634. .filter((row) => row.some((cell) => cell));
  635. }
  636. function normalizeTableCell(value) {
  637. if (value === null || value === undefined) {
  638. return '';
  639. }
  640. return collapsePdfWhitespace(String(value).replace(/\n+/g, '<br>')).replace(/\|/g, '\\|');
  641. }
  642. function renderMarkdownTableRow(row) {
  643. return `|${row.join('|')}|`;
  644. }
  645. export async function withLegacyWordDocxFile(inputPath, callback) {
  646. const soffice = await findLibreOfficeCommand();
  647. if (!soffice) {
  648. throw new ConversionError('office_backend_missing', '未找到 LibreOffice,无法转换 DOC/WPS 文件', {
  649. inputPath,
  650. });
  651. }
  652. const tempDir = await mkdtemp(path.join(os.tmpdir(), 'doc2md-node-'));
  653. try {
  654. const legacyInput = path.join(tempDir, `${safeStem(inputPath)}${await getLegacyConversionSuffix(inputPath)}`);
  655. await copyFile(inputPath, legacyInput);
  656. await runLibreOfficeConvert(soffice, legacyInput, tempDir);
  657. const files = await readdir(tempDir);
  658. const docxName = files.find((file) => path.extname(file).toLowerCase() === '.docx');
  659. if (!docxName) {
  660. throw new ConversionError('office_conversion_failed', 'LibreOffice 未生成 DOCX 文件', {
  661. inputPath,
  662. });
  663. }
  664. return await callback(path.join(tempDir, docxName), tempDir);
  665. } finally {
  666. await rm(tempDir, { recursive: true, force: true });
  667. }
  668. }
  669. async function convertLegacyWordFile(inputPath, includeImages, imageResolver) {
  670. return withLegacyWordDocxFile(inputPath, (docxPath) => convertDocxFile(docxPath, includeImages, imageResolver));
  671. }
  672. async function getLegacyConversionSuffix(inputPath) {
  673. const suffix = path.extname(inputPath).toLowerCase();
  674. const header = await readFileHeader(inputPath, 8);
  675. if (DOCX_SUFFIXES.has(suffix) && isOleCompoundHeader(header)) {
  676. return '.doc';
  677. }
  678. return suffix || '.doc';
  679. }
  680. async function findLibreOfficeCommand() {
  681. const candidates = [
  682. process.env.LIBREOFFICE_PATH,
  683. 'soffice',
  684. 'libreoffice',
  685. 'C:\\Program Files\\LibreOffice\\program\\soffice.exe',
  686. 'C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe',
  687. ].filter(Boolean);
  688. for (const candidate of candidates) {
  689. if (path.isAbsolute(candidate)) {
  690. if (existsSync(candidate)) {
  691. return candidate;
  692. }
  693. continue;
  694. }
  695. if (await canRunCommand(candidate, ['--version'])) {
  696. return candidate;
  697. }
  698. }
  699. return null;
  700. }
  701. async function canRunCommand(command, args) {
  702. try {
  703. await runProcess(command, args, { timeoutMs: 10000 });
  704. return true;
  705. } catch {
  706. return false;
  707. }
  708. }
  709. async function runLibreOfficeConvert(soffice, inputPath, outputDir) {
  710. const profileDir = await mkdtemp(path.join(os.tmpdir(), 'doc2md-lo-profile-'));
  711. try {
  712. const profileUri = pathToFileUri(profileDir);
  713. const args = [
  714. '--headless',
  715. '--nologo',
  716. '--nolockcheck',
  717. '--nodefault',
  718. '--nofirststartwizard',
  719. `-env:UserInstallation=${profileUri}`,
  720. '--convert-to',
  721. 'docx',
  722. '--outdir',
  723. outputDir,
  724. inputPath,
  725. ];
  726. await runProcess(soffice, args, { timeoutMs: 180000 });
  727. } finally {
  728. await rm(profileDir, { recursive: true, force: true });
  729. }
  730. }
  731. function runProcess(command, args, options = {}) {
  732. return new Promise((resolve, reject) => {
  733. const child = spawn(command, args, { windowsHide: true });
  734. let stdout = '';
  735. let stderr = '';
  736. let settled = false;
  737. const timeout = options.timeoutMs
  738. ? setTimeout(() => {
  739. if (settled) {
  740. return;
  741. }
  742. settled = true;
  743. child.kill('SIGTERM');
  744. reject(new Error(`命令执行超时: ${command}`));
  745. }, options.timeoutMs)
  746. : null;
  747. child.stdout?.on('data', (chunk) => {
  748. stdout += chunk.toString('utf8');
  749. });
  750. child.stderr?.on('data', (chunk) => {
  751. stderr += chunk.toString('utf8');
  752. });
  753. child.on('error', (error) => {
  754. if (settled) {
  755. return;
  756. }
  757. settled = true;
  758. if (timeout) {
  759. clearTimeout(timeout);
  760. }
  761. reject(error);
  762. });
  763. child.on('close', (code) => {
  764. if (settled) {
  765. return;
  766. }
  767. settled = true;
  768. if (timeout) {
  769. clearTimeout(timeout);
  770. }
  771. if (code === 0) {
  772. resolve({ stdout, stderr });
  773. } else {
  774. reject(new Error(`命令执行失败(${code}): ${command}\n${stderr || stdout}`));
  775. }
  776. });
  777. });
  778. }
  779. function normalizePdfPlainText(text) {
  780. const lines = normalizeNewlinesOnly(String(text || '').replace(/\f/g, '\n\n'))
  781. .split('\n')
  782. .map((line) => collapseRepeatedPdfText(collapsePdfWhitespace(line)));
  783. return lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
  784. }
  785. function collapseRepeatedPdfText(text) {
  786. return collapseRepeatedPdfChunks(text, 2, true)
  787. .replace(/([\u4e00-\u9fff][\u4e00-\u9fffA-Za-z0-9()()::、,,。..\-]{1,29})\s+\1/g, '$1')
  788. .replace(/^([一二三四五六七八九十])\1(?=[.、.])/u, '$1');
  789. }
  790. function collapseRepeatedPdfProbe(text) {
  791. return collapseRepeatedPdfChunks(text, 1, false);
  792. }
  793. function collapseRepeatedPdfChunks(text, minLength, requireCjk) {
  794. let result = text;
  795. let changed = true;
  796. let rounds = 0;
  797. while (changed && rounds < 4) {
  798. changed = false;
  799. rounds += 1;
  800. const maxLength = Math.min(30, Math.floor(result.length / 2));
  801. for (let length = minLength; length <= maxLength; length += 1) {
  802. for (let index = 0; index + length * 2 <= result.length; index += 1) {
  803. const chunk = result.slice(index, index + length);
  804. if (!chunk.trim() || (requireCjk && !/[\u4e00-\u9fff]/.test(chunk))) {
  805. continue;
  806. }
  807. if (chunk === result.slice(index + length, index + length * 2)) {
  808. result = `${result.slice(0, index + length)}${result.slice(index + length * 2)}`;
  809. changed = true;
  810. break;
  811. }
  812. }
  813. if (changed) {
  814. break;
  815. }
  816. }
  817. }
  818. return result;
  819. }
  820. function collapsePdfWhitespace(text) {
  821. return String(text || '')
  822. .replace(/[ \t]+/g, ' ')
  823. .trim()
  824. .replace(/(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])/g, '')
  825. .replace(/\s+([,.;:!?%])/g, '$1')
  826. .replace(/\s+([,。;:!?、%])/g, '$1');
  827. }
  828. function hasInformativeText(markdown) {
  829. const probe = stripMarkdownImages(markdown);
  830. return /[A-Za-z0-9\u4e00-\u9fff]{10,}/.test(probe.replace(/\s+/g, ''));
  831. }
  832. function normalizeGeneratedMarkdown(markdown) {
  833. const normalized = normalizeNewlinesOnly(markdown).replace(/\n{3,}/g, '\n\n').trim();
  834. return ensureTrailingNewline(normalized);
  835. }
  836. function normalizeNewlinesOnly(text) {
  837. return String(text || '').replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  838. }
  839. function ensureTrailingNewline(text) {
  840. return text.endsWith('\n') ? text : `${text}\n`;
  841. }
  842. function stripMarkdownImages(text) {
  843. return String(text || '')
  844. .replace(MARKDOWN_IMAGE_PATTERN, '')
  845. .replace(/<img\b[^>]*>/gi, '')
  846. .replace(/\n{3,}/g, '\n\n');
  847. }
  848. async function inlineLocalMarkdownImages(text, baseDir, imageResolver) {
  849. const replacements = [];
  850. for (const match of text.matchAll(MARKDOWN_IMAGE_PATTERN)) {
  851. replacements.push(replaceMarkdownImage(match, baseDir, imageResolver));
  852. }
  853. let result = text;
  854. for (const replacement of await Promise.all(replacements)) {
  855. if (replacement) {
  856. result = result.replace(replacement.original, replacement.next);
  857. }
  858. }
  859. return result;
  860. }
  861. async function replaceMarkdownImage(match, baseDir, imageResolver) {
  862. const target = match.groups?.target || '';
  863. const cleanTarget = target.startsWith('<') && target.endsWith('>') ? target.slice(1, -1) : target;
  864. if (isRemoteOrDataUrl(cleanTarget)) {
  865. return null;
  866. }
  867. const localPath = resolveLocalPath(baseDir, cleanTarget);
  868. if (!localPath) {
  869. return null;
  870. }
  871. const dataUri = imageResolver
  872. ? await pathToAssetUri(localPath, imageResolver)
  873. : await pathToDataUri(localPath);
  874. if (!dataUri) {
  875. return null;
  876. }
  877. const alt = match.groups?.alt || '';
  878. const title = match.groups?.title || '';
  879. return { original: match[0], next: `![${alt}](${dataUri}${title})` };
  880. }
  881. function isRemoteOrDataUrl(value) {
  882. return /^(https?:|data:)/i.test(value);
  883. }
  884. function resolveLocalPath(baseDir, target) {
  885. let decodedTarget = target;
  886. try {
  887. decodedTarget = decodeURIComponent(target);
  888. } catch {
  889. decodedTarget = target;
  890. }
  891. if (path.isAbsolute(decodedTarget)) {
  892. return null;
  893. }
  894. const resolvedBaseDir = path.resolve(baseDir);
  895. const candidate = path.resolve(resolvedBaseDir, decodedTarget);
  896. const relative = path.relative(resolvedBaseDir, candidate);
  897. if (relative && (relative.startsWith('..') || path.isAbsolute(relative))) {
  898. return null;
  899. }
  900. return existsSync(candidate) ? candidate : null;
  901. }
  902. async function pathToAssetUri(inputPath, imageResolver) {
  903. const mimeType = lookupMimeType(inputPath) || 'application/octet-stream';
  904. const buffer = await readFile(inputPath);
  905. return imageResolver({ buffer, mime: mimeType, sourceName: inputPath });
  906. }
  907. async function pathToDataUri(inputPath) {
  908. const mimeType = lookupMimeType(inputPath) || 'application/octet-stream';
  909. const data = await readFile(inputPath);
  910. return `data:${mimeType};base64,${data.toString('base64')}`;
  911. }
  912. async function resolveDataUrlImage(dataUrl, imageResolver, sourceName) {
  913. const match = /^data:([^;,]+);base64,(.+)$/i.exec(String(dataUrl || ''));
  914. if (!match) return null;
  915. return imageResolver({ buffer: Buffer.from(match[2], 'base64'), mime: match[1], sourceName });
  916. }
  917. function safeStem(inputPath) {
  918. const stem = path.basename(inputPath, path.extname(inputPath));
  919. return stem.replace(/[^A-Za-z0-9._-]+/g, '_') || 'upload';
  920. }
  921. function pathToFileUri(value) {
  922. return pathToFileURL(path.resolve(value)).href;
  923. }