conversion.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827
  1. from __future__ import annotations
  2. import base64
  3. import mimetypes
  4. import os
  5. import re
  6. import shutil
  7. import subprocess
  8. import tempfile
  9. import zipfile
  10. from pathlib import Path
  11. from urllib.parse import urlparse
  12. import mammoth
  13. import olefile
  14. from bs4 import BeautifulSoup
  15. from charset_normalizer import from_bytes
  16. from markdownify import markdownify as html_to_markdown
  17. from .config import settings
  18. from .errors import ConversionError
  19. MARKDOWN_SUFFIXES = {".md", ".markdown"}
  20. MARKDOWN_IMAGE_PATTERN = re.compile(
  21. r'!\[(?P<alt>[^\]]*)\]\((?P<target><[^>]+>|[^)\s]+)(?P<title>\s+"[^"]*")?\)',
  22. re.IGNORECASE,
  23. )
  24. HTML_IMAGE_PATTERN = re.compile(
  25. r"<img\b(?P<before>[^>]*?)src=[\"'](?P<src>[^\"']+)[\"'](?P<after>[^>]*)>",
  26. re.IGNORECASE,
  27. )
  28. def convert_path_to_markdown(
  29. input_path_str: str,
  30. include_images: bool,
  31. original_name: str | None = None,
  32. ) -> str:
  33. input_path = Path(input_path_str)
  34. detected_format = detect_file_format(input_path, original_name)
  35. if detected_format == "markdown":
  36. return convert_markdown_file(input_path, include_images)
  37. if detected_format == "docx":
  38. return convert_docx_file(input_path, include_images)
  39. if detected_format == "pdf":
  40. return convert_pdf_file(input_path, include_images)
  41. if detected_format == "ole_word_compatible":
  42. return convert_ole_word_compatible_file(input_path, include_images)
  43. raise ConversionError(
  44. code="unsupported_format",
  45. message="Unsupported input format",
  46. status_code=400,
  47. details={
  48. "filename": original_name or input_path.name,
  49. "detected_format": detected_format,
  50. },
  51. )
  52. def detect_file_format(input_path: Path, original_name: str | None = None) -> str:
  53. suffix = (
  54. Path(original_name).suffix if original_name else input_path.suffix
  55. ).lower()
  56. with input_path.open("rb") as file_obj:
  57. header = file_obj.read(8)
  58. if suffix in MARKDOWN_SUFFIXES:
  59. return "markdown"
  60. if header.startswith(b"%PDF-"):
  61. return "pdf"
  62. if header.startswith(b"PK\x03\x04") and is_docx_package(input_path):
  63. return "docx"
  64. if header == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" and is_word_compatible_ole(
  65. input_path
  66. ):
  67. return "ole_word_compatible"
  68. if suffix in MARKDOWN_SUFFIXES:
  69. return "markdown"
  70. return "unknown"
  71. def is_docx_package(input_path: Path) -> bool:
  72. try:
  73. with zipfile.ZipFile(input_path) as archive:
  74. names = set(archive.namelist())
  75. except zipfile.BadZipFile:
  76. return False
  77. return "[Content_Types].xml" in names and "word/document.xml" in names
  78. def is_word_compatible_ole(input_path: Path) -> bool:
  79. try:
  80. with olefile.OleFileIO(str(input_path)) as ole:
  81. streams = {"/".join(item) for item in ole.listdir()}
  82. except OSError:
  83. return False
  84. return "WordDocument" in streams
  85. def convert_markdown_file(input_path: Path, include_images: bool) -> str:
  86. raw = input_path.read_bytes()
  87. match = from_bytes(raw).best()
  88. if match is None:
  89. raise ConversionError(
  90. code="invalid_markdown",
  91. message="Unable to detect Markdown encoding",
  92. status_code=400,
  93. details={"filename": input_path.name},
  94. )
  95. text = str(match)
  96. text = normalize_newlines_only(text)
  97. if include_images:
  98. text = inline_local_markdown_images(text, input_path.parent)
  99. else:
  100. text = strip_markdown_images(text)
  101. return ensure_trailing_newline(text)
  102. def convert_docx_file(input_path: Path, include_images: bool) -> str:
  103. with input_path.open("rb") as docx_file:
  104. result = mammoth.convert_to_html(
  105. docx_file,
  106. convert_image=build_image_converter(include_images),
  107. )
  108. html = clean_html(result.value, include_images)
  109. html_without_tables, placeholders = preserve_tables(html)
  110. markdown = html_fragment_to_markdown(html_without_tables)
  111. markdown = restore_tables(markdown, placeholders)
  112. markdown = normalize_generated_markdown(markdown)
  113. if not include_images:
  114. markdown = strip_markdown_images(markdown)
  115. return markdown
  116. def convert_pdf_file(input_path: Path, include_images: bool) -> str:
  117. if not pdf_has_text_layer(input_path):
  118. raise ConversionError(
  119. code="pdf_text_layer_missing",
  120. message="PDF does not contain a selectable text layer",
  121. status_code=422,
  122. details={"filename": input_path.name},
  123. )
  124. attempts: list[dict[str, object]] = []
  125. errors: list[str] = []
  126. strategies = [
  127. ("pymupdf4llm", convert_pdf_with_pymupdf4llm),
  128. ("pymupdf_blocks", convert_pdf_with_pymupdf_blocks),
  129. ("pdfplumber", convert_pdf_with_pdfplumber),
  130. ("pdfminer", convert_pdf_with_pdfminer),
  131. ("pypdf", convert_pdf_with_pypdf),
  132. ]
  133. for name, strategy in strategies:
  134. try:
  135. markdown = strategy(input_path, include_images)
  136. if not markdown.strip():
  137. raise ValueError("empty markdown output")
  138. score = score_pdf_markdown(markdown)
  139. attempts.append({"name": name, "score": score, "markdown": markdown})
  140. if is_pdf_markdown_acceptable(markdown, score):
  141. return markdown
  142. except Exception as exc: # noqa: BLE001
  143. errors.append(f"{name}: {exc}")
  144. if attempts:
  145. best_attempt = max(attempts, key=lambda item: int(item["score"]))
  146. return str(best_attempt["markdown"])
  147. raise ConversionError(
  148. code="pdf_text_extraction_failed",
  149. message="All non-OCR PDF extraction strategies failed",
  150. status_code=422,
  151. details={"filename": input_path.name, "errors": errors},
  152. )
  153. def pdf_has_text_layer(input_path: Path, max_pages_to_check: int = 5) -> bool:
  154. import fitz
  155. document = fitz.open(input_path)
  156. try:
  157. page_limit = min(document.page_count, max_pages_to_check)
  158. if page_limit == 0:
  159. return False
  160. text_pages = 0
  161. total_chars = 0
  162. total_words = 0
  163. for page_index in range(page_limit):
  164. page = document.load_page(page_index)
  165. page_text = page.get_text("text").strip()
  166. word_count = len(page.get_text("words"))
  167. total_chars += len(page_text)
  168. total_words += word_count
  169. if len(page_text) >= 30 or word_count >= 10:
  170. text_pages += 1
  171. return (
  172. text_pages >= max(1, page_limit // 2)
  173. or total_words >= 50
  174. or total_chars >= 200
  175. )
  176. finally:
  177. document.close()
  178. def convert_pdf_with_pymupdf4llm(input_path: Path, include_images: bool) -> str:
  179. import pymupdf4llm
  180. markdown = pymupdf4llm.to_markdown(
  181. str(input_path),
  182. embed_images=include_images,
  183. ignore_images=not include_images,
  184. show_progress=False,
  185. page_separators=False,
  186. force_text=True,
  187. table_strategy="lines_strict",
  188. )
  189. return finalize_pdf_markdown(
  190. markdown,
  191. input_path=input_path,
  192. include_images=include_images,
  193. append_images=False,
  194. )
  195. def convert_pdf_with_pymupdf_blocks(input_path: Path, include_images: bool) -> str:
  196. import fitz
  197. document = fitz.open(input_path)
  198. try:
  199. page_parts: list[str] = []
  200. for page_index, page in enumerate(document):
  201. block_parts: list[str] = []
  202. page_height = float(page.rect.height or 0)
  203. for block in page.get_text("blocks", sort=True):
  204. x0, y0, x1, y1, text, _block_no, block_type = block
  205. if block_type != 0:
  206. continue
  207. if should_skip_pdf_margin_block(float(y0), float(y1), page_height):
  208. continue
  209. cleaned = normalize_pdf_block_text(str(text))
  210. if not cleaned:
  211. continue
  212. block_parts.append(cleaned)
  213. if block_parts:
  214. page_parts.append("\n\n".join(block_parts))
  215. markdown = "\n\n".join(page_parts)
  216. return finalize_pdf_markdown(
  217. markdown,
  218. input_path=input_path,
  219. include_images=include_images,
  220. append_images=True,
  221. )
  222. finally:
  223. document.close()
  224. def convert_pdf_with_pdfplumber(input_path: Path, include_images: bool) -> str:
  225. import pdfplumber
  226. page_parts: list[str] = []
  227. table_settings = {
  228. "vertical_strategy": "lines_strict",
  229. "horizontal_strategy": "lines_strict",
  230. "intersection_tolerance": 5,
  231. "snap_tolerance": 3,
  232. }
  233. with pdfplumber.open(input_path) as pdf:
  234. for page in pdf.pages:
  235. parts: list[str] = []
  236. text = page.extract_text(layout=True) or page.extract_text() or ""
  237. normalized_text = normalize_pdf_plain_text(text)
  238. if normalized_text:
  239. parts.append(normalized_text)
  240. for table in page.extract_tables(table_settings) or []:
  241. rendered_table = render_pdf_table_markdown(table)
  242. if rendered_table:
  243. parts.append(rendered_table)
  244. if parts:
  245. page_parts.append("\n\n".join(parts))
  246. markdown = "\n\n".join(page_parts)
  247. return finalize_pdf_markdown(
  248. markdown,
  249. input_path=input_path,
  250. include_images=include_images,
  251. append_images=True,
  252. )
  253. def convert_pdf_with_pdfminer(input_path: Path, include_images: bool) -> str:
  254. from pdfminer.high_level import extract_text
  255. from pdfminer.layout import LAParams
  256. markdown = extract_text(
  257. str(input_path),
  258. laparams=LAParams(line_margin=0.3, char_margin=2.0, word_margin=0.1),
  259. )
  260. markdown = normalize_pdf_plain_text(markdown)
  261. return finalize_pdf_markdown(
  262. markdown,
  263. input_path=input_path,
  264. include_images=include_images,
  265. append_images=True,
  266. )
  267. def convert_pdf_with_pypdf(input_path: Path, include_images: bool) -> str:
  268. from pypdf import PdfReader
  269. reader = PdfReader(str(input_path))
  270. page_parts: list[str] = []
  271. for page in reader.pages:
  272. try:
  273. text = page.extract_text(extraction_mode="layout") or ""
  274. except TypeError:
  275. text = page.extract_text() or ""
  276. if not text:
  277. text = page.extract_text() or ""
  278. normalized_text = normalize_pdf_plain_text(text)
  279. if normalized_text:
  280. page_parts.append(normalized_text)
  281. markdown = "\n\n".join(page_parts)
  282. return finalize_pdf_markdown(
  283. markdown,
  284. input_path=input_path,
  285. include_images=include_images,
  286. append_images=True,
  287. )
  288. def finalize_pdf_markdown(
  289. markdown: str,
  290. *,
  291. input_path: Path,
  292. include_images: bool,
  293. append_images: bool,
  294. ) -> str:
  295. if not include_images:
  296. markdown = strip_markdown_images(markdown)
  297. markdown = normalize_generated_markdown(markdown)
  298. if include_images and append_images:
  299. markdown = append_pdf_images_markdown(markdown, input_path)
  300. if not markdown.strip():
  301. raise ValueError("empty markdown output")
  302. return markdown
  303. def append_pdf_images_markdown(markdown: str, input_path: Path) -> str:
  304. image_markdown = extract_pdf_images_markdown(input_path)
  305. if not image_markdown:
  306. return markdown
  307. base = markdown.rstrip()
  308. if not base:
  309. return ensure_trailing_newline(image_markdown)
  310. return f"{base}\n\n{image_markdown}\n"
  311. def extract_pdf_images_markdown(input_path: Path) -> str:
  312. import fitz
  313. document = fitz.open(input_path)
  314. try:
  315. image_lines: list[str] = []
  316. for page_index, page in enumerate(document):
  317. seen_xrefs: set[int] = set()
  318. for image_index, image in enumerate(page.get_images(full=True), start=1):
  319. xref = int(image[0])
  320. if xref in seen_xrefs:
  321. continue
  322. seen_xrefs.add(xref)
  323. image_data = document.extract_image(xref)
  324. if not image_data:
  325. continue
  326. ext = image_data.get("ext", "bin")
  327. mime_type = mimetypes.guess_type(f"file.{ext}")[0] or f"image/{ext}"
  328. data_uri = make_data_uri(mime_type, image_data["image"])
  329. image_lines.append(
  330. f"![Page {page_index + 1} Image {image_index}]({data_uri})"
  331. )
  332. return "\n\n".join(image_lines)
  333. finally:
  334. document.close()
  335. def normalize_pdf_block_text(text: str) -> str:
  336. lines = [
  337. collapse_pdf_whitespace(line)
  338. for line in normalize_newlines_only(text).splitlines()
  339. if line.strip()
  340. ]
  341. if not lines:
  342. return ""
  343. merged = lines[0]
  344. for line in lines[1:]:
  345. merged = join_pdf_fragments(merged, line)
  346. return merged.strip()
  347. def normalize_pdf_plain_text(text: str) -> str:
  348. text = normalize_newlines_only(text).replace("\x0c", "\n\n")
  349. lines = [collapse_pdf_whitespace(line) for line in text.splitlines()]
  350. text = "\n".join(line for line in lines if line or line == "")
  351. text = re.sub(r"\n{3,}", "\n\n", text)
  352. return text.strip()
  353. def collapse_pdf_whitespace(text: str) -> str:
  354. collapsed = re.sub(r"[ \t]+", " ", text).strip()
  355. collapsed = re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", collapsed)
  356. collapsed = re.sub(r"\s+([,.;:!?%])", r"\1", collapsed)
  357. collapsed = re.sub(r"\s+([,。;:!?、%])", r"\1", collapsed)
  358. return collapsed
  359. def join_pdf_fragments(left: str, right: str) -> str:
  360. if not left:
  361. return right
  362. if not right:
  363. return left
  364. if (
  365. left.endswith("-")
  366. and re.search(r"[A-Za-z]-$", left)
  367. and re.match(r"^[A-Za-z]", right)
  368. ):
  369. return f"{left[:-1]}{right}"
  370. if right[0] in ",.;:!?%)]},。;:!?、%)》】】":
  371. separator = ""
  372. elif left[-1] in "([{《【":
  373. separator = ""
  374. elif re.search(r"[\u4e00-\u9fff]$", left) and re.match(r"^[\u4e00-\u9fff]", right):
  375. separator = ""
  376. else:
  377. separator = " "
  378. return f"{left}{separator}{right}".strip()
  379. def should_skip_pdf_margin_block(y0: float, y1: float, page_height: float) -> bool:
  380. if page_height <= 0:
  381. return False
  382. top_margin = page_height * 0.03
  383. bottom_margin = page_height * 0.03
  384. return y0 <= top_margin or y1 >= (page_height - bottom_margin)
  385. def render_pdf_table_markdown(table: list[list[object | None]]) -> str:
  386. cleaned_rows: list[list[str]] = []
  387. for row in table:
  388. cleaned_row = [normalize_pdf_table_cell(cell) for cell in row]
  389. if any(cell for cell in cleaned_row):
  390. cleaned_rows.append(cleaned_row)
  391. if not cleaned_rows:
  392. return ""
  393. width = max(len(row) for row in cleaned_rows)
  394. normalized_rows = [row + [""] * (width - len(row)) for row in cleaned_rows]
  395. header = normalized_rows[0]
  396. separator = ["---"] * width
  397. body = normalized_rows[1:]
  398. lines = [render_markdown_table_row(header), render_markdown_table_row(separator)]
  399. lines.extend(render_markdown_table_row(row) for row in body)
  400. return "\n".join(lines)
  401. def normalize_pdf_table_cell(value: object | None) -> str:
  402. if value is None:
  403. return ""
  404. text = normalize_newlines_only(str(value)).strip()
  405. text = re.sub(r"\n+", "<br>", text)
  406. text = collapse_pdf_whitespace(text)
  407. return text.replace("|", r"\|")
  408. def render_markdown_table_row(row: list[str]) -> str:
  409. return f"|{'|'.join(row)}|"
  410. def score_pdf_markdown(markdown: str) -> int:
  411. probe = normalize_newlines_only(strip_markdown_images(markdown)).strip()
  412. if not probe:
  413. return -10000
  414. lines = [line.strip() for line in probe.splitlines() if line.strip()]
  415. cjk_count = len(re.findall(r"[\u4e00-\u9fff]", probe))
  416. latin_count = len(re.findall(r"[A-Za-z]", probe))
  417. digit_count = len(re.findall(r"\d", probe))
  418. weird_count = probe.count("�") + len(
  419. re.findall(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", probe)
  420. )
  421. long_line_count = sum(len(line) >= 12 for line in lines)
  422. short_line_count = sum(0 < len(line) <= 2 for line in lines)
  423. table_count = probe.count("|---")
  424. heading_count = len(re.findall(r"(?m)^#{1,6}\s", probe))
  425. return (
  426. len(probe)
  427. + cjk_count * 2
  428. + latin_count
  429. + digit_count
  430. + long_line_count * 8
  431. + table_count * 40
  432. + heading_count * 20
  433. - short_line_count * 6
  434. - weird_count * 120
  435. )
  436. def is_pdf_markdown_acceptable(markdown: str, score: int) -> bool:
  437. probe = normalize_newlines_only(strip_markdown_images(markdown)).strip()
  438. if not probe:
  439. return False
  440. informative_count = len(re.findall(r"[A-Za-z0-9\u4e00-\u9fff]", probe))
  441. return len(probe) >= 120 and informative_count >= 80 and score >= 320
  442. def convert_ole_word_compatible_file(input_path: Path, include_images: bool) -> str:
  443. with tempfile.TemporaryDirectory(prefix="doc2md-ole-") as temp_dir:
  444. temp_dir_path = Path(temp_dir)
  445. legacy_input = temp_dir_path / f"{input_path.stem}.doc"
  446. normalized_input = temp_dir_path / f"{input_path.stem}.normalized.docx"
  447. shutil.copy2(input_path, legacy_input)
  448. normalize_ole_with_office(legacy_input, normalized_input)
  449. return convert_docx_file(normalized_input, include_images)
  450. def normalize_ole_with_office(input_path: Path, output_path: Path) -> None:
  451. if os.name == "nt":
  452. try:
  453. normalize_with_word(input_path, output_path)
  454. return
  455. except Exception: # noqa: BLE001
  456. pass
  457. normalize_with_libreoffice(input_path, output_path)
  458. def normalize_with_word(input_path: Path, output_path: Path) -> None:
  459. import pythoncom
  460. import win32com.client
  461. pythoncom.CoInitialize()
  462. word = win32com.client.DispatchEx("Word.Application")
  463. word.Visible = False
  464. word.DisplayAlerts = 0
  465. document = None
  466. try:
  467. document = word.Documents.Open(
  468. str(input_path),
  469. ConfirmConversions=False,
  470. ReadOnly=True,
  471. AddToRecentFiles=False,
  472. Visible=False,
  473. )
  474. document.SaveAs2(str(output_path), FileFormat=16)
  475. finally:
  476. if document is not None:
  477. document.Close(False)
  478. word.Quit()
  479. pythoncom.CoUninitialize()
  480. def normalize_with_libreoffice(input_path: Path, output_path: Path) -> None:
  481. soffice = find_libreoffice_command()
  482. if soffice is None:
  483. raise ConversionError(
  484. code="office_backend_missing",
  485. message="No available Office backend found for legacy document conversion",
  486. status_code=500,
  487. details={"filename": input_path.name},
  488. )
  489. output_dir = output_path.parent
  490. output_dir.mkdir(parents=True, exist_ok=True)
  491. with tempfile.TemporaryDirectory(prefix="doc2md-lo-profile-") as profile_dir:
  492. profile_uri = Path(profile_dir).resolve().as_uri()
  493. command = [
  494. soffice,
  495. "--headless",
  496. "--nologo",
  497. "--nolockcheck",
  498. "--nodefault",
  499. "--nofirststartwizard",
  500. f"-env:UserInstallation={profile_uri}",
  501. "--convert-to",
  502. "docx",
  503. "--outdir",
  504. str(output_dir),
  505. str(input_path),
  506. ]
  507. completed = subprocess.run(
  508. command,
  509. capture_output=True,
  510. text=True,
  511. encoding="utf-8",
  512. errors="ignore",
  513. timeout=settings.office_timeout_seconds,
  514. check=False,
  515. )
  516. produced_output = output_dir / f"{input_path.stem}.docx"
  517. if produced_output.exists():
  518. if produced_output != output_path:
  519. shutil.move(str(produced_output), str(output_path))
  520. return
  521. raise ConversionError(
  522. code="office_conversion_failed",
  523. message="LibreOffice failed to convert legacy document",
  524. status_code=500,
  525. details={
  526. "filename": input_path.name,
  527. "stdout": completed.stdout.strip(),
  528. "stderr": completed.stderr.strip(),
  529. },
  530. )
  531. def build_image_converter(include_images: bool):
  532. def convert_image(image):
  533. attributes: dict[str, str] = {}
  534. alt_text = getattr(image, "alt_text", None)
  535. if alt_text:
  536. attributes["alt"] = alt_text
  537. if not include_images:
  538. attributes["src"] = ""
  539. return attributes
  540. with image.open() as image_bytes:
  541. data = image_bytes.read()
  542. attributes["src"] = make_data_uri(image.content_type, data)
  543. return attributes
  544. return mammoth.images.img_element(convert_image)
  545. def make_data_uri(content_type: str | None, data: bytes) -> str:
  546. mime_type = content_type or "application/octet-stream"
  547. encoded = base64.b64encode(data).decode("ascii")
  548. return f"data:{mime_type};base64,{encoded}"
  549. def clean_html(html: str, include_images: bool) -> str:
  550. soup = BeautifulSoup(html, "html.parser")
  551. for anchor in soup.find_all("a"):
  552. href = anchor.get("href", "")
  553. if not anchor.get_text(strip=True) and not anchor.find("img"):
  554. anchor.decompose()
  555. continue
  556. if href.startswith("#"):
  557. anchor.unwrap()
  558. if not include_images:
  559. for image in soup.find_all("img"):
  560. image.decompose()
  561. return str(soup)
  562. def preserve_tables(html: str) -> tuple[str, dict[str, str]]:
  563. soup = BeautifulSoup(html, "html.parser")
  564. placeholders: dict[str, str] = {}
  565. for index, table in enumerate(soup.find_all("table"), start=1):
  566. placeholder = f"TABLEPLACEHOLDER{index:04d}"
  567. placeholders[placeholder] = str(table)
  568. table.replace_with(soup.new_string(placeholder))
  569. return str(soup), placeholders
  570. def html_fragment_to_markdown(html: str) -> str:
  571. return html_to_markdown(
  572. html,
  573. heading_style="ATX",
  574. bullets="-",
  575. strong_em_symbol="*",
  576. )
  577. def restore_tables(markdown: str, placeholders: dict[str, str]) -> str:
  578. for placeholder, table_html in placeholders.items():
  579. markdown = markdown.replace(placeholder, f"\n\n{table_html}\n\n")
  580. return markdown
  581. def normalize_newlines_only(markdown: str) -> str:
  582. return markdown.replace("\r\n", "\n").replace("\r", "\n")
  583. def normalize_generated_markdown(markdown: str) -> str:
  584. markdown = normalize_newlines_only(markdown)
  585. markdown = re.sub(r"\n{3,}", "\n\n", markdown)
  586. return ensure_trailing_newline(markdown.strip())
  587. def ensure_trailing_newline(text: str) -> str:
  588. return text if text.endswith("\n") else f"{text}\n"
  589. def strip_markdown_images(text: str) -> str:
  590. text = MARKDOWN_IMAGE_PATTERN.sub("", text)
  591. text = HTML_IMAGE_PATTERN.sub("", text)
  592. text = re.sub(r"\n{3,}", "\n\n", text)
  593. return text
  594. def inline_local_markdown_images(text: str, base_dir: Path) -> str:
  595. text = MARKDOWN_IMAGE_PATTERN.sub(
  596. lambda match: replace_markdown_image(match, base_dir), text
  597. )
  598. text = HTML_IMAGE_PATTERN.sub(
  599. lambda match: replace_html_image(match, base_dir), text
  600. )
  601. return text
  602. def replace_markdown_image(match: re.Match[str], base_dir: Path) -> str:
  603. target = match.group("target")
  604. clean_target = (
  605. target[1:-1] if target.startswith("<") and target.endswith(">") else target
  606. )
  607. if is_remote_or_data_url(clean_target):
  608. return match.group(0)
  609. local_path = resolve_local_path(base_dir, clean_target)
  610. if local_path is None:
  611. return match.group(0)
  612. data_uri = path_to_data_uri(local_path)
  613. title = match.group("title") or ""
  614. alt = match.group("alt")
  615. return f"![{alt}]({data_uri}{title})"
  616. def replace_html_image(match: re.Match[str], base_dir: Path) -> str:
  617. src = match.group("src")
  618. if is_remote_or_data_url(src):
  619. return match.group(0)
  620. local_path = resolve_local_path(base_dir, src)
  621. if local_path is None:
  622. return match.group(0)
  623. data_uri = path_to_data_uri(local_path)
  624. before = match.group("before")
  625. after = match.group("after")
  626. return f'<img{before}src="{data_uri}"{after}>'
  627. def resolve_local_path(base_dir: Path, target: str) -> Path | None:
  628. candidate = Path(target)
  629. if not candidate.is_absolute():
  630. candidate = (base_dir / candidate).resolve()
  631. if not candidate.exists() or not candidate.is_file():
  632. return None
  633. return candidate
  634. def path_to_data_uri(path: Path) -> str:
  635. mime_type = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
  636. return make_data_uri(mime_type, path.read_bytes())
  637. def is_remote_or_data_url(value: str) -> bool:
  638. parsed = urlparse(value)
  639. return parsed.scheme in {"http", "https", "data"}
  640. def find_libreoffice_command() -> str | None:
  641. candidates = [
  642. shutil.which("soffice"),
  643. shutil.which("libreoffice"),
  644. "/usr/lib/libreoffice/program/soffice",
  645. ]
  646. if os.name == "nt":
  647. candidates.extend(
  648. [
  649. r"C:\Program Files\LibreOffice\program\soffice.exe",
  650. r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
  651. ]
  652. )
  653. for candidate in candidates:
  654. if candidate and Path(candidate).exists():
  655. return str(candidate)
  656. return None