from __future__ import annotations

import base64
import mimetypes
import os
import re
import shutil
import subprocess
import tempfile
import zipfile
from pathlib import Path
from urllib.parse import urlparse

import mammoth
import olefile
from bs4 import BeautifulSoup
from charset_normalizer import from_bytes
from markdownify import markdownify as html_to_markdown

from .config import settings
from .errors import ConversionError

# Lower-case file suffixes (with the leading dot) that are treated as Markdown.
MARKDOWN_SUFFIXES = {".md", ".markdown"}
# Matches Markdown image syntax ``![alt](target "title")``.
# ``target`` is either an angle-bracketed ``<...>`` span or a run of characters
# containing no whitespace and no ``)``; the double-quoted ``title`` is optional
# and is captured together with its leading whitespace.
MARKDOWN_IMAGE_PATTERN = re.compile(
    r'!\[(?P<alt>[^\]]*)\]\((?P<target><[^>]+>|[^)\s]+)(?P<title>\s+"[^"]*")?\)',
    re.IGNORECASE,
)
# Matches an HTML ``<img ...>`` tag whose ``src`` value is wrapped in single or
# double quotes. ``before`` / ``after`` capture the attribute text on either
# side of ``src`` so a tag can be rebuilt with a different source.
HTML_IMAGE_PATTERN = re.compile(
    r"<img\b(?P<before>[^>]*?)src=[\"'](?P<src>[^\"']+)[\"'](?P<after>[^>]*)>",
    re.IGNORECASE,
)


def convert_path_to_markdown(
    input_path_str: str,
    include_images: bool,
    original_name: str | None = None,
) -> str:
    """Convert the document at *input_path_str* to Markdown.

    Args:
        input_path_str: Filesystem path of the document to convert.
        include_images: Forwarded to the format-specific converter.
        original_name: Optional original filename; used for format detection
            and for the filename reported in the error details.

    Returns:
        The Markdown produced by the converter selected for the detected format.

    Raises:
        ConversionError: ``unsupported_format`` (status 400) when the detected
            format has no converter.
    """
    source = Path(input_path_str)
    kind = detect_file_format(source, original_name)

    # One converter per format label returned by detect_file_format.
    converters = {
        "markdown": convert_markdown_file,
        "docx": convert_docx_file,
        "pdf": convert_pdf_file,
        "ole_word_compatible": convert_ole_word_compatible_file,
    }
    converter = converters.get(kind)
    if converter is None:
        raise ConversionError(
            code="unsupported_format",
            message="Unsupported input format",
            status_code=400,
            details={
                "filename": original_name or source.name,
                "detected_format": kind,
            },
        )
    return converter(source, include_images)


def detect_file_format(input_path: Path, original_name: str | None = None) -> str:
    """Classify a file as ``markdown``, ``pdf``, ``docx``, ``ole_word_compatible`` or ``unknown``.

    A Markdown suffix takes precedence over the file's magic bytes; every other
    format is recognised from the first 8 bytes, with an extra structural check
    for ZIP and OLE containers.

    Args:
        input_path: File to inspect; it is always opened, so a missing or
            unreadable file raises ``OSError`` regardless of its suffix.
        original_name: Optional original filename. When given (and non-empty),
            its suffix is used instead of ``input_path``'s suffix.

    Returns:
        One of the format labels listed above.
    """
    suffix = (
        Path(original_name).suffix if original_name else input_path.suffix
    ).lower()
    with input_path.open("rb") as file_obj:
        header = file_obj.read(8)

    # The suffix is checked first, so a ``.md`` file is Markdown even when its
    # content happens to start with another format's signature.
    if suffix in MARKDOWN_SUFFIXES:
        return "markdown"

    if header.startswith(b"%PDF-"):
        return "pdf"

    # A ZIP local-file header only counts as DOCX when the package has the
    # expected members.
    if header.startswith(b"PK\x03\x04") and is_docx_package(input_path):
        return "docx"

    # OLE compound-file signature; accepted only if a WordDocument stream exists.
    if header == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" and is_word_compatible_ole(
        input_path
    ):
        return "ole_word_compatible"

    return "unknown"


def is_docx_package(input_path: Path) -> bool:
    """Return True when *input_path* is a ZIP archive holding the core DOCX members.

    A file that cannot be read as a ZIP archive (``zipfile.BadZipFile``) is
    reported as not being a DOCX package.
    """
    required_members = ("[Content_Types].xml", "word/document.xml")
    try:
        with zipfile.ZipFile(input_path) as package:
            members = package.namelist()
    except zipfile.BadZipFile:
        return False
    return all(member in members for member in required_members)


def is_word_compatible_ole(input_path: Path) -> bool:
    """Return True when the OLE container at *input_path* has a ``WordDocument`` stream.

    An ``OSError`` raised while opening or listing the container is treated as
    "not Word compatible".
    """
    try:
        with olefile.OleFileIO(str(input_path)) as container:
            entries = container.listdir()
    except OSError:
        return False
    # Each entry is a sequence of path components; compare the joined path.
    return any("/".join(parts) == "WordDocument" for parts in entries)


def convert_markdown_file(input_path: Path, include_images: bool) -> str:
    """Read a Markdown file, normalise it and either inline or strip its images.

    Args:
        input_path: Markdown file to read; its parent directory is the base
            for resolving local image references.
        include_images: When true, local images are inlined as data URIs;
            otherwise image markup is removed.

    Returns:
        The decoded text with ``\\n`` line endings and a trailing newline.

    Raises:
        ConversionError: ``invalid_markdown`` (status 400) when no encoding
            could be detected for the file content.
    """
    payload = input_path.read_bytes()
    best_guess = from_bytes(payload).best()
    if best_guess is None:
        raise ConversionError(
            code="invalid_markdown",
            message="Unable to detect Markdown encoding",
            status_code=400,
            details={"filename": input_path.name},
        )

    content = normalize_newlines_only(str(best_guess))
    content = (
        inline_local_markdown_images(content, input_path.parent)
        if include_images
        else strip_markdown_images(content)
    )
    return ensure_trailing_newline(content)


def convert_docx_file(input_path: Path, include_images: bool) -> str:
    """Convert a DOCX file to Markdown via mammoth's HTML output.

    Tables are swapped for placeholders before the HTML-to-Markdown step and
    restored afterwards as raw HTML, so they bypass the Markdown conversion.

    Args:
        input_path: DOCX file to convert.
        include_images: When false, images are dropped from the HTML and any
            remaining image markup is stripped from the Markdown.

    Returns:
        Normalised Markdown ending with a newline.
    """
    with input_path.open("rb") as stream:
        image_handler = build_image_converter(include_images)
        conversion = mammoth.convert_to_html(stream, convert_image=image_handler)

    cleaned = clean_html(conversion.value, include_images)
    table_free_html, table_map = preserve_tables(cleaned)
    rendered = normalize_generated_markdown(
        restore_tables(html_fragment_to_markdown(table_free_html), table_map)
    )
    return rendered if include_images else strip_markdown_images(rendered)


def convert_pdf_file(input_path: Path, include_images: bool) -> str:
    """Convert a text-layer PDF to Markdown, trying several extractors in order.

    The first extractor whose output passes ``is_pdf_markdown_acceptable`` wins.
    If none passes, the highest-scoring non-empty output is returned.

    Args:
        input_path: PDF file to convert.
        include_images: Forwarded to every extraction strategy.

    Returns:
        The selected Markdown text.

    Raises:
        ConversionError: ``pdf_text_layer_missing`` (422) when no text layer is
            detected, or ``pdf_text_extraction_failed`` (422) when every
            strategy raised or produced empty output.
    """
    if not pdf_has_text_layer(input_path):
        raise ConversionError(
            code="pdf_text_layer_missing",
            message="PDF does not contain a selectable text layer",
            status_code=422,
            details={"filename": input_path.name},
        )

    pipeline = (
        ("pymupdf4llm", convert_pdf_with_pymupdf4llm),
        ("pymupdf_blocks", convert_pdf_with_pymupdf_blocks),
        ("pdfplumber", convert_pdf_with_pdfplumber),
        ("pdfminer", convert_pdf_with_pdfminer),
        ("pypdf", convert_pdf_with_pypdf),
    )
    candidates: list[dict[str, object]] = []
    failures: list[str] = []

    for label, extractor in pipeline:
        try:
            candidate = extractor(input_path, include_images)
            if not candidate.strip():
                # Routed through the handler below so it is recorded like any
                # other strategy failure.
                raise ValueError("empty markdown output")

            quality = score_pdf_markdown(candidate)
            candidates.append({"name": label, "score": quality, "markdown": candidate})
            if is_pdf_markdown_acceptable(candidate, quality):
                return candidate
        except Exception as exc:  # noqa: BLE001
            # A failing strategy must not stop the remaining ones.
            failures.append(f"{label}: {exc}")

    if not candidates:
        raise ConversionError(
            code="pdf_text_extraction_failed",
            message="All non-OCR PDF extraction strategies failed",
            status_code=422,
            details={"filename": input_path.name, "errors": failures},
        )

    # Nothing was good enough: fall back to the best-scoring attempt.
    winner = max(candidates, key=lambda item: int(item["score"]))
    return str(winner["markdown"])


def pdf_has_text_layer(input_path: Path, max_pages_to_check: int = 5) -> bool:
    """Heuristically decide whether the first pages of a PDF contain extractable text.

    Args:
        input_path: PDF file to probe.
        max_pages_to_check: Upper bound on the number of leading pages sampled.

    Returns:
        False for a document with no pages to sample. Otherwise True when at
        least half of the sampled pages (minimum one) look textual, or when
        the sample holds at least 50 words or 200 characters in total.
    """
    import fitz

    pdf = fitz.open(input_path)
    try:
        sample_size = min(pdf.page_count, max_pages_to_check)
        if sample_size == 0:
            return False

        pages_with_text = 0
        char_total = 0
        word_total = 0
        for number in range(sample_size):
            current = pdf.load_page(number)
            stripped = current.get_text("text").strip()
            words_on_page = len(current.get_text("words"))
            char_total += len(stripped)
            word_total += words_on_page

            # A page counts as textual with 30+ characters or 10+ words.
            if len(stripped) >= 30 or words_on_page >= 10:
                pages_with_text += 1

        if pages_with_text >= max(1, sample_size // 2):
            return True
        return word_total >= 50 or char_total >= 200
    finally:
        pdf.close()


def convert_pdf_with_pymupdf4llm(input_path: Path, include_images: bool) -> str:
    """Extract Markdown from a PDF with ``pymupdf4llm.to_markdown``.

    Images are embedded by the library itself when *include_images* is true,
    so they are not appended a second time during finalisation.
    """
    import pymupdf4llm

    options = {
        "embed_images": include_images,
        "ignore_images": not include_images,
        "show_progress": False,
        "page_separators": False,
        "force_text": True,
        "table_strategy": "lines_strict",
    }
    extracted = pymupdf4llm.to_markdown(str(input_path), **options)

    return finalize_pdf_markdown(
        extracted,
        input_path=input_path,
        include_images=include_images,
        append_images=False,
    )


def convert_pdf_with_pymupdf_blocks(input_path: Path, include_images: bool) -> str:
    """Extract Markdown from a PDF by joining PyMuPDF text blocks page by page.

    Blocks whose type is not 0 are skipped, as are blocks touching the top or
    bottom page margin. Each remaining block becomes one paragraph.
    """
    import fitz

    pdf = fitz.open(input_path)
    try:
        pages: list[str] = []
        for page in pdf:
            height = float(page.rect.height or 0)
            paragraphs: list[str] = []

            for block in page.get_text("blocks", sort=True):
                # Only the vertical extent, the text and the type are needed.
                _x0, top, _x1, bottom, raw, _block_no, kind = block
                # NOTE(review): type 0 is presumably "text" — confirm against PyMuPDF docs.
                if kind != 0:
                    continue
                if should_skip_pdf_margin_block(float(top), float(bottom), height):
                    continue

                paragraph = normalize_pdf_block_text(str(raw))
                if paragraph:
                    paragraphs.append(paragraph)

            if paragraphs:
                pages.append("\n\n".join(paragraphs))

        return finalize_pdf_markdown(
            "\n\n".join(pages),
            input_path=input_path,
            include_images=include_images,
            append_images=True,
        )
    finally:
        pdf.close()


def convert_pdf_with_pdfplumber(input_path: Path, include_images: bool) -> str:
    """Extract Markdown from a PDF with pdfplumber.

    For each page the normalised text comes first, followed by every table
    found with the strict line-based settings, rendered as a Markdown table.
    """
    import pdfplumber

    strict_line_settings = {
        "vertical_strategy": "lines_strict",
        "horizontal_strategy": "lines_strict",
        "intersection_tolerance": 5,
        "snap_tolerance": 3,
    }
    pages: list[str] = []

    with pdfplumber.open(input_path) as pdf:
        for page in pdf.pages:
            sections: list[str] = []

            # Prefer layout-aware text; fall back to plain extraction.
            raw_text = page.extract_text(layout=True) or page.extract_text() or ""
            page_text = normalize_pdf_plain_text(raw_text)
            if page_text:
                sections.append(page_text)

            for raw_table in page.extract_tables(strict_line_settings) or []:
                table_markdown = render_pdf_table_markdown(raw_table)
                if table_markdown:
                    sections.append(table_markdown)

            if sections:
                pages.append("\n\n".join(sections))

    return finalize_pdf_markdown(
        "\n\n".join(pages),
        input_path=input_path,
        include_images=include_images,
        append_images=True,
    )


def convert_pdf_with_pdfminer(input_path: Path, include_images: bool) -> str:
    """Extract plain text from a PDF with pdfminer and finalise it as Markdown."""
    from pdfminer.high_level import extract_text
    from pdfminer.layout import LAParams

    layout = LAParams(line_margin=0.3, char_margin=2.0, word_margin=0.1)
    raw_text = extract_text(str(input_path), laparams=layout)
    return finalize_pdf_markdown(
        normalize_pdf_plain_text(raw_text),
        input_path=input_path,
        include_images=include_images,
        append_images=True,
    )


def convert_pdf_with_pypdf(input_path: Path, include_images: bool) -> str:
    """Extract text from a PDF page by page with pypdf and finalise it as Markdown.

    Layout-mode extraction is tried first. Plain extraction is used when the
    ``extraction_mode`` keyword is rejected with ``TypeError`` or when the
    first attempt yields nothing.
    """
    from pypdf import PdfReader

    pages: list[str] = []
    for page in PdfReader(str(input_path)).pages:
        try:
            extracted = page.extract_text(extraction_mode="layout")
        except TypeError:
            extracted = page.extract_text()
        # Retry with plain extraction when nothing came back.
        extracted = extracted or page.extract_text() or ""

        page_text = normalize_pdf_plain_text(extracted)
        if page_text:
            pages.append(page_text)

    return finalize_pdf_markdown(
        "\n\n".join(pages),
        input_path=input_path,
        include_images=include_images,
        append_images=True,
    )


def finalize_pdf_markdown(
    markdown: str,
    *,
    input_path: Path,
    include_images: bool,
    append_images: bool,
) -> str:
    """Apply the shared post-processing to Markdown extracted from a PDF.

    Args:
        markdown: Raw extractor output.
        input_path: Source PDF, used when images are appended.
        include_images: When false, image markup is stripped first.
        append_images: When true (and images are included), the PDF's images
            are appended after the text.

    Returns:
        Normalised Markdown.

    Raises:
        ValueError: When the result contains only whitespace.
    """
    result = markdown if include_images else strip_markdown_images(markdown)
    result = normalize_generated_markdown(result)

    if include_images and append_images:
        result = append_pdf_images_markdown(result, input_path)

    if result.strip():
        return result
    raise ValueError("empty markdown output")


def append_pdf_images_markdown(markdown: str, input_path: Path) -> str:
    """Append the PDF's extracted images (as Markdown) after *markdown*.

    Returns *markdown* unchanged when the PDF yields no image markup.
    """
    gallery = extract_pdf_images_markdown(input_path)
    if not gallery:
        return markdown

    body = markdown.rstrip()
    if body:
        return f"{body}\n\n{gallery}\n"
    # No text at all: the images become the whole document.
    return ensure_trailing_newline(gallery)


def extract_pdf_images_markdown(input_path: Path) -> str:
    """Return Markdown image lines with every PDF image embedded as a data URI.

    Images are de-duplicated by xref within a page only: the seen-set is reset
    for each page. Lines are separated by blank lines; the result is an empty
    string when nothing was extracted.
    """
    import fitz

    pdf = fitz.open(input_path)
    try:
        entries: list[str] = []
        for page_number, page in enumerate(pdf, start=1):
            xrefs_on_page: set[int] = set()
            for image_index, info in enumerate(page.get_images(full=True), start=1):
                xref = int(info[0])
                if xref in xrefs_on_page:
                    continue
                xrefs_on_page.add(xref)

                extracted = pdf.extract_image(xref)
                if not extracted:
                    continue

                extension = extracted.get("ext", "bin")
                # Fall back to ``image/<ext>`` when the extension is unknown to mimetypes.
                mime_type = (
                    mimetypes.guess_type(f"file.{extension}")[0] or f"image/{extension}"
                )
                data_uri = make_data_uri(mime_type, extracted["image"])
                entries.append(f"![Page {page_number} Image {image_index}]({data_uri})")
        return "\n\n".join(entries)
    finally:
        pdf.close()


def normalize_pdf_block_text(text: str) -> str:
    """Merge the non-blank lines of one PDF text block into a single paragraph.

    Every line is whitespace-collapsed and the lines are then joined left to
    right with ``join_pdf_fragments``. Returns ``""`` for a block without
    visible text.
    """
    fragments: list[str] = []
    for raw_line in normalize_newlines_only(text).splitlines():
        if raw_line.strip():
            fragments.append(collapse_pdf_whitespace(raw_line))

    if not fragments:
        return ""

    paragraph, *remaining = fragments
    for fragment in remaining:
        paragraph = join_pdf_fragments(paragraph, fragment)
    return paragraph.strip()


def normalize_pdf_plain_text(text: str) -> str:
    """Normalise plain text extracted from a PDF.

    Line endings become ``\\n``, form feeds become paragraph breaks, every line
    is whitespace-collapsed, and runs of three or more newlines shrink to one
    blank line. Blank lines are kept on purpose so paragraph breaks survive.

    Args:
        text: Raw extractor output.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    text = normalize_newlines_only(text).replace("\x0c", "\n\n")
    # Every line is kept: the previous ``if line or line == ""`` filter was
    # true for every string and never removed anything.
    text = "\n".join(collapse_pdf_whitespace(line) for line in text.splitlines())
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def collapse_pdf_whitespace(text: str) -> str:
    """Collapse spacing artefacts in a piece of PDF text.

    Runs of spaces and tabs become one space and the ends are trimmed. Then
    whitespace between two CJK ideographs (U+4E00-U+9FFF) is removed, as is
    whitespace in front of the listed ASCII and full-width punctuation marks.
    """
    rewrites = (
        (r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", ""),
        (r"\s+([,.;:!?%])", r"\1"),
        (r"\s+([，。；：！？、％])", r"\1"),
    )
    result = re.sub(r"[ \t]+", " ", text).strip()
    for pattern, replacement in rewrites:
        result = re.sub(pattern, replacement, result)
    return result


def join_pdf_fragments(left: str, right: str) -> str:
    """Join two consecutive text fragments from the same PDF block.

    Rules, in order:
      * an empty side yields the other side unchanged;
      * a trailing ``letter-`` followed by a fragment starting with an ASCII
        letter is treated as a hyphenated line break and the hyphen is dropped;
      * no separator before closing punctuation, after an opening bracket, or
        between two CJK ideographs;
      * otherwise a single space is inserted.
    """
    if not left or not right:
        return left or right

    hyphenated_break = (
        left.endswith("-")
        and re.search(r"[A-Za-z]-$", left)
        and re.match(r"^[A-Za-z]", right)
    )
    if hyphenated_break:
        return f"{left[:-1]}{right}"

    joins_tightly = (
        right[0] in ",.;:!?%)]}，。；：！？、％）》】】"
        or left[-1] in "([{《【"
        or (
            re.search(r"[\u4e00-\u9fff]$", left)
            and re.match(r"^[\u4e00-\u9fff]", right)
        )
    )
    separator = "" if joins_tightly else " "
    return f"{left}{separator}{right}".strip()


def should_skip_pdf_margin_block(y0: float, y1: float, page_height: float) -> bool:
    """Return True when a block touches the top or bottom 3% of the page.

    Args:
        y0: Top edge of the block.
        y1: Bottom edge of the block.
        page_height: Page height; a non-positive value disables the check.
    """
    if page_height <= 0:
        return False

    margin_ratio = 0.03
    upper_limit = page_height * margin_ratio
    lower_limit = page_height - page_height * margin_ratio
    starts_in_top_margin = y0 <= upper_limit
    ends_in_bottom_margin = y1 >= lower_limit
    return starts_in_top_margin or ends_in_bottom_margin


def render_pdf_table_markdown(table: list[list[object | None]]) -> str:
    """Render an extracted table as a Markdown pipe table.

    Rows whose cells are all empty after normalisation are dropped, shorter
    rows are padded with empty cells, and the first remaining row becomes the
    header. Returns ``""`` when no row has content.
    """
    rows: list[list[str]] = []
    for raw_row in table:
        cells = [normalize_pdf_table_cell(cell) for cell in raw_row]
        if any(cells):
            rows.append(cells)

    if not rows:
        return ""

    column_count = max(map(len, rows))
    padded = [cells + [""] * (column_count - len(cells)) for cells in rows]
    header_row, *body_rows = padded
    divider = ["---"] * column_count

    return "\n".join(
        render_markdown_table_row(cells)
        for cells in [header_row, divider, *body_rows]
    )


def normalize_pdf_table_cell(value: object | None) -> str:
    """Turn a raw table cell into single-line, pipe-safe Markdown text.

    ``None`` becomes ``""``. Line breaks inside the cell become ``<br>``,
    whitespace is collapsed, and ``|`` is escaped so it cannot end the cell.
    """
    if value is None:
        return ""

    cell = normalize_newlines_only(str(value)).strip()
    cell = collapse_pdf_whitespace(re.sub(r"\n+", "<br>", cell))
    return cell.replace("|", r"\|")


def render_markdown_table_row(row: list[str]) -> str:
    """Render *row* as one pipe-delimited Markdown table line."""
    return "|" + "|".join(row) + "|"


def score_pdf_markdown(markdown: str) -> int:
    """Score extracted PDF Markdown; a higher score means a better extraction.

    Image markup is ignored. Text length, CJK/Latin/digit characters, long
    lines, table separators and headings add to the score; very short lines
    and replacement or control characters subtract from it. Text that is
    empty after stripping scores ``-10000``.
    """
    probe = normalize_newlines_only(strip_markdown_images(markdown)).strip()
    if not probe:
        return -10000

    def occurrences(pattern: str) -> int:
        return len(re.findall(pattern, probe))

    content_lines = [
        stripped
        for stripped in (line.strip() for line in probe.splitlines())
        if stripped
    ]
    long_lines = sum(len(line) >= 12 for line in content_lines)
    short_lines = sum(0 < len(line) <= 2 for line in content_lines)
    # U+FFFD and stray control characters indicate a broken text layer.
    weird_chars = probe.count("�") + occurrences(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

    reward = (
        len(probe)
        + occurrences(r"[\u4e00-\u9fff]") * 2
        + occurrences(r"[A-Za-z]")
        + occurrences(r"\d")
        + long_lines * 8
        + probe.count("|---") * 40
        + occurrences(r"(?m)^#{1,6}\s") * 20
    )
    penalty = short_lines * 6 + weird_chars * 120
    return reward - penalty


def is_pdf_markdown_acceptable(markdown: str, score: int) -> bool:
    """Return True when extracted Markdown is good enough to stop trying strategies.

    With image markup removed, the text must have at least 120 characters, at
    least 80 ASCII-alphanumeric or CJK characters, and a *score* of at least 320.
    """
    probe = normalize_newlines_only(strip_markdown_images(markdown)).strip()
    if not probe:
        return False
    if len(probe) < 120:
        return False

    informative = len(re.findall(r"[A-Za-z0-9\u4e00-\u9fff]", probe))
    return informative >= 80 and score >= 320


def convert_ole_word_compatible_file(input_path: Path, include_images: bool) -> str:
    """Convert a legacy OLE Word document to Markdown.

    The file is copied into a temporary directory with a ``.doc`` suffix,
    converted to DOCX by an Office backend, and then handled by the DOCX
    converter. The temporary directory is removed afterwards.
    """
    with tempfile.TemporaryDirectory(prefix="doc2md-ole-") as workdir:
        workspace = Path(workdir)
        stem = input_path.stem
        doc_copy = workspace / f"{stem}.doc"
        docx_target = workspace / f"{stem}.normalized.docx"

        shutil.copy2(input_path, doc_copy)
        normalize_ole_with_office(doc_copy, docx_target)
        return convert_docx_file(docx_target, include_images)


def normalize_ole_with_office(input_path: Path, output_path: Path) -> None:
    """Convert a legacy Word document to DOCX with the best available backend.

    On Windows, Word automation is tried first. Any failure there is logged
    and LibreOffice is used instead. On other systems LibreOffice is used
    directly.

    Args:
        input_path: Legacy document to convert.
        output_path: Destination path of the DOCX file.

    Raises:
        ConversionError: Propagated from the LibreOffice backend.
    """
    import logging

    if os.name == "nt":
        try:
            normalize_with_word(input_path, output_path)
        except Exception as exc:  # noqa: BLE001
            # Best effort: Word may be missing or broken, so fall through to
            # LibreOffice, but leave a trace instead of discarding the error.
            logging.getLogger(__name__).warning(
                "Word automation failed for %s; falling back to LibreOffice: %s",
                input_path.name,
                exc,
            )
        else:
            return

    normalize_with_libreoffice(input_path, output_path)


def normalize_with_word(input_path: Path, output_path: Path) -> None:
    """Convert a document to DOCX by automating Microsoft Word over COM.

    Cleanup is nested so that each acquired resource is released even when a
    later step or an earlier cleanup step fails: the document is closed, Word
    is quit, and COM is uninitialised.

    Args:
        input_path: Document for Word to open (read-only).
        output_path: Destination path passed to ``SaveAs2``.
    """
    import pythoncom
    import win32com.client

    pythoncom.CoInitialize()
    try:
        word = win32com.client.DispatchEx("Word.Application")
        try:
            word.Visible = False
            word.DisplayAlerts = 0
            document = word.Documents.Open(
                str(input_path),
                ConfirmConversions=False,
                ReadOnly=True,
                AddToRecentFiles=False,
                Visible=False,
            )
            try:
                # NOTE(review): 16 should be WdSaveFormat.wdFormatDocumentDefault
                # (.docx) — confirm against the Word object model reference.
                document.SaveAs2(str(output_path), FileFormat=16)
            finally:
                # False: close without saving changes to the source document.
                document.Close(False)
        finally:
            word.Quit()
    finally:
        pythoncom.CoUninitialize()


def normalize_with_libreoffice(input_path: Path, output_path: Path) -> None:
    """Convert a legacy document to DOCX with a headless LibreOffice run.

    LibreOffice writes ``<input stem>.docx`` into the output directory; that
    file is moved to *output_path* when the two paths differ. A throw-away
    user profile directory is used for the run.

    Args:
        input_path: Document to convert.
        output_path: Destination path of the DOCX file.

    Raises:
        ConversionError: ``office_backend_missing`` (500) when no LibreOffice
            executable is found, or ``office_conversion_failed`` (500) when
            the expected output file was not produced.
    """
    soffice = find_libreoffice_command()
    if soffice is None:
        raise ConversionError(
            code="office_backend_missing",
            message="No available Office backend found for legacy document conversion",
            status_code=500,
            details={"filename": input_path.name},
        )

    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(prefix="doc2md-lo-profile-") as profile_dir:
        profile_uri = Path(profile_dir).resolve().as_uri()
        result = subprocess.run(
            [
                soffice,
                "--headless",
                "--nologo",
                "--nolockcheck",
                "--nodefault",
                "--nofirststartwizard",
                f"-env:UserInstallation={profile_uri}",
                "--convert-to",
                "docx",
                "--outdir",
                str(target_dir),
                str(input_path),
            ],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="ignore",
            timeout=settings.office_timeout_seconds,
            check=False,
        )

    # Success is judged by the output file, not by the exit status.
    converted = target_dir / f"{input_path.stem}.docx"
    if not converted.exists():
        raise ConversionError(
            code="office_conversion_failed",
            message="LibreOffice failed to convert legacy document",
            status_code=500,
            details={
                "filename": input_path.name,
                "stdout": result.stdout.strip(),
                "stderr": result.stderr.strip(),
            },
        )

    if converted != output_path:
        shutil.move(str(converted), str(output_path))


def build_image_converter(include_images: bool):
    """Build the mammoth image handler used during DOCX conversion.

    The handler keeps the image's alt text when present. With
    *include_images* it embeds the image bytes as a data URI in ``src``;
    otherwise ``src`` is left empty.
    """

    def convert_image(image):
        alt_text = getattr(image, "alt_text", None)
        attributes: dict[str, str] = {"alt": alt_text} if alt_text else {}

        if include_images:
            with image.open() as image_bytes:
                payload = image_bytes.read()
            attributes["src"] = make_data_uri(image.content_type, payload)
        else:
            attributes["src"] = ""
        return attributes

    return mammoth.images.img_element(convert_image)


def make_data_uri(content_type: str | None, data: bytes) -> str:
    mime_type = content_type or "application/octet-stream"
    encoded = base64.b64encode(data).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"


def clean_html(html: str, include_images: bool) -> str:
    """Tidy converter HTML before it is turned into Markdown.

    Anchors with neither text nor an image are removed, anchors pointing at an
    in-document fragment (``#...``) are unwrapped so only their content stays,
    and all ``<img>`` elements are removed when *include_images* is false.
    """
    soup = BeautifulSoup(html, "html.parser")

    for link in soup.find_all("a"):
        target = link.get("href", "")
        is_empty = not link.get_text(strip=True) and not link.find("img")
        if is_empty:
            link.decompose()
        elif target.startswith("#"):
            link.unwrap()

    if not include_images:
        for picture in soup.find_all("img"):
            picture.decompose()

    return str(soup)


def preserve_tables(html: str) -> tuple[str, dict[str, str]]:
    """Replace every ``<table>`` in *html* with a unique text placeholder.

    Returns:
        The modified HTML and a mapping from placeholder
        (``TABLEPLACEHOLDER0001``, ...) to the original table markup.
    """
    soup = BeautifulSoup(html, "html.parser")
    tables = soup.find_all("table")
    saved: dict[str, str] = {}

    for number, table_tag in enumerate(tables, start=1):
        token = f"TABLEPLACEHOLDER{number:04d}"
        # Capture the markup before the element is swapped out.
        saved[token] = str(table_tag)
        table_tag.replace_with(soup.new_string(token))

    return str(soup), saved


def html_fragment_to_markdown(html: str) -> str:
    """Convert an HTML fragment to Markdown with the module's fixed markdownify options."""
    options = {
        "heading_style": "ATX",
        "bullets": "-",
        "strong_em_symbol": "*",
    }
    return html_to_markdown(html, **options)


def restore_tables(markdown: str, placeholders: dict[str, str]) -> str:
    """Put the original table HTML back in place of each placeholder.

    Every restored table is surrounded by blank lines so it stands as its own
    block.
    """
    restored = markdown
    for token, table_html in placeholders.items():
        block = f"\n\n{table_html}\n\n"
        restored = restored.replace(token, block)
    return restored


def normalize_newlines_only(markdown: str) -> str:
    """Convert Windows (``\\r\\n``) and old Mac (``\\r``) line endings to ``\\n``."""
    without_crlf = markdown.replace("\r\n", "\n")
    # Any carriage return left at this point is a bare one.
    return without_crlf.replace("\r", "\n")


def normalize_generated_markdown(markdown: str) -> str:
    """Normalise generated Markdown.

    Line endings become ``\\n``, runs of three or more newlines shrink to one
    blank line, surrounding whitespace is trimmed, and the result ends with
    exactly one newline.
    """
    unified = normalize_newlines_only(markdown)
    compact = re.sub(r"\n{3,}", "\n\n", unified).strip()
    return ensure_trailing_newline(compact)


def ensure_trailing_newline(text: str) -> str:
    """Return *text* unchanged if it ends with a newline, otherwise with one appended."""
    if text.endswith("\n"):
        return text
    return text + "\n"


def strip_markdown_images(text: str) -> str:
    """Remove Markdown and HTML image markup from *text*.

    Runs of three or more newlines left behind are reduced to one blank line.
    """
    for pattern in (MARKDOWN_IMAGE_PATTERN, HTML_IMAGE_PATTERN):
        text = pattern.sub("", text)
    return re.sub(r"\n{3,}", "\n\n", text)


def inline_local_markdown_images(text: str, base_dir: Path) -> str:
    """Inline local images referenced from Markdown or HTML image markup.

    Markdown-style references are rewritten first, then ``<img>`` tags.
    Relative targets are resolved against *base_dir*.
    """

    def rewrite_markdown(found: re.Match[str]) -> str:
        return replace_markdown_image(found, base_dir)

    def rewrite_html(found: re.Match[str]) -> str:
        return replace_html_image(found, base_dir)

    text = MARKDOWN_IMAGE_PATTERN.sub(rewrite_markdown, text)
    return HTML_IMAGE_PATTERN.sub(rewrite_html, text)


def replace_markdown_image(match: re.Match[str], base_dir: Path) -> str:
    """Return the replacement for one Markdown image match.

    Remote and ``data:`` targets, and targets that do not resolve to a local
    file, are returned unchanged. Otherwise the target is replaced by a data
    URI, keeping the alt text and the optional title.
    """
    original_markup = match.group(0)

    raw_target = match.group("target")
    is_bracketed = raw_target.startswith("<") and raw_target.endswith(">")
    path_text = raw_target[1:-1] if is_bracketed else raw_target

    if is_remote_or_data_url(path_text):
        return original_markup

    resolved = resolve_local_path(base_dir, path_text)
    if resolved is None:
        return original_markup

    data_uri = path_to_data_uri(resolved)
    # The title group, when present, already carries its leading whitespace.
    title = match.group("title") or ""
    alt = match.group("alt")
    return f"![{alt}]({data_uri}{title})"


def replace_html_image(match: re.Match[str], base_dir: Path) -> str:
    """Return the replacement for one HTML ``<img>`` match.

    Remote and ``data:`` sources, and sources that do not resolve to a local
    file, are returned unchanged. Otherwise the tag is rebuilt with a data URI
    as ``src`` and its other attribute text preserved.
    """
    original_tag = match.group(0)
    source = match.group("src")
    if is_remote_or_data_url(source):
        return original_tag

    resolved = resolve_local_path(base_dir, source)
    if resolved is None:
        return original_tag

    data_uri = path_to_data_uri(resolved)
    leading_attrs = match.group("before")
    trailing_attrs = match.group("after")
    return f'<img{leading_attrs}src="{data_uri}"{trailing_attrs}>'


def resolve_local_path(base_dir: Path, target: str) -> Path | None:
    candidate = Path(target)
    if not candidate.is_absolute():
        candidate = (base_dir / candidate).resolve()
    if not candidate.exists() or not candidate.is_file():
        return None
    return candidate


def path_to_data_uri(path: Path) -> str:
    """Read *path* and return its content as a base64 data URI.

    The MIME type is guessed from the file name, with
    ``application/octet-stream`` as the fallback.
    """
    guessed_type, _encoding = mimetypes.guess_type(path.name)
    return make_data_uri(guessed_type or "application/octet-stream", path.read_bytes())


def is_remote_or_data_url(value: str) -> bool:
    """Return True when *value* uses an ``http``, ``https`` or ``data`` scheme.

    The scheme comparison is case-insensitive. Values that ``urlparse``
    rejects as malformed are classified by the text before their first ``:``
    rather than raising, so one bad image reference cannot abort a conversion.

    Args:
        value: Image target taken from Markdown or HTML markup.
    """
    schemes = {"http", "https", "data"}
    try:
        scheme = urlparse(value).scheme
    except ValueError:
        # e.g. an unbalanced "[" in the host part ("Invalid IPv6 URL").
        scheme = value.partition(":")[0].lower()
    return scheme in schemes


def find_libreoffice_command() -> str | None:
    candidates = [
        shutil.which("soffice"),
        shutil.which("libreoffice"),
        "/usr/lib/libreoffice/program/soffice",
    ]

    if os.name == "nt":
        candidates.extend(
            [
                r"C:\Program Files\LibreOffice\program\soffice.exe",
                r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
            ]
        )

    for candidate in candidates:
        if candidate and Path(candidate).exists():
            return str(candidate)
    return None