feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload (#1727)

* feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload

- Introduce pymupdf4llm as an optional PDF converter with better heading
  detection and table preservation than MarkItDown
- Auto mode: prefer pymupdf4llm when installed; fall back to MarkItDown
  when output is suspiciously sparse (image-based / scanned PDFs)
- Sparsity check uses chars-per-page (< 50 chars/page) rather than an
  absolute threshold, correctly handling both short and long documents
- Large files (> 1 MB) are offloaded to asyncio.to_thread() to avoid
  blocking the event loop (related: #1569)
- Add UploadsConfig with pdf_converter field (auto/pymupdf4llm/markitdown)
- Add pymupdf4llm as optional dependency: pip install deerflow-harness[pymupdf]
- Add 14 unit tests covering sparsity heuristic, routing logic, and async path

* fix(uploads): address Copilot review comments on PDF converter

- Fix docstring: MIN_CHARS_PYMUPDF -> _MIN_CHARS_PER_PAGE (typo)
- Fix file handle leak: wrap pymupdf.open in try/finally to ensure doc.close()
- Fix silent fallback gap: _convert_pdf_with_pymupdf4llm now catches all
  conversion exceptions (not just ImportError), so encrypted/corrupt PDFs
  fall back to MarkItDown instead of propagating
- Tighten type: pdf_converter field changed from str to Literal[auto|pymupdf4llm|markitdown]
- Normalize config value: _get_pdf_converter() strips and lowercases the raw
  config string, warns and falls back to 'auto' on unknown values
This commit is contained in:
SHIYAO ZHANG
2026-04-03 21:59:45 +08:00
committed by GitHub
parent 5ff230eafd
commit ddfc988bef
5 changed files with 461 additions and 14 deletions
@@ -1,7 +1,7 @@
import logging
import os
from pathlib import Path
from typing import Any, Self
from typing import Any, Literal, Self
import yaml
from dotenv import load_dotenv
@@ -28,11 +28,26 @@ load_dotenv()
logger = logging.getLogger(__name__)
class UploadsConfig(BaseModel):
    """Configuration for file upload handling."""

    # Selects the backend that turns uploaded PDFs into Markdown.
    # Only the three literal values below are accepted; "auto" is the default.
    pdf_converter: Literal["auto", "pymupdf4llm", "markitdown"] = Field(
        description=(
            "PDF-to-Markdown converter. "
            "'auto': prefer pymupdf4llm when installed, fall back to MarkItDown for image-based PDFs; "
            "'pymupdf4llm': always use pymupdf4llm (must be installed); "
            "'markitdown': always use MarkItDown (original behaviour)."
        ),
        default="auto",
    )
class AppConfig(BaseModel):
"""Config for the DeerFlow application"""
log_level: str = Field(default="info", description="Logging level for deerflow modules (debug/info/warning/error)")
token_usage: TokenUsageConfig = Field(default_factory=TokenUsageConfig, description="Token usage tracking configuration")
uploads: UploadsConfig = Field(default_factory=UploadsConfig, description="File upload handling configuration")
models: list[ModelConfig] = Field(default_factory=list, description="Available models")
sandbox: SandboxConfig = Field(description="Sandbox configuration")
tools: list[ToolConfig] = Field(default_factory=list, description="Available tools")
@@ -1,9 +1,20 @@
"""File conversion utilities.
Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown.
Converts document files (PDF, PPT, Excel, Word) to Markdown.
PDF conversion strategy (auto mode):
1. Try pymupdf4llm if installed — better heading detection, faster on most files.
2. If output is suspiciously short (< _MIN_CHARS_PER_PAGE chars/page, or < 200 chars
total when page count is unavailable), treat as image-based and fall back to MarkItDown.
3. If pymupdf4llm is not installed, use MarkItDown directly (existing behaviour).
Large files (> _ASYNC_THRESHOLD_BYTES) are converted in a thread pool via
asyncio.to_thread() to avoid blocking the event loop (fixes #1569).
No FastAPI or HTTP dependencies — pure utility functions.
"""
import asyncio
import logging
import re
from pathlib import Path
@@ -21,30 +32,136 @@ CONVERTIBLE_EXTENSIONS = {
".docx",
}
# Files larger than this threshold are converted in a background thread.
# Small files complete in < 1s synchronously; spawning a thread adds unnecessary
# scheduling overhead for them.
# The size check in convert_file_to_markdown() is strict (>), so a file of
# exactly this size is still converted inline.
_ASYNC_THRESHOLD_BYTES = 1 * 1024 * 1024 # 1 MB
# If pymupdf4llm produces fewer characters *per page* than this threshold,
# the PDF is likely image-based or encrypted — fall back to MarkItDown.
# Rationale: normal text PDFs yield 200-2000 chars/page; image-based PDFs
# yield close to 0. 50 chars/page gives a wide safety margin.
# Falls back to absolute 200-char check when page count is unavailable.
# Consumed by _pymupdf_output_too_sparse(), which compares with a strict "<".
_MIN_CHARS_PER_PAGE = 50
def _pymupdf_output_too_sparse(text: str, file_path: Path) -> bool:
    """Return True if pymupdf4llm output is suspiciously short (image-based PDF).

    Uses chars-per-page rather than an absolute threshold so that both short
    documents (few pages, few chars) and long documents (many pages, many chars)
    are handled correctly.

    Args:
        text: Markdown produced by pymupdf4llm; surrounding whitespace is
            ignored when counting characters.
        file_path: The PDF the text came from. It is opened only to read the
            page count.

    Returns:
        True when the text has fewer than _MIN_CHARS_PER_PAGE characters per
        page. When the page count cannot be determined (pymupdf missing, file
        unreadable, or zero pages) the result is True when the text has fewer
        than 200 characters in total.
    """
    chars = len(text.strip())
    pages: int | None = None
    try:
        import pymupdf

        # Document is a context manager, so the file handle is released even
        # if len() raises — no hand-written try/finally needed.
        with pymupdf.open(str(file_path)) as doc:
            pages = len(doc)
    except Exception:
        # Best-effort lookup: never let a page-count failure break conversion,
        # but leave a trace instead of swallowing the error silently.
        logging.getLogger(__name__).debug(
            "Page count lookup raised for %s",
            file_path.name,
            exc_info=True,
        )

    if pages is not None and pages > 0:
        return (chars / pages) < _MIN_CHARS_PER_PAGE

    # Fallback: absolute threshold when page count is unavailable
    return chars < 200
def _convert_pdf_with_pymupdf4llm(file_path: Path) -> str | None:
"""Attempt PDF conversion with pymupdf4llm.
Returns the markdown text, or None if pymupdf4llm is not installed or
if conversion fails (e.g. encrypted/corrupt PDF).
"""
try:
import pymupdf4llm
except ImportError:
return None
try:
return pymupdf4llm.to_markdown(str(file_path))
except Exception:
logger.exception("pymupdf4llm failed to convert %s; falling back to MarkItDown", file_path.name)
return None
def _convert_with_markitdown(file_path: Path) -> str:
    """Render a supported document as Markdown text via MarkItDown.

    Args:
        file_path: Location of the document to convert.

    Returns:
        The ``text_content`` of MarkItDown's conversion result.
    """
    # Imported lazily so the dependency is only loaded when a conversion runs.
    import markitdown

    converter = markitdown.MarkItDown()
    conversion = converter.convert(str(file_path))
    return conversion.text_content
def _do_convert(file_path: Path, pdf_converter: str) -> str:
    """Run the blocking conversion; usable directly or via asyncio.to_thread.

    Args:
        file_path: Path to the file.
        pdf_converter: "auto" | "pymupdf4llm" | "markitdown"

    Returns:
        The Markdown text produced by whichever converter was selected.
    """
    # pymupdf4llm is attempted only for PDFs, and only when not ruled out.
    try_pymupdf = file_path.suffix.lower() == ".pdf" and pdf_converter != "markitdown"
    candidate = _convert_pdf_with_pymupdf4llm(file_path) if try_pymupdf else None

    if candidate is not None:
        # Explicit mode keeps the output regardless of length; auto mode keeps
        # it only when the chars-per-page density looks like a real text PDF.
        forced = pdf_converter == "pymupdf4llm"
        if forced or not _pymupdf_output_too_sparse(candidate, file_path):
            return candidate

        logger.warning(
            "pymupdf4llm produced only %d chars for %s (likely image-based PDF); falling back to MarkItDown",
            len(candidate.strip()),
            file_path.name,
        )

    # Non-PDF input, pymupdf4llm unavailable/failed, or sparse output.
    return _convert_with_markitdown(file_path)
async def convert_file_to_markdown(file_path: Path) -> Path | None:
    """Convert a supported document file to Markdown.

    PDF files are handled with a two-converter strategy (see module docstring).
    Large files (> 1 MB) are offloaded to a thread pool to avoid blocking the
    event loop.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path to the generated .md file, or None if conversion failed.
        Failures are logged and never raised to the caller.
    """
    try:
        pdf_converter = _get_pdf_converter()
        file_size = file_path.stat().st_size

        # Single conversion path: the legacy direct MarkItDown call is gone,
        # so the file is converted and written exactly once.
        if file_size > _ASYNC_THRESHOLD_BYTES:
            text = await asyncio.to_thread(_do_convert, file_path, pdf_converter)
        else:
            text = _do_convert(file_path, pdf_converter)

        # Save as .md file with same name
        md_path = file_path.with_suffix(".md")
        md_path.write_text(text, encoding="utf-8")

        logger.info("Converted %s to markdown: %s (%d chars)", file_path.name, md_path.name, len(text))
        return md_path
    except Exception as e:
        logger.error("Failed to convert %s to markdown: %s", file_path.name, e)
        return None
@@ -69,6 +186,8 @@ _BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPEND
# Keeps prompt size bounded even for very long documents.
MAX_OUTLINE_ENTRIES = 50
# Accepted values for the pdf_converter setting; _get_pdf_converter() checks
# the normalized config value against this set.
_ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
def extract_outline(md_path: Path) -> list[dict]:
"""Extract document outline (headings) from a Markdown file.
@@ -122,14 +241,23 @@ def extract_outline(md_path: Path) -> list[dict]:
def _get_pdf_converter() -> str:
    """Read pdf_converter setting from app config, defaulting to 'auto'.

    Normalizes the value to lowercase and validates it against the allowed set
    so that values like 'AUTO' or 'MarkItDown' from config.yaml don't silently
    fall through to unexpected behaviour.

    Returns:
        One of the values in _ALLOWED_PDF_CONVERTERS. 'auto' is returned when
        the config cannot be loaded, has no ``uploads`` section, or holds an
        unrecognised value (the latter is logged as a warning).
    """
    try:
        from deerflow.config.app_config import get_app_config

        cfg = get_app_config()
        uploads_cfg = getattr(cfg, "uploads", None)
        if uploads_cfg is not None:
            # Normalize before validating; the legacy early return of the raw
            # value is gone, so this check is always reached.
            raw = str(getattr(uploads_cfg, "pdf_converter", "auto")).strip().lower()
            if raw not in _ALLOWED_PDF_CONVERTERS:
                logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", raw)
                return "auto"
            return raw
    except Exception:
        # Best-effort: a missing or broken app config must not block uploads.
        pass
    return "auto"
+3
View File
@@ -34,6 +34,9 @@ dependencies = [
"langgraph-sdk>=0.1.51",
]
[project.optional-dependencies]
pymupdf = ["pymupdf4llm>=0.0.17"]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"