mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-24 08:55:59 +00:00
feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload (#1727)
* feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload - Introduce pymupdf4llm as an optional PDF converter with better heading detection and table preservation than MarkItDown - Auto mode: prefer pymupdf4llm when installed; fall back to MarkItDown when output is suspiciously sparse (image-based / scanned PDFs) - Sparsity check uses chars-per-page (< 50 chars/page) rather than an absolute threshold, correctly handling both short and long documents - Large files (> 1 MB) are offloaded to asyncio.to_thread() to avoid blocking the event loop (related: #1569) - Add UploadsConfig with pdf_converter field (auto/pymupdf4llm/markitdown) - Add pymupdf4llm as optional dependency: pip install deerflow-harness[pymupdf] - Add 14 unit tests covering sparsity heuristic, routing logic, and async path * fix(uploads): address Copilot review comments on PDF converter - Fix docstring: MIN_CHARS_PYMUPDF -> _MIN_CHARS_PER_PAGE (typo) - Fix file handle leak: wrap pymupdf.open in try/finally to ensure doc.close() - Fix silent fallback gap: _convert_pdf_with_pymupdf4llm now catches all conversion exceptions (not just ImportError), so encrypted/corrupt PDFs fall back to MarkItDown instead of propagating - Tighten type: pdf_converter field changed from str to Literal[auto|pymupdf4llm|markitdown] - Normalize config value: _get_pdf_converter() strips and lowercases the raw config string, warns and falls back to 'auto' on unknown values
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Self
|
||||
from typing import Any, Literal, Self
|
||||
|
||||
import yaml
|
||||
from dotenv import load_dotenv
|
||||
@@ -28,11 +28,26 @@ load_dotenv()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UploadsConfig(BaseModel):
|
||||
"""Configuration for file upload handling."""
|
||||
|
||||
pdf_converter: Literal["auto", "pymupdf4llm", "markitdown"] = Field(
|
||||
default="auto",
|
||||
description=(
|
||||
"PDF-to-Markdown converter. "
|
||||
"'auto': prefer pymupdf4llm when installed, fall back to MarkItDown for image-based PDFs; "
|
||||
"'pymupdf4llm': always use pymupdf4llm (must be installed); "
|
||||
"'markitdown': always use MarkItDown (original behaviour)."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class AppConfig(BaseModel):
|
||||
"""Config for the DeerFlow application"""
|
||||
|
||||
log_level: str = Field(default="info", description="Logging level for deerflow modules (debug/info/warning/error)")
|
||||
token_usage: TokenUsageConfig = Field(default_factory=TokenUsageConfig, description="Token usage tracking configuration")
|
||||
uploads: UploadsConfig = Field(default_factory=UploadsConfig, description="File upload handling configuration")
|
||||
models: list[ModelConfig] = Field(default_factory=list, description="Available models")
|
||||
sandbox: SandboxConfig = Field(description="Sandbox configuration")
|
||||
tools: list[ToolConfig] = Field(default_factory=list, description="Available tools")
|
||||
|
||||
Reference in New Issue
Block a user