diff --git a/backend/app/gateway/routers/uploads.py b/backend/app/gateway/routers/uploads.py index d03639c74..6f8be52a1 100644 --- a/backend/app/gateway/routers/uploads.py +++ b/backend/app/gateway/routers/uploads.py @@ -7,6 +7,7 @@ import stat from fastapi import APIRouter, File, HTTPException, UploadFile from pydantic import BaseModel +from deerflow.config.app_config import get_app_config from deerflow.config.paths import get_paths from deerflow.sandbox.sandbox_provider import SandboxProvider, get_sandbox_provider from deerflow.uploads.manager import ( @@ -57,6 +58,30 @@ def _uses_thread_data_mounts(sandbox_provider: SandboxProvider) -> bool: return bool(getattr(sandbox_provider, "uses_thread_data_mounts", False)) +def _get_uploads_config_value(key: str, default: object) -> object: + """Read a value from the uploads config, supporting dict and attribute access.""" + cfg = get_app_config() + uploads_cfg = getattr(cfg, "uploads", None) + if isinstance(uploads_cfg, dict): + return uploads_cfg.get(key, default) + return getattr(uploads_cfg, key, default) + + +def _auto_convert_documents_enabled() -> bool: + """Return whether automatic host-side document conversion is enabled. + + The secure default is disabled unless an operator explicitly opts in via + uploads.auto_convert_documents in config.yaml. + """ + try: + raw = _get_uploads_config_value("auto_convert_documents", False) + if isinstance(raw, str): + return raw.strip().lower() in {"1", "true", "yes", "on"} + return bool(raw) + except Exception: + return False + + @router.post("", response_model=UploadResponse) async def upload_files( thread_id: str, @@ -79,6 +104,7 @@ async def upload_files( if sync_to_sandbox: sandbox_id = sandbox_provider.acquire(thread_id) sandbox = sandbox_provider.get(sandbox_id) + auto_convert_documents = _auto_convert_documents_enabled() for file in files: if not file.filename: @@ -112,7 +138,7 @@ async def upload_files( logger.info(f"Saved file: {safe_filename} ({len(content)} bytes) to {file_info['path']}") file_ext = file_path.suffix.lower() - if file_ext in CONVERTIBLE_EXTENSIONS: + if auto_convert_documents and file_ext in CONVERTIBLE_EXTENSIONS: md_path = await convert_file_to_markdown(file_path) if md_path: md_virtual_path = upload_virtual_path(md_path.name) diff --git a/backend/docs/FILE_UPLOAD.md b/backend/docs/FILE_UPLOAD.md index f19a9d7a5..bf6962e4f 100644 --- a/backend/docs/FILE_UPLOAD.md +++ b/backend/docs/FILE_UPLOAD.md @@ -2,12 +2,12 @@ ## 概述 -DeerFlow 后端提供了完整的文件上传功能,支持多文件上传,并自动将 Office 文档和 PDF 转换为 Markdown 格式。 +DeerFlow 后端提供了完整的文件上传功能,支持多文件上传,并可选地将 Office 文档和 PDF 转换为 Markdown 格式。 ## 功能特性 - ✅ 支持多文件同时上传 -- ✅ 自动转换文档为 Markdown(PDF、PPT、Excel、Word) +- ✅ 可选地转换文档为 Markdown(PDF、PPT、Excel、Word) - ✅ 文件存储在线程隔离的目录中 - ✅ Agent 自动感知已上传的文件 - ✅ 支持文件列表查询和删除 @@ -86,7 +86,7 @@ DELETE /api/threads/{thread_id}/uploads/{filename} ## 支持的文档格式 -以下格式会自动转换为 Markdown: +以下格式在显式启用 `uploads.auto_convert_documents: true` 时会自动转换为 Markdown: - PDF (`.pdf`) - PowerPoint (`.ppt`, `.pptx`) - Excel (`.xls`, `.xlsx`) @@ -94,6 +94,8 @@ DELETE /api/threads/{thread_id}/uploads/{filename} 转换后的 Markdown 文件会保存在同一目录下,文件名为原文件名 + `.md` 扩展名。 +默认情况下,自动转换是关闭的,以避免在网关主机上对不受信任的 Office/PDF 上传执行解析。只有在受信任部署中明确接受此风险时,才应将 `uploads.auto_convert_documents` 设置为 `true`。 + ## Agent 集成 ### 自动文件列举 @@ -207,6 +209,7 @@ backend/.deer-flow/threads/ - 最大文件大小:100MB(可在 nginx.conf 中配置 `client_max_body_size`) - 文件名安全性:系统会自动验证文件路径,防止目录遍历攻击 - 线程隔离:每个线程的上传文件相互隔离,无法跨线程访问 +- 自动文档转换默认关闭;如需启用,需在 `config.yaml` 中显式设置 `uploads.auto_convert_documents: true` ## 技术实现 diff --git a/backend/packages/harness/deerflow/utils/file_conversion.py b/backend/packages/harness/deerflow/utils/file_conversion.py index 68755b675..f51b47caa 100644 --- a/backend/packages/harness/deerflow/utils/file_conversion.py +++ b/backend/packages/harness/deerflow/utils/file_conversion.py @@ -19,6 +19,8 @@ import logging import re from pathlib import Path +from deerflow.config.app_config import get_app_config + logger = logging.getLogger(__name__) # File extensions that should be converted to markdown @@ -286,6 +288,15 @@ def extract_outline(md_path: Path) -> list[dict]: return outline +def _get_uploads_config_value(key: str, default: object) -> object: + """Read a value from the uploads config, supporting dict and attribute access.""" + cfg = get_app_config() + uploads_cfg = getattr(cfg, "uploads", None) + if isinstance(uploads_cfg, dict): + return uploads_cfg.get(key, default) + return getattr(uploads_cfg, key, default) + + def _get_pdf_converter() -> str: """Read pdf_converter setting from app config, defaulting to 'auto'. @@ -294,16 +305,11 @@ def _get_pdf_converter() -> str: fall through to unexpected behaviour. """ try: - from deerflow.config.app_config import get_app_config - - cfg = get_app_config() - uploads_cfg = getattr(cfg, "uploads", None) - if uploads_cfg is not None: - raw = str(getattr(uploads_cfg, "pdf_converter", "auto")).strip().lower() - if raw not in _ALLOWED_PDF_CONVERTERS: - logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", raw) - return "auto" - return raw + raw = str(_get_uploads_config_value("pdf_converter", "auto")).strip().lower() + if raw not in _ALLOWED_PDF_CONVERTERS: + logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", raw) + return "auto" + return raw except Exception: pass return "auto" diff --git a/backend/tests/test_file_conversion.py b/backend/tests/test_file_conversion.py index 42abd3b4a..4c2189bc2 100644 --- a/backend/tests/test_file_conversion.py +++ b/backend/tests/test_file_conversion.py @@ -12,6 +12,7 @@ from deerflow.utils.file_conversion import ( _MIN_CHARS_PER_PAGE, MAX_OUTLINE_ENTRIES, _do_convert, + _get_pdf_converter, _pymupdf_output_too_sparse, convert_file_to_markdown, extract_outline, @@ -214,9 +215,27 @@ class TestDoConvert: assert result == "MarkItDown fallback" -# --------------------------------------------------------------------------- -# convert_file_to_markdown — async + file writing -# --------------------------------------------------------------------------- +class TestGetPdfConverter: + def test_reads_dict_backed_uploads_config(self): + cfg = MagicMock() + cfg.uploads = {"pdf_converter": "markitdown"} + + with patch("deerflow.utils.file_conversion.get_app_config", return_value=cfg): + assert _get_pdf_converter() == "markitdown" + + def test_reads_attribute_backed_uploads_config(self): + cfg = MagicMock() + cfg.uploads = MagicMock(pdf_converter="pymupdf4llm") + + with patch("deerflow.utils.file_conversion.get_app_config", return_value=cfg): + assert _get_pdf_converter() == "pymupdf4llm" + + def test_invalid_value_falls_back_to_auto(self): + cfg = MagicMock() + cfg.uploads = {"pdf_converter": "not-a-real-converter"} + + with patch("deerflow.utils.file_conversion.get_app_config", return_value=cfg): + assert _get_pdf_converter() == "auto" class TestConvertFileToMarkdown: diff --git a/backend/tests/test_uploads_router.py b/backend/tests/test_uploads_router.py index cc7090a01..f305b998f 100644 --- a/backend/tests/test_uploads_router.py +++ b/backend/tests/test_uploads_router.py @@ -56,6 +56,34 @@ def test_upload_files_skips_acquire_when_thread_data_is_mounted(tmp_path): provider.get.assert_not_called() +def test_upload_files_does_not_auto_convert_documents_by_default(tmp_path): + thread_uploads_dir = tmp_path / "uploads" + thread_uploads_dir.mkdir(parents=True) + + provider = MagicMock() + provider.uses_thread_data_mounts = True + provider.acquire.return_value = "local" + sandbox = MagicMock() + provider.get.return_value = sandbox + + with ( + patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir), + patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir), + patch.object(uploads, "get_sandbox_provider", return_value=provider), + patch.object(uploads, "_auto_convert_documents_enabled", return_value=False), + patch.object(uploads, "convert_file_to_markdown", AsyncMock()) as convert_mock, + ): + file = UploadFile(filename="report.pdf", file=BytesIO(b"pdf-bytes")) + result = asyncio.run(uploads.upload_files("thread-local", files=[file])) + + assert result.success is True + assert len(result.files) == 1 + assert result.files[0]["filename"] == "report.pdf" + assert "markdown_file" not in result.files[0] + convert_mock.assert_not_called() + assert not (thread_uploads_dir / "report.md").exists() + + def test_upload_files_syncs_non_local_sandbox_and_marks_markdown_file(tmp_path): thread_uploads_dir = tmp_path / "uploads" thread_uploads_dir.mkdir(parents=True) @@ -75,6 +103,7 @@ def test_upload_files_syncs_non_local_sandbox_and_marks_markdown_file(tmp_path): patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir), patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir), patch.object(uploads, "get_sandbox_provider", return_value=provider), + patch.object(uploads, "_auto_convert_documents_enabled", return_value=True), patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)), ): file = UploadFile(filename="report.pdf", file=BytesIO(b"pdf-bytes")) @@ -112,6 +141,7 @@ def test_upload_files_makes_non_local_files_sandbox_writable(tmp_path): patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir), patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir), patch.object(uploads, "get_sandbox_provider", return_value=provider), + patch.object(uploads, "_auto_convert_documents_enabled", return_value=True), patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)), patch.object(uploads, "_make_file_sandbox_writable") as make_writable, ): @@ -218,3 +248,39 @@ def test_delete_uploaded_file_removes_generated_markdown_companion(tmp_path): assert result == {"success": True, "message": "Deleted report.pdf"} assert not (thread_uploads_dir / "report.pdf").exists() assert not (thread_uploads_dir / "report.md").exists() + + +def test_auto_convert_documents_enabled_defaults_to_false_on_config_errors(): + with patch.object(uploads, "get_app_config", side_effect=RuntimeError("boom")): + assert uploads._auto_convert_documents_enabled() is False + + +def test_auto_convert_documents_enabled_reads_dict_backed_uploads_config(): + cfg = MagicMock() + cfg.uploads = {"auto_convert_documents": True} + + with patch.object(uploads, "get_app_config", return_value=cfg): + assert uploads._auto_convert_documents_enabled() is True + + +def test_auto_convert_documents_enabled_accepts_boolean_and_string_truthy_values(): + false_cfg = MagicMock() + false_cfg.uploads = MagicMock(auto_convert_documents=False) + + true_cfg = MagicMock() + true_cfg.uploads = MagicMock(auto_convert_documents=True) + + string_true_cfg = MagicMock() + string_true_cfg.uploads = MagicMock(auto_convert_documents="YES") + + string_false_cfg = MagicMock() + string_false_cfg.uploads = MagicMock(auto_convert_documents="false") + + with patch.object(uploads, "get_app_config", return_value=false_cfg): + assert uploads._auto_convert_documents_enabled() is False + with patch.object(uploads, "get_app_config", return_value=true_cfg): + assert uploads._auto_convert_documents_enabled() is True + with patch.object(uploads, "get_app_config", return_value=string_true_cfg): + assert uploads._auto_convert_documents_enabled() is True + with patch.object(uploads, "get_app_config", return_value=string_false_cfg): + assert uploads._auto_convert_documents_enabled() is False diff --git a/config.example.yaml b/config.example.yaml index dbaa0e160..fc3d9c8c1 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -480,7 +480,13 @@ tool_search: # Option 1: Local Sandbox (Default) # Executes commands directly on the host machine uploads: - # PDF-to-Markdown converter used when a PDF is uploaded. + # Automatic Office/PDF conversion runs on the backend host before sandbox + # isolation applies. Keep this disabled unless uploads come from a fully + # trusted source and you intentionally accept host-side parser risk. + auto_convert_documents: false + # Controls which PDF-to-Markdown converter is used whenever PDF conversion + # runs. Automatic upload conversion is gated separately by + # auto_convert_documents. # auto — prefer pymupdf4llm when installed; fall back to MarkItDown for # image-based or encrypted PDFs (recommended default). # pymupdf4llm — always use pymupdf4llm (must be installed: uv add pymupdf4llm).