mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-22 07:56:48 +00:00
[security] fix(uploads): require explicit opt-in for host-side document conversion (#2332)
* fix: disable host-side upload conversion by default * fix: address PR review comments on upload conversion gate
This commit is contained in:
@@ -7,6 +7,7 @@ import stat
|
|||||||
from fastapi import APIRouter, File, HTTPException, UploadFile
|
from fastapi import APIRouter, File, HTTPException, UploadFile
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from deerflow.config.app_config import get_app_config
|
||||||
from deerflow.config.paths import get_paths
|
from deerflow.config.paths import get_paths
|
||||||
from deerflow.sandbox.sandbox_provider import SandboxProvider, get_sandbox_provider
|
from deerflow.sandbox.sandbox_provider import SandboxProvider, get_sandbox_provider
|
||||||
from deerflow.uploads.manager import (
|
from deerflow.uploads.manager import (
|
||||||
@@ -57,6 +58,30 @@ def _uses_thread_data_mounts(sandbox_provider: SandboxProvider) -> bool:
|
|||||||
return bool(getattr(sandbox_provider, "uses_thread_data_mounts", False))
|
return bool(getattr(sandbox_provider, "uses_thread_data_mounts", False))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_uploads_config_value(key: str, default: object) -> object:
|
||||||
|
"""Read a value from the uploads config, supporting dict and attribute access."""
|
||||||
|
cfg = get_app_config()
|
||||||
|
uploads_cfg = getattr(cfg, "uploads", None)
|
||||||
|
if isinstance(uploads_cfg, dict):
|
||||||
|
return uploads_cfg.get(key, default)
|
||||||
|
return getattr(uploads_cfg, key, default)
|
||||||
|
|
||||||
|
|
||||||
|
def _auto_convert_documents_enabled() -> bool:
|
||||||
|
"""Return whether automatic host-side document conversion is enabled.
|
||||||
|
|
||||||
|
The secure default is disabled unless an operator explicitly opts in via
|
||||||
|
uploads.auto_convert_documents in config.yaml.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
raw = _get_uploads_config_value("auto_convert_documents", False)
|
||||||
|
if isinstance(raw, str):
|
||||||
|
return raw.strip().lower() in {"1", "true", "yes", "on"}
|
||||||
|
return bool(raw)
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
@router.post("", response_model=UploadResponse)
|
@router.post("", response_model=UploadResponse)
|
||||||
async def upload_files(
|
async def upload_files(
|
||||||
thread_id: str,
|
thread_id: str,
|
||||||
@@ -79,6 +104,7 @@ async def upload_files(
|
|||||||
if sync_to_sandbox:
|
if sync_to_sandbox:
|
||||||
sandbox_id = sandbox_provider.acquire(thread_id)
|
sandbox_id = sandbox_provider.acquire(thread_id)
|
||||||
sandbox = sandbox_provider.get(sandbox_id)
|
sandbox = sandbox_provider.get(sandbox_id)
|
||||||
|
auto_convert_documents = _auto_convert_documents_enabled()
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
if not file.filename:
|
if not file.filename:
|
||||||
@@ -112,7 +138,7 @@ async def upload_files(
|
|||||||
logger.info(f"Saved file: {safe_filename} ({len(content)} bytes) to {file_info['path']}")
|
logger.info(f"Saved file: {safe_filename} ({len(content)} bytes) to {file_info['path']}")
|
||||||
|
|
||||||
file_ext = file_path.suffix.lower()
|
file_ext = file_path.suffix.lower()
|
||||||
if file_ext in CONVERTIBLE_EXTENSIONS:
|
if auto_convert_documents and file_ext in CONVERTIBLE_EXTENSIONS:
|
||||||
md_path = await convert_file_to_markdown(file_path)
|
md_path = await convert_file_to_markdown(file_path)
|
||||||
if md_path:
|
if md_path:
|
||||||
md_virtual_path = upload_virtual_path(md_path.name)
|
md_virtual_path = upload_virtual_path(md_path.name)
|
||||||
|
|||||||
@@ -2,12 +2,12 @@
|
|||||||
|
|
||||||
## 概述
|
## 概述
|
||||||
|
|
||||||
DeerFlow 后端提供了完整的文件上传功能,支持多文件上传,并自动将 Office 文档和 PDF 转换为 Markdown 格式。
|
DeerFlow 后端提供了完整的文件上传功能,支持多文件上传,并可选地将 Office 文档和 PDF 转换为 Markdown 格式。
|
||||||
|
|
||||||
## 功能特性
|
## 功能特性
|
||||||
|
|
||||||
- ✅ 支持多文件同时上传
|
- ✅ 支持多文件同时上传
|
||||||
- ✅ 自动转换文档为 Markdown(PDF、PPT、Excel、Word)
|
- ✅ 可选地转换文档为 Markdown(PDF、PPT、Excel、Word)
|
||||||
- ✅ 文件存储在线程隔离的目录中
|
- ✅ 文件存储在线程隔离的目录中
|
||||||
- ✅ Agent 自动感知已上传的文件
|
- ✅ Agent 自动感知已上传的文件
|
||||||
- ✅ 支持文件列表查询和删除
|
- ✅ 支持文件列表查询和删除
|
||||||
@@ -86,7 +86,7 @@ DELETE /api/threads/{thread_id}/uploads/{filename}
|
|||||||
|
|
||||||
## 支持的文档格式
|
## 支持的文档格式
|
||||||
|
|
||||||
以下格式会自动转换为 Markdown:
|
以下格式在显式启用 `uploads.auto_convert_documents: true` 时会自动转换为 Markdown:
|
||||||
- PDF (`.pdf`)
|
- PDF (`.pdf`)
|
||||||
- PowerPoint (`.ppt`, `.pptx`)
|
- PowerPoint (`.ppt`, `.pptx`)
|
||||||
- Excel (`.xls`, `.xlsx`)
|
- Excel (`.xls`, `.xlsx`)
|
||||||
@@ -94,6 +94,8 @@ DELETE /api/threads/{thread_id}/uploads/{filename}
|
|||||||
|
|
||||||
转换后的 Markdown 文件会保存在同一目录下,文件名为原文件名 + `.md` 扩展名。
|
转换后的 Markdown 文件会保存在同一目录下,文件名为原文件名 + `.md` 扩展名。
|
||||||
|
|
||||||
|
默认情况下,自动转换是关闭的,以避免在网关主机上对不受信任的 Office/PDF 上传执行解析。只有在受信任部署中明确接受此风险时,才应将 `uploads.auto_convert_documents` 设置为 `true`。
|
||||||
|
|
||||||
## Agent 集成
|
## Agent 集成
|
||||||
|
|
||||||
### 自动文件列举
|
### 自动文件列举
|
||||||
@@ -207,6 +209,7 @@ backend/.deer-flow/threads/
|
|||||||
- 最大文件大小:100MB(可在 nginx.conf 中配置 `client_max_body_size`)
|
- 最大文件大小:100MB(可在 nginx.conf 中配置 `client_max_body_size`)
|
||||||
- 文件名安全性:系统会自动验证文件路径,防止目录遍历攻击
|
- 文件名安全性:系统会自动验证文件路径,防止目录遍历攻击
|
||||||
- 线程隔离:每个线程的上传文件相互隔离,无法跨线程访问
|
- 线程隔离:每个线程的上传文件相互隔离,无法跨线程访问
|
||||||
|
- 自动文档转换默认关闭;如需启用,需在 `config.yaml` 中显式设置 `uploads.auto_convert_documents: true`
|
||||||
|
|
||||||
## 技术实现
|
## 技术实现
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ import logging
|
|||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from deerflow.config.app_config import get_app_config
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# File extensions that should be converted to markdown
|
# File extensions that should be converted to markdown
|
||||||
@@ -286,6 +288,15 @@ def extract_outline(md_path: Path) -> list[dict]:
|
|||||||
return outline
|
return outline
|
||||||
|
|
||||||
|
|
||||||
|
def _get_uploads_config_value(key: str, default: object) -> object:
|
||||||
|
"""Read a value from the uploads config, supporting dict and attribute access."""
|
||||||
|
cfg = get_app_config()
|
||||||
|
uploads_cfg = getattr(cfg, "uploads", None)
|
||||||
|
if isinstance(uploads_cfg, dict):
|
||||||
|
return uploads_cfg.get(key, default)
|
||||||
|
return getattr(uploads_cfg, key, default)
|
||||||
|
|
||||||
|
|
||||||
def _get_pdf_converter() -> str:
|
def _get_pdf_converter() -> str:
|
||||||
"""Read pdf_converter setting from app config, defaulting to 'auto'.
|
"""Read pdf_converter setting from app config, defaulting to 'auto'.
|
||||||
|
|
||||||
@@ -294,16 +305,11 @@ def _get_pdf_converter() -> str:
|
|||||||
fall through to unexpected behaviour.
|
fall through to unexpected behaviour.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
from deerflow.config.app_config import get_app_config
|
raw = str(_get_uploads_config_value("pdf_converter", "auto")).strip().lower()
|
||||||
|
if raw not in _ALLOWED_PDF_CONVERTERS:
|
||||||
cfg = get_app_config()
|
logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", raw)
|
||||||
uploads_cfg = getattr(cfg, "uploads", None)
|
return "auto"
|
||||||
if uploads_cfg is not None:
|
return raw
|
||||||
raw = str(getattr(uploads_cfg, "pdf_converter", "auto")).strip().lower()
|
|
||||||
if raw not in _ALLOWED_PDF_CONVERTERS:
|
|
||||||
logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", raw)
|
|
||||||
return "auto"
|
|
||||||
return raw
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return "auto"
|
return "auto"
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from deerflow.utils.file_conversion import (
|
|||||||
_MIN_CHARS_PER_PAGE,
|
_MIN_CHARS_PER_PAGE,
|
||||||
MAX_OUTLINE_ENTRIES,
|
MAX_OUTLINE_ENTRIES,
|
||||||
_do_convert,
|
_do_convert,
|
||||||
|
_get_pdf_converter,
|
||||||
_pymupdf_output_too_sparse,
|
_pymupdf_output_too_sparse,
|
||||||
convert_file_to_markdown,
|
convert_file_to_markdown,
|
||||||
extract_outline,
|
extract_outline,
|
||||||
@@ -214,9 +215,27 @@ class TestDoConvert:
|
|||||||
assert result == "MarkItDown fallback"
|
assert result == "MarkItDown fallback"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
class TestGetPdfConverter:
|
||||||
# convert_file_to_markdown — async + file writing
|
def test_reads_dict_backed_uploads_config(self):
|
||||||
# ---------------------------------------------------------------------------
|
cfg = MagicMock()
|
||||||
|
cfg.uploads = {"pdf_converter": "markitdown"}
|
||||||
|
|
||||||
|
with patch("deerflow.utils.file_conversion.get_app_config", return_value=cfg):
|
||||||
|
assert _get_pdf_converter() == "markitdown"
|
||||||
|
|
||||||
|
def test_reads_attribute_backed_uploads_config(self):
|
||||||
|
cfg = MagicMock()
|
||||||
|
cfg.uploads = MagicMock(pdf_converter="pymupdf4llm")
|
||||||
|
|
||||||
|
with patch("deerflow.utils.file_conversion.get_app_config", return_value=cfg):
|
||||||
|
assert _get_pdf_converter() == "pymupdf4llm"
|
||||||
|
|
||||||
|
def test_invalid_value_falls_back_to_auto(self):
|
||||||
|
cfg = MagicMock()
|
||||||
|
cfg.uploads = {"pdf_converter": "not-a-real-converter"}
|
||||||
|
|
||||||
|
with patch("deerflow.utils.file_conversion.get_app_config", return_value=cfg):
|
||||||
|
assert _get_pdf_converter() == "auto"
|
||||||
|
|
||||||
|
|
||||||
class TestConvertFileToMarkdown:
|
class TestConvertFileToMarkdown:
|
||||||
|
|||||||
@@ -56,6 +56,34 @@ def test_upload_files_skips_acquire_when_thread_data_is_mounted(tmp_path):
|
|||||||
provider.get.assert_not_called()
|
provider.get.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_upload_files_does_not_auto_convert_documents_by_default(tmp_path):
|
||||||
|
thread_uploads_dir = tmp_path / "uploads"
|
||||||
|
thread_uploads_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
provider = MagicMock()
|
||||||
|
provider.uses_thread_data_mounts = True
|
||||||
|
provider.acquire.return_value = "local"
|
||||||
|
sandbox = MagicMock()
|
||||||
|
provider.get.return_value = sandbox
|
||||||
|
|
||||||
|
with (
|
||||||
|
patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir),
|
||||||
|
patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir),
|
||||||
|
patch.object(uploads, "get_sandbox_provider", return_value=provider),
|
||||||
|
patch.object(uploads, "_auto_convert_documents_enabled", return_value=False),
|
||||||
|
patch.object(uploads, "convert_file_to_markdown", AsyncMock()) as convert_mock,
|
||||||
|
):
|
||||||
|
file = UploadFile(filename="report.pdf", file=BytesIO(b"pdf-bytes"))
|
||||||
|
result = asyncio.run(uploads.upload_files("thread-local", files=[file]))
|
||||||
|
|
||||||
|
assert result.success is True
|
||||||
|
assert len(result.files) == 1
|
||||||
|
assert result.files[0]["filename"] == "report.pdf"
|
||||||
|
assert "markdown_file" not in result.files[0]
|
||||||
|
convert_mock.assert_not_called()
|
||||||
|
assert not (thread_uploads_dir / "report.md").exists()
|
||||||
|
|
||||||
|
|
||||||
def test_upload_files_syncs_non_local_sandbox_and_marks_markdown_file(tmp_path):
|
def test_upload_files_syncs_non_local_sandbox_and_marks_markdown_file(tmp_path):
|
||||||
thread_uploads_dir = tmp_path / "uploads"
|
thread_uploads_dir = tmp_path / "uploads"
|
||||||
thread_uploads_dir.mkdir(parents=True)
|
thread_uploads_dir.mkdir(parents=True)
|
||||||
@@ -75,6 +103,7 @@ def test_upload_files_syncs_non_local_sandbox_and_marks_markdown_file(tmp_path):
|
|||||||
patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir),
|
patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir),
|
||||||
patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir),
|
patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir),
|
||||||
patch.object(uploads, "get_sandbox_provider", return_value=provider),
|
patch.object(uploads, "get_sandbox_provider", return_value=provider),
|
||||||
|
patch.object(uploads, "_auto_convert_documents_enabled", return_value=True),
|
||||||
patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)),
|
patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)),
|
||||||
):
|
):
|
||||||
file = UploadFile(filename="report.pdf", file=BytesIO(b"pdf-bytes"))
|
file = UploadFile(filename="report.pdf", file=BytesIO(b"pdf-bytes"))
|
||||||
@@ -112,6 +141,7 @@ def test_upload_files_makes_non_local_files_sandbox_writable(tmp_path):
|
|||||||
patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir),
|
patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir),
|
||||||
patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir),
|
patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir),
|
||||||
patch.object(uploads, "get_sandbox_provider", return_value=provider),
|
patch.object(uploads, "get_sandbox_provider", return_value=provider),
|
||||||
|
patch.object(uploads, "_auto_convert_documents_enabled", return_value=True),
|
||||||
patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)),
|
patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)),
|
||||||
patch.object(uploads, "_make_file_sandbox_writable") as make_writable,
|
patch.object(uploads, "_make_file_sandbox_writable") as make_writable,
|
||||||
):
|
):
|
||||||
@@ -218,3 +248,39 @@ def test_delete_uploaded_file_removes_generated_markdown_companion(tmp_path):
|
|||||||
assert result == {"success": True, "message": "Deleted report.pdf"}
|
assert result == {"success": True, "message": "Deleted report.pdf"}
|
||||||
assert not (thread_uploads_dir / "report.pdf").exists()
|
assert not (thread_uploads_dir / "report.pdf").exists()
|
||||||
assert not (thread_uploads_dir / "report.md").exists()
|
assert not (thread_uploads_dir / "report.md").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_convert_documents_enabled_defaults_to_false_on_config_errors():
|
||||||
|
with patch.object(uploads, "get_app_config", side_effect=RuntimeError("boom")):
|
||||||
|
assert uploads._auto_convert_documents_enabled() is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_convert_documents_enabled_reads_dict_backed_uploads_config():
|
||||||
|
cfg = MagicMock()
|
||||||
|
cfg.uploads = {"auto_convert_documents": True}
|
||||||
|
|
||||||
|
with patch.object(uploads, "get_app_config", return_value=cfg):
|
||||||
|
assert uploads._auto_convert_documents_enabled() is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_convert_documents_enabled_accepts_boolean_and_string_truthy_values():
|
||||||
|
false_cfg = MagicMock()
|
||||||
|
false_cfg.uploads = MagicMock(auto_convert_documents=False)
|
||||||
|
|
||||||
|
true_cfg = MagicMock()
|
||||||
|
true_cfg.uploads = MagicMock(auto_convert_documents=True)
|
||||||
|
|
||||||
|
string_true_cfg = MagicMock()
|
||||||
|
string_true_cfg.uploads = MagicMock(auto_convert_documents="YES")
|
||||||
|
|
||||||
|
string_false_cfg = MagicMock()
|
||||||
|
string_false_cfg.uploads = MagicMock(auto_convert_documents="false")
|
||||||
|
|
||||||
|
with patch.object(uploads, "get_app_config", return_value=false_cfg):
|
||||||
|
assert uploads._auto_convert_documents_enabled() is False
|
||||||
|
with patch.object(uploads, "get_app_config", return_value=true_cfg):
|
||||||
|
assert uploads._auto_convert_documents_enabled() is True
|
||||||
|
with patch.object(uploads, "get_app_config", return_value=string_true_cfg):
|
||||||
|
assert uploads._auto_convert_documents_enabled() is True
|
||||||
|
with patch.object(uploads, "get_app_config", return_value=string_false_cfg):
|
||||||
|
assert uploads._auto_convert_documents_enabled() is False
|
||||||
|
|||||||
+7
-1
@@ -480,7 +480,13 @@ tool_search:
|
|||||||
# Option 1: Local Sandbox (Default)
|
# Option 1: Local Sandbox (Default)
|
||||||
# Executes commands directly on the host machine
|
# Executes commands directly on the host machine
|
||||||
uploads:
|
uploads:
|
||||||
# PDF-to-Markdown converter used when a PDF is uploaded.
|
# Automatic Office/PDF conversion runs on the backend host before sandbox
|
||||||
|
# isolation applies. Keep this disabled unless uploads come from a fully
|
||||||
|
# trusted source and you intentionally accept host-side parser risk.
|
||||||
|
auto_convert_documents: false
|
||||||
|
# Controls which PDF-to-Markdown converter is used whenever PDF conversion
|
||||||
|
# runs. Automatic upload conversion is gated separately by
|
||||||
|
# auto_convert_documents.
|
||||||
# auto — prefer pymupdf4llm when installed; fall back to MarkItDown for
|
# auto — prefer pymupdf4llm when installed; fall back to MarkItDown for
|
||||||
# image-based or encrypted PDFs (recommended default).
|
# image-based or encrypted PDFs (recommended default).
|
||||||
# pymupdf4llm — always use pymupdf4llm (must be installed: uv add pymupdf4llm).
|
# pymupdf4llm — always use pymupdf4llm (must be installed: uv add pymupdf4llm).
|
||||||
|
|||||||
Reference in New Issue
Block a user