[security] fix(uploads): require explicit opt-in for host-side document conversion (#2332)

* fix: disable host-side upload conversion by default

* fix: address PR review comments on upload conversion gate
This commit is contained in:
Hinotobi
2026-04-18 22:47:42 +08:00
committed by GitHub
parent 5656f90792
commit 80e210f5bb
6 changed files with 144 additions and 18 deletions
+22 -3
View File
@@ -12,6 +12,7 @@ from deerflow.utils.file_conversion import (
_MIN_CHARS_PER_PAGE,
MAX_OUTLINE_ENTRIES,
_do_convert,
_get_pdf_converter,
_pymupdf_output_too_sparse,
convert_file_to_markdown,
extract_outline,
@@ -214,9 +215,27 @@ class TestDoConvert:
assert result == "MarkItDown fallback"
# ---------------------------------------------------------------------------
# convert_file_to_markdown — async + file writing
# ---------------------------------------------------------------------------
class TestGetPdfConverter:
    """Checks how ``_get_pdf_converter`` reads the ``uploads`` config section."""

    def test_reads_dict_backed_uploads_config(self):
        """A dict-shaped ``uploads`` section supplies the converter name."""
        app_config = MagicMock()
        app_config.uploads = {"pdf_converter": "markitdown"}
        config_patch = patch("deerflow.utils.file_conversion.get_app_config", return_value=app_config)
        with config_patch:
            selected = _get_pdf_converter()
        assert selected == "markitdown"

    def test_reads_attribute_backed_uploads_config(self):
        """An object-shaped ``uploads`` section supplies the converter via attribute."""
        app_config = MagicMock()
        app_config.uploads = MagicMock(pdf_converter="pymupdf4llm")
        config_patch = patch("deerflow.utils.file_conversion.get_app_config", return_value=app_config)
        with config_patch:
            selected = _get_pdf_converter()
        assert selected == "pymupdf4llm"

    def test_invalid_value_falls_back_to_auto(self):
        """An unrecognised converter name is expected to resolve to ``"auto"``."""
        app_config = MagicMock()
        app_config.uploads = {"pdf_converter": "not-a-real-converter"}
        config_patch = patch("deerflow.utils.file_conversion.get_app_config", return_value=app_config)
        with config_patch:
            selected = _get_pdf_converter()
        assert selected == "auto"
class TestConvertFileToMarkdown:
+66
View File
@@ -56,6 +56,34 @@ def test_upload_files_skips_acquire_when_thread_data_is_mounted(tmp_path):
provider.get.assert_not_called()
def test_upload_files_does_not_auto_convert_documents_by_default(tmp_path):
    """Upload a PDF while the auto-convert gate reports False.

    The upload must succeed, the converter must never be invoked, and no
    markdown companion may be recorded in the response or written to disk.
    """
    uploads_root = tmp_path / "uploads"
    uploads_root.mkdir(parents=True)

    # Provider stub configured as a thread-data-mounted, local sandbox.
    sandbox = MagicMock()
    provider = MagicMock()
    provider.uses_thread_data_mounts = True
    provider.acquire.return_value = "local"
    provider.get.return_value = sandbox

    with (
        patch.object(uploads, "get_uploads_dir", return_value=uploads_root),
        patch.object(uploads, "ensure_uploads_dir", return_value=uploads_root),
        patch.object(uploads, "get_sandbox_provider", return_value=provider),
        patch.object(uploads, "_auto_convert_documents_enabled", return_value=False),
        patch.object(uploads, "convert_file_to_markdown", AsyncMock()) as convert_spy,
    ):
        pdf_upload = UploadFile(filename="report.pdf", file=BytesIO(b"pdf-bytes"))
        response = asyncio.run(uploads.upload_files("thread-local", files=[pdf_upload]))

    assert response.success is True
    assert len(response.files) == 1

    uploaded_entry = response.files[0]
    assert uploaded_entry["filename"] == "report.pdf"
    assert "markdown_file" not in uploaded_entry

    convert_spy.assert_not_called()
    markdown_companion = uploads_root / "report.md"
    assert not markdown_companion.exists()
def test_upload_files_syncs_non_local_sandbox_and_marks_markdown_file(tmp_path):
thread_uploads_dir = tmp_path / "uploads"
thread_uploads_dir.mkdir(parents=True)
@@ -75,6 +103,7 @@ def test_upload_files_syncs_non_local_sandbox_and_marks_markdown_file(tmp_path):
patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir),
patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir),
patch.object(uploads, "get_sandbox_provider", return_value=provider),
patch.object(uploads, "_auto_convert_documents_enabled", return_value=True),
patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)),
):
file = UploadFile(filename="report.pdf", file=BytesIO(b"pdf-bytes"))
@@ -112,6 +141,7 @@ def test_upload_files_makes_non_local_files_sandbox_writable(tmp_path):
patch.object(uploads, "get_uploads_dir", return_value=thread_uploads_dir),
patch.object(uploads, "ensure_uploads_dir", return_value=thread_uploads_dir),
patch.object(uploads, "get_sandbox_provider", return_value=provider),
patch.object(uploads, "_auto_convert_documents_enabled", return_value=True),
patch.object(uploads, "convert_file_to_markdown", AsyncMock(side_effect=fake_convert)),
patch.object(uploads, "_make_file_sandbox_writable") as make_writable,
):
@@ -218,3 +248,39 @@ def test_delete_uploaded_file_removes_generated_markdown_companion(tmp_path):
assert result == {"success": True, "message": "Deleted report.pdf"}
assert not (thread_uploads_dir / "report.pdf").exists()
assert not (thread_uploads_dir / "report.md").exists()
def test_auto_convert_documents_enabled_defaults_to_false_on_config_errors():
    """When loading the app config raises, the gate must report False."""
    failing_loader = patch.object(uploads, "get_app_config", side_effect=RuntimeError("boom"))
    with failing_loader:
        enabled = uploads._auto_convert_documents_enabled()
    assert enabled is False
def test_auto_convert_documents_enabled_reads_dict_backed_uploads_config():
    """A dict-shaped ``uploads`` section with the flag set to True enables the gate."""
    app_config = MagicMock()
    app_config.uploads = {"auto_convert_documents": True}
    config_patch = patch.object(uploads, "get_app_config", return_value=app_config)
    with config_patch:
        enabled = uploads._auto_convert_documents_enabled()
    assert enabled is True
def test_auto_convert_documents_enabled_accepts_boolean_and_string_truthy_values():
    """Exercise the gate with boolean and string flag values on an attribute-backed config."""
    # (configured flag value, expected gate result), checked in this order.
    expectations = (
        (False, False),
        (True, True),
        ("YES", True),
        ("false", False),
    )

    for raw_value, expected in expectations:
        app_config = MagicMock()
        app_config.uploads = MagicMock(auto_convert_documents=raw_value)
        with patch.object(uploads, "get_app_config", return_value=app_config):
            assert uploads._auto_convert_documents_enabled() is expected