diff --git a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py index 97da4144d..cdc8e1b77 100644 --- a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py +++ b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py @@ -1,4 +1,5 @@ import base64 +import errno import logging import shlex import threading @@ -6,11 +7,14 @@ import uuid from agent_sandbox import Sandbox as AioSandboxClient +from deerflow.config.paths import VIRTUAL_PATH_PREFIX from deerflow.sandbox.sandbox import Sandbox from deerflow.sandbox.search import GrepMatch, path_matches, should_ignore_path, truncate_line logger = logging.getLogger(__name__) +_MAX_DOWNLOAD_SIZE = 100 * 1024 * 1024 # 100 MB + _ERROR_OBSERVATION_SIGNATURE = "'ErrorObservation' object has no attribute 'exit_code'" @@ -102,6 +106,49 @@ class AioSandbox(Sandbox): logger.error(f"Failed to read file in sandbox: {e}") return f"Error: {e}" + def download_file(self, path: str) -> bytes: + """Download file bytes from the sandbox. + + Raises: + PermissionError: If the path contains '..' traversal segments or is + outside ``VIRTUAL_PATH_PREFIX``. + OSError: If the file cannot be retrieved from the sandbox. + """ + # Reject path traversal before sending to the container API. + # LocalSandbox gets this implicitly via _resolve_path; + # here the path is forwarded verbatim so we must check explicitly. + normalised = path.replace("\\", "/") + for segment in normalised.split("/"): + if segment == "..": + logger.error(f"Refused download due to path traversal: {path}") + raise PermissionError(f"Access denied: path traversal detected in '{path}'") + + stripped_path = normalised.lstrip("/") + allowed_prefix = VIRTUAL_PATH_PREFIX.lstrip("/") + if stripped_path != allowed_prefix and not stripped_path.startswith(f"{allowed_prefix}/"): + logger.error("Refused download outside allowed directory: path=%s, allowed_prefix=%s", path, VIRTUAL_PATH_PREFIX) + raise PermissionError(f"Access denied: path must be under '{VIRTUAL_PATH_PREFIX}': '{path}'") + + with self._lock: + try: + chunks: list[bytes] = [] + total = 0 + for chunk in self._client.file.download_file(path=path): + total += len(chunk) + if total > _MAX_DOWNLOAD_SIZE: + raise OSError( + errno.EFBIG, + f"File exceeds maximum download size of {_MAX_DOWNLOAD_SIZE} bytes", + path, + ) + chunks.append(chunk) + return b"".join(chunks) + except OSError: + raise + except Exception as e: + logger.error(f"Failed to download file in sandbox: {e}") + raise OSError(f"Failed to download file '{path}' from sandbox: {e}") from e + def list_dir(self, path: str, max_depth: int = 2) -> list[str]: """List the contents of a directory in the sandbox. diff --git a/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py b/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py index 62577abb9..0d7682733 100644 --- a/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py +++ b/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py @@ -1,4 +1,5 @@ import errno +import logging import ntpath import os import shutil @@ -7,10 +8,13 @@ from dataclasses import dataclass from pathlib import Path from typing import NamedTuple +from deerflow.config.paths import VIRTUAL_PATH_PREFIX from deerflow.sandbox.local.list_dir import list_dir from deerflow.sandbox.sandbox import Sandbox from deerflow.sandbox.search import GrepMatch, find_glob_matches, find_grep_matches +logger = logging.getLogger(__name__) + @dataclass(frozen=True) class PathMapping: @@ -379,6 +383,28 @@ class LocalSandbox(Sandbox): # Re-raise with the original path for clearer error messages, hiding internal resolved paths raise type(e)(e.errno, e.strerror, path) from None + def download_file(self, path: str) -> bytes: + normalised = path.replace("\\", "/") + stripped_path = normalised.lstrip("/") + allowed_prefix = VIRTUAL_PATH_PREFIX.lstrip("/") + if stripped_path != allowed_prefix and not stripped_path.startswith(f"{allowed_prefix}/"): + logger.error("Refused download outside allowed directory: path=%s, allowed_prefix=%s", path, VIRTUAL_PATH_PREFIX) + raise PermissionError(errno.EACCES, f"Access denied: path must be under '{VIRTUAL_PATH_PREFIX}'", path) + + resolved_path = self._resolve_path(path) + max_download_size = 100 * 1024 * 1024 + try: + file_size = os.path.getsize(resolved_path) + if file_size > max_download_size: + raise OSError(errno.EFBIG, f"File exceeds maximum download size of {max_download_size} bytes", path) + # TOCTOU note: the file could grow between getsize() and read(); accepted + # tradeoff since this is a controlled sandbox environment. + with open(resolved_path, "rb") as f: + return f.read() + except OSError as e: + # Re-raise with the original path for clearer error messages, hiding internal resolved paths + raise type(e)(e.errno, e.strerror, path) from None + def write_file(self, path: str, content: str, append: bool = False) -> None: resolved = self._resolve_path_with_mapping(path) resolved_path = resolved.path diff --git a/backend/packages/harness/deerflow/sandbox/sandbox.py b/backend/packages/harness/deerflow/sandbox/sandbox.py index dc567b503..50322f419 100644 --- a/backend/packages/harness/deerflow/sandbox/sandbox.py +++ b/backend/packages/harness/deerflow/sandbox/sandbox.py @@ -39,6 +39,25 @@ class Sandbox(ABC): """ pass + @abstractmethod + def download_file(self, path: str) -> bytes: + """Download the binary content of a file. + + Args: + path: The absolute path of the file to download. + + Returns: + Raw file bytes. + + Raises: + PermissionError: If path traversal is detected or the path is outside + the allowed virtual prefix. + OSError: If the file cannot be read or does not exist. Both local + and remote implementations must raise ``OSError`` so callers + have a single exception type to handle. + """ + pass + @abstractmethod def list_dir(self, path: str, max_depth=2) -> list[str]: """List the contents of a directory. diff --git a/backend/tests/test_aio_sandbox.py b/backend/tests/test_aio_sandbox.py index c6acb46eb..3b0a44f05 100644 --- a/backend/tests/test_aio_sandbox.py +++ b/backend/tests/test_aio_sandbox.py @@ -233,3 +233,88 @@ class TestConcurrentFileWrites: thread.join() assert storage["content"] in {"seed\nA\nB\n", "seed\nB\nA\n"} + + +class TestDownloadFile: + """Tests for AioSandbox.download_file.""" + + def test_returns_concatenated_bytes(self, sandbox): + """download_file should join chunks from the client iterator into bytes.""" + sandbox._client.file.download_file = MagicMock(return_value=[b"hel", b"lo"]) + + result = sandbox.download_file("/mnt/user-data/outputs/file.bin") + + assert result == b"hello" + sandbox._client.file.download_file.assert_called_once_with(path="/mnt/user-data/outputs/file.bin") + + def test_returns_empty_bytes_for_empty_file(self, sandbox): + """download_file should return b'' when the iterator yields nothing.""" + sandbox._client.file.download_file = MagicMock(return_value=iter([])) + + result = sandbox.download_file("/mnt/user-data/outputs/empty.bin") + + assert result == b"" + + def test_uses_lock_during_download(self, sandbox): + """download_file should hold the lock while calling the client.""" + lock_was_held = [] + + def tracking_download(path): + lock_was_held.append(sandbox._lock.locked()) + return iter([b"data"]) + + sandbox._client.file.download_file = tracking_download + + sandbox.download_file("/mnt/user-data/outputs/file.bin") + + assert lock_was_held == [True], "download_file must hold the lock during client call" + + def test_raises_oserror_on_client_error(self, sandbox): + """download_file should wrap client exceptions as OSError.""" + sandbox._client.file.download_file = MagicMock(side_effect=RuntimeError("network error")) + + with pytest.raises(OSError, match="network error"): + sandbox.download_file("/mnt/user-data/outputs/file.bin") + + def test_preserves_oserror_from_client(self, sandbox): + """OSError raised by the client should propagate without re-wrapping.""" + sandbox._client.file.download_file = MagicMock(side_effect=OSError("disk error")) + + with pytest.raises(OSError, match="disk error"): + sandbox.download_file("/mnt/user-data/outputs/file.bin") + + def test_rejects_path_outside_virtual_prefix_and_logs_error(self, sandbox, caplog): + """download_file must reject downloads outside /mnt/user-data and log the reason.""" + sandbox._client.file.download_file = MagicMock() + + with caplog.at_level("ERROR"): + with pytest.raises(PermissionError, match="must be under"): + sandbox.download_file("/etc/passwd") + + assert "outside allowed directory" in caplog.text + sandbox._client.file.download_file.assert_not_called() + + @pytest.mark.parametrize( + "path", + [ + "/mnt/workspace/../../etc/passwd", + "../secret", + "/a/b/../../../etc/shadow", + ], + ) + def test_rejects_path_traversal(self, sandbox, path): + """download_file must reject paths containing '..' before calling the client.""" + sandbox._client.file.download_file = MagicMock() + + with pytest.raises(PermissionError, match="path traversal"): + sandbox.download_file(path) + + sandbox._client.file.download_file.assert_not_called() + + def test_single_chunk(self, sandbox): + """download_file should work correctly with a single-chunk response.""" + sandbox._client.file.download_file = MagicMock(return_value=[b"single-chunk"]) + + result = sandbox.download_file("/mnt/user-data/outputs/single.bin") + + assert result == b"single-chunk" diff --git a/backend/tests/test_local_sandbox_provider_mounts.py b/backend/tests/test_local_sandbox_provider_mounts.py index 5e7a06b6d..add5c4ea6 100644 --- a/backend/tests/test_local_sandbox_provider_mounts.py +++ b/backend/tests/test_local_sandbox_provider_mounts.py @@ -204,6 +204,26 @@ class TestSymlinkEscapes: assert exc_info.value.errno == errno.EACCES + def test_download_file_blocks_symlink_escape_from_mount(self, tmp_path): + mount_dir = tmp_path / "mount" + mount_dir.mkdir() + outside_dir = tmp_path / "outside" + outside_dir.mkdir() + (outside_dir / "secret.bin").write_bytes(b"\x00secret") + _symlink_to(outside_dir, mount_dir / "escape", target_is_directory=True) + + sandbox = LocalSandbox( + "test", + [ + PathMapping(container_path="/mnt/user-data", local_path=str(mount_dir), read_only=False), + ], + ) + + with pytest.raises(PermissionError) as exc_info: + sandbox.download_file("/mnt/user-data/escape/secret.bin") + + assert exc_info.value.errno == errno.EACCES + def test_write_file_blocks_symlink_escape_from_mount(self, tmp_path): mount_dir = tmp_path / "mount" mount_dir.mkdir() @@ -334,6 +354,74 @@ class TestSymlinkEscapes: assert existing.read_bytes() == b"original" +class TestDownloadFileMappings: + """download_file must use _resolve_path_with_mapping so path resolution, symlink + containment, and read-only awareness are consistent with read_file.""" + + def test_resolves_container_path_via_mapping(self, tmp_path): + """download_file should resolve container paths through path mappings.""" + data_dir = tmp_path / "data" + data_dir.mkdir() + (data_dir / "asset.bin").write_bytes(b"\x01\x02\x03") + + sandbox = LocalSandbox( + "test", + [PathMapping(container_path="/mnt/user-data", local_path=str(data_dir))], + ) + + result = sandbox.download_file("/mnt/user-data/asset.bin") + + assert result == b"\x01\x02\x03" + + def test_raises_oserror_with_original_path_when_missing(self, tmp_path): + """OSError filename should show the container path, not the resolved host path.""" + data_dir = tmp_path / "data" + data_dir.mkdir() + + sandbox = LocalSandbox( + "test", + [PathMapping(container_path="/mnt/user-data", local_path=str(data_dir))], + ) + + with pytest.raises(OSError) as exc_info: + sandbox.download_file("/mnt/user-data/missing.bin") + + assert exc_info.value.filename == "/mnt/user-data/missing.bin" + + def test_rejects_path_outside_virtual_prefix_and_logs_error(self, tmp_path, caplog): + """download_file must reject paths outside /mnt/user-data and log the reason.""" + data_dir = tmp_path / "data" + data_dir.mkdir() + (data_dir / "model.bin").write_bytes(b"weights") + + sandbox = LocalSandbox( + "test", + [PathMapping(container_path="/mnt/user-data", local_path=str(data_dir), read_only=True)], + ) + + with caplog.at_level("ERROR"): + with pytest.raises(PermissionError) as exc_info: + sandbox.download_file("/mnt/skills/model.bin") + + assert exc_info.value.errno == errno.EACCES + assert "outside allowed directory" in caplog.text + + def test_readable_from_read_only_mount(self, tmp_path): + """Read-only mounts must not block download_file — read-only only restricts writes.""" + skills_dir = tmp_path / "skills" + skills_dir.mkdir() + (skills_dir / "model.bin").write_bytes(b"weights") + + sandbox = LocalSandbox( + "test", + [PathMapping(container_path="/mnt/user-data", local_path=str(skills_dir), read_only=True)], + ) + + result = sandbox.download_file("/mnt/user-data/model.bin") + + assert result == b"weights" + + class TestMultipleMounts: def test_multiple_read_write_mounts(self, tmp_path): skills_dir = tmp_path / "skills"