From 6a4a30fa2be281561b5d3da0de2ed10239962121 Mon Sep 17 00:00:00 2001 From: Xinmin Zeng <135568692+fancyboi999@users.noreply.github.com> Date: Wed, 17 Jun 2026 21:11:44 +0800 Subject: [PATCH] fix(sandbox): return actionable hint when read_file hits a binary file (#3624) read_file decodes with UTF-8. Binary uploads (.xlsx, images, ...) raise UnicodeDecodeError in the sandbox layer. UnicodeDecodeError is a ValueError subclass, not an OSError, so it bypassed the typed handlers and fell through to the generic except, surfacing a vague "Unexpected error reading file" message. The model could not tell the file was binary, so it retried read_file instead of switching to bash + pandas/openpyxl, burning LLM round-trips and bloating context with repeated failures. Add a dedicated UnicodeDecodeError handler that tells the model the file is binary and to use bash with a suitable library (or view_image for images). --- .../harness/deerflow/sandbox/tools.py | 6 ++ backend/tests/test_read_file_tool_binary.py | 65 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 backend/tests/test_read_file_tool_binary.py diff --git a/backend/packages/harness/deerflow/sandbox/tools.py b/backend/packages/harness/deerflow/sandbox/tools.py index 4c04e3ac7..c9bec1569 100644 --- a/backend/packages/harness/deerflow/sandbox/tools.py +++ b/backend/packages/harness/deerflow/sandbox/tools.py @@ -1665,6 +1665,12 @@ def read_file_tool( return f"Error: Permission denied reading file: {requested_path}" except IsADirectoryError: return f"Error: Path is a directory, not a file: {requested_path}" + except UnicodeDecodeError: + return ( + f"Error: cannot read '{requested_path}' as text — it appears to be a binary file " + "(e.g. .xlsx, .pdf, or an image). read_file only supports UTF-8 text. Use bash with a " + "suitable library instead (pandas/openpyxl for spreadsheets), or view_image for images." + ) except Exception as e: return f"Error: Unexpected error reading file: {_sanitize_error(e, runtime)}" diff --git a/backend/tests/test_read_file_tool_binary.py b/backend/tests/test_read_file_tool_binary.py new file mode 100644 index 000000000..31a6f9129 --- /dev/null +++ b/backend/tests/test_read_file_tool_binary.py @@ -0,0 +1,65 @@ +"""read_file tool behaviour on binary files. + +``read_file`` decodes with UTF-8. Binary uploads (``.xlsx``, images, ...) raise +``UnicodeDecodeError`` deep in the sandbox layer, which previously surfaced to +the model as a vague ``Unexpected error reading file`` message. The model could +not tell that the file was binary, so it retried ``read_file`` instead of +switching to ``bash`` + pandas/openpyxl — burning LLM round-trips. These tests +pin the actionable error contract and guard the normal text path. +""" + +from pathlib import Path +from types import SimpleNamespace + +from deerflow.sandbox.local.local_sandbox import LocalSandbox +from deerflow.sandbox.tools import read_file_tool + + +def _local_runtime(tmp_path: Path) -> SimpleNamespace: + for sub in ("workspace", "uploads", "outputs"): + (tmp_path / sub).mkdir(parents=True, exist_ok=True) + thread_data = { + "workspace_path": str(tmp_path / "workspace"), + "uploads_path": str(tmp_path / "uploads"), + "outputs_path": str(tmp_path / "outputs"), + } + return SimpleNamespace( + state={"sandbox": {"sandbox_id": "local:t1"}, "thread_data": thread_data}, + context={"thread_id": "t1"}, + ) + + +def test_read_file_tool_binary_file_returns_actionable_hint(tmp_path, monkeypatch) -> None: + runtime = _local_runtime(tmp_path) + # .xlsx is a zip container: header bytes PK\x03\x04 plus a non-UTF-8 byte 0x82 + # that makes strict UTF-8 decoding fail (the exact byte seen in the field logs). + (tmp_path / "uploads" / "data.xlsx").write_bytes(b"PK\x03\x04\x14\x00\x00\x00\x08\x00\x82\x6a\xb1\x55") + monkeypatch.setattr("deerflow.sandbox.tools.ensure_sandbox_initialized", lambda runtime: LocalSandbox("t1")) + monkeypatch.setattr("deerflow.sandbox.tools.ensure_thread_directories_exist", lambda runtime: None) + + result = read_file_tool.func( + runtime=runtime, + description="read uploaded excel", + path="/mnt/user-data/uploads/data.xlsx", + ) + + assert "Unexpected error" not in result, result + assert "binary" in result.lower(), result + # The model must be steered to bash + pandas/openpyxl, not another read_file. + assert "bash" in result.lower(), result + + +def test_read_file_tool_text_file_unaffected(tmp_path, monkeypatch) -> None: + runtime = _local_runtime(tmp_path) + (tmp_path / "uploads" / "notes.txt").write_text("hello 你好\nsecond line", encoding="utf-8") + monkeypatch.setattr("deerflow.sandbox.tools.ensure_sandbox_initialized", lambda runtime: LocalSandbox("t1")) + monkeypatch.setattr("deerflow.sandbox.tools.ensure_thread_directories_exist", lambda runtime: None) + + result = read_file_tool.func( + runtime=runtime, + description="read notes", + path="/mnt/user-data/uploads/notes.txt", + ) + + assert "hello 你好" in result, result + assert "binary" not in result.lower(), result