mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-18 13:46:02 +00:00
fix(sandbox): return actionable hint when read_file hits a binary file (#3624)
read_file decodes with UTF-8. Binary uploads (.xlsx, images, ...) raise UnicodeDecodeError in the sandbox layer. UnicodeDecodeError is a ValueError subclass, not an OSError, so it bypassed the typed handlers and fell through to the generic except, surfacing a vague "Unexpected error reading file" message. The model could not tell the file was binary, so it retried read_file instead of switching to bash + pandas/openpyxl, burning LLM round-trips and bloating context with repeated failures. Add a dedicated UnicodeDecodeError handler that tells the model the file is binary and to use bash with a suitable library (or view_image for images).
This commit is contained in:
@@ -1665,6 +1665,12 @@ def read_file_tool(
|
|||||||
return f"Error: Permission denied reading file: {requested_path}"
|
return f"Error: Permission denied reading file: {requested_path}"
|
||||||
except IsADirectoryError:
|
except IsADirectoryError:
|
||||||
return f"Error: Path is a directory, not a file: {requested_path}"
|
return f"Error: Path is a directory, not a file: {requested_path}"
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
return (
|
||||||
|
f"Error: cannot read '{requested_path}' as text — it appears to be a binary file "
|
||||||
|
"(e.g. .xlsx, .pdf, or an image). read_file only supports UTF-8 text. Use bash with a "
|
||||||
|
"suitable library instead (pandas/openpyxl for spreadsheets), or view_image for images."
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: Unexpected error reading file: {_sanitize_error(e, runtime)}"
|
return f"Error: Unexpected error reading file: {_sanitize_error(e, runtime)}"
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,65 @@
|
|||||||
|
"""read_file tool behaviour on binary files.
|
||||||
|
|
||||||
|
``read_file`` decodes with UTF-8. Binary uploads (``.xlsx``, images, ...) raise
|
||||||
|
``UnicodeDecodeError`` deep in the sandbox layer, which previously surfaced to
|
||||||
|
the model as a vague ``Unexpected error reading file`` message. The model could
|
||||||
|
not tell that the file was binary, so it retried ``read_file`` instead of
|
||||||
|
switching to ``bash`` + pandas/openpyxl — burning LLM round-trips. These tests
|
||||||
|
pin the actionable error contract and guard the normal text path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
from deerflow.sandbox.local.local_sandbox import LocalSandbox
|
||||||
|
from deerflow.sandbox.tools import read_file_tool
|
||||||
|
|
||||||
|
|
||||||
|
def _local_runtime(tmp_path: Path) -> SimpleNamespace:
    """Build a minimal stand-in runtime object rooted at ``tmp_path``.

    Creates the ``workspace``, ``uploads`` and ``outputs`` sub-directories
    under ``tmp_path`` (existing directories are tolerated) and returns a
    namespace exposing:

    * ``state`` -- a dict holding the sandbox id (``"local:t1"``) and a
      ``thread_data`` mapping of the three directory paths as strings;
    * ``context`` -- a dict holding the thread id (``"t1"``).
    """
    for sub in ("workspace", "uploads", "outputs"):
        (tmp_path / sub).mkdir(parents=True, exist_ok=True)
    thread_data = {
        "workspace_path": str(tmp_path / "workspace"),
        "uploads_path": str(tmp_path / "uploads"),
        "outputs_path": str(tmp_path / "outputs"),
    }
    return SimpleNamespace(
        state={"sandbox": {"sandbox_id": "local:t1"}, "thread_data": thread_data},
        context={"thread_id": "t1"},
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_file_tool_binary_file_returns_actionable_hint(tmp_path, monkeypatch) -> None:
    """A non-UTF-8 upload must yield a 'binary file, use bash' style message.

    Writes a small byte string with an ``.xlsx`` name into the uploads
    directory, patches the sandbox bootstrap helpers so the tool runs against
    a ``LocalSandbox``, and checks that the returned text mentions "binary"
    and "bash" and does not contain "Unexpected error".
    """
    runtime = _local_runtime(tmp_path)
    # .xlsx is a zip container: header bytes PK\x03\x04 plus a non-UTF-8 byte 0x82
    # that makes strict UTF-8 decoding fail (the exact byte seen in the field logs).
    (tmp_path / "uploads" / "data.xlsx").write_bytes(b"PK\x03\x04\x14\x00\x00\x00\x08\x00\x82\x6a\xb1\x55")
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_sandbox_initialized", lambda runtime: LocalSandbox("t1"))
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_thread_directories_exist", lambda runtime: None)

    result = read_file_tool.func(
        runtime=runtime,
        description="read uploaded excel",
        path="/mnt/user-data/uploads/data.xlsx",
    )

    assert "Unexpected error" not in result, result
    assert "binary" in result.lower(), result
    # The model must be steered to bash + pandas/openpyxl, not another read_file.
    assert "bash" in result.lower(), result
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_file_tool_text_file_unaffected(tmp_path, monkeypatch) -> None:
    """A valid UTF-8 text upload must still be returned as text.

    Writes a two-line UTF-8 file (including non-ASCII characters) into the
    uploads directory, patches the sandbox bootstrap helpers so the tool runs
    against a ``LocalSandbox``, and checks that the content comes back and
    that the result does not mention "binary".
    """
    runtime = _local_runtime(tmp_path)
    (tmp_path / "uploads" / "notes.txt").write_text("hello 你好\nsecond line", encoding="utf-8")
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_sandbox_initialized", lambda runtime: LocalSandbox("t1"))
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_thread_directories_exist", lambda runtime: None)

    result = read_file_tool.func(
        runtime=runtime,
        description="read notes",
        path="/mnt/user-data/uploads/notes.txt",
    )

    assert "hello 你好" in result, result
    assert "binary" not in result.lower(), result
|
||||||
Reference in New Issue
Block a user