mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-18 05:25:57 +00:00
6a4a30fa2b
read_file decodes with UTF-8. Binary uploads (.xlsx, images, ...) raise UnicodeDecodeError in the sandbox layer. UnicodeDecodeError is a ValueError subclass, not an OSError, so it bypassed the typed handlers and fell through to the generic except, surfacing a vague "Unexpected error reading file" message. The model could not tell the file was binary, so it retried read_file instead of switching to bash + pandas/openpyxl, burning LLM round-trips and bloating context with repeated failures. Add a dedicated UnicodeDecodeError handler that tells the model the file is binary and to use bash with a suitable library (or view_image for images).
66 lines
2.8 KiB
Python
66 lines
2.8 KiB
Python
"""read_file tool behaviour on binary files.
|
|
|
|
``read_file`` decodes with UTF-8. Binary uploads (``.xlsx``, images, ...) raise
|
|
``UnicodeDecodeError`` deep in the sandbox layer, which previously surfaced to
|
|
the model as a vague ``Unexpected error reading file`` message. The model could
|
|
not tell that the file was binary, so it retried ``read_file`` instead of
|
|
switching to ``bash`` + pandas/openpyxl — burning LLM round-trips. These tests
|
|
pin the actionable error contract and guard the normal text path.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from types import SimpleNamespace
|
|
|
|
from deerflow.sandbox.local.local_sandbox import LocalSandbox
|
|
from deerflow.sandbox.tools import read_file_tool
|
|
|
|
|
|
def _local_runtime(tmp_path: Path) -> SimpleNamespace:
|
|
for sub in ("workspace", "uploads", "outputs"):
|
|
(tmp_path / sub).mkdir(parents=True, exist_ok=True)
|
|
thread_data = {
|
|
"workspace_path": str(tmp_path / "workspace"),
|
|
"uploads_path": str(tmp_path / "uploads"),
|
|
"outputs_path": str(tmp_path / "outputs"),
|
|
}
|
|
return SimpleNamespace(
|
|
state={"sandbox": {"sandbox_id": "local:t1"}, "thread_data": thread_data},
|
|
context={"thread_id": "t1"},
|
|
)
|
|
|
|
|
|
def test_read_file_tool_binary_file_returns_actionable_hint(tmp_path, monkeypatch) -> None:
    """Reading a non-UTF-8 upload must return a hint that names the problem.

    Writes a short byte string that is not valid UTF-8 to
    ``uploads/data.xlsx``, calls ``read_file_tool`` on its virtual path and
    checks that the returned text mentions ``binary`` and ``bash`` and does
    not contain ``Unexpected error``.
    """
    runtime = _local_runtime(tmp_path)
    # .xlsx is a zip container: header bytes PK\x03\x04 plus a non-UTF-8 byte 0x82
    # that makes strict UTF-8 decoding fail (the exact byte seen in the field logs).
    (tmp_path / "uploads" / "data.xlsx").write_bytes(b"PK\x03\x04\x14\x00\x00\x00\x08\x00\x82\x6a\xb1\x55")
    # Swap the sandbox bootstrap helpers for stubs: a LocalSandbox is handed
    # back directly and the directory-creation hook becomes a no-op.
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_sandbox_initialized", lambda runtime: LocalSandbox("t1"))
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_thread_directories_exist", lambda runtime: None)

    result = read_file_tool.func(
        runtime=runtime,
        description="read uploaded excel",
        path="/mnt/user-data/uploads/data.xlsx",
    )

    assert "Unexpected error" not in result, result
    assert "binary" in result.lower(), result
    # The model must be steered to bash + pandas/openpyxl, not another read_file.
    assert "bash" in result.lower(), result
|
|
|
|
|
|
def test_read_file_tool_text_file_unaffected(tmp_path, monkeypatch) -> None:
    """Reading a valid UTF-8 text upload must still return its content.

    Writes a two-line UTF-8 file containing non-ASCII characters to
    ``uploads/notes.txt``, calls ``read_file_tool`` on its virtual path and
    checks that the content comes back and that the word ``binary`` does not
    appear in the result.
    """
    runtime = _local_runtime(tmp_path)
    (tmp_path / "uploads" / "notes.txt").write_text("hello 你好\nsecond line", encoding="utf-8")
    # Swap the sandbox bootstrap helpers for stubs: a LocalSandbox is handed
    # back directly and the directory-creation hook becomes a no-op.
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_sandbox_initialized", lambda runtime: LocalSandbox("t1"))
    monkeypatch.setattr("deerflow.sandbox.tools.ensure_thread_directories_exist", lambda runtime: None)

    result = read_file_tool.func(
        runtime=runtime,
        description="read notes",
        path="/mnt/user-data/uploads/notes.txt",
    )

    assert "hello 你好" in result, result
    # Guards against the binary-file hint appearing on the normal text path.
    assert "binary" not in result.lower(), result
|