feat(sandbox): add built-in grep and glob tools (#1784)

* feat(sandbox): add grep and glob tools

* refactor(aio-sandbox): use native file search APIs

* fix(sandbox): address review issues in grep/glob tools

- aio_sandbox: use should_ignore_path() instead of should_ignore_name()
  for include_dirs=True branch to filter nested ignored paths correctly
- aio_sandbox: add early exit when max_results reached in glob loop
- aio_sandbox: guard entry.path.startswith(path) before stripping prefix
- aio_sandbox: validate regex locally before sending to remote API
- search: skip lines exceeding max_line_chars to prevent ReDoS
- search: remove resolve() syscall in os.walk loop
- tools: avoid double get_thread_data() call in glob_tool/grep_tool
- tests: add 6 new cases covering the above code paths
- tests: patch get_app_config in truncation test to isolate config

* Fix sandbox grep/glob review feedback

* Remove unrelated Langfuse RFC from PR
This commit is contained in:
DanielWalnut
2026-04-03 16:03:06 +08:00
committed by GitHub
parent 9735d73b83
commit c6cdf200ce
10 changed files with 1388 additions and 69 deletions
@@ -7,6 +7,7 @@ from langchain.tools import ToolRuntime, tool
from langgraph.typing import ContextT
from deerflow.agents.thread_state import ThreadDataState, ThreadState
from deerflow.config import get_app_config
from deerflow.config.paths import VIRTUAL_PATH_PREFIX
from deerflow.sandbox.exceptions import (
SandboxError,
@@ -16,6 +17,7 @@ from deerflow.sandbox.exceptions import (
from deerflow.sandbox.file_operation_lock import get_file_operation_lock
from deerflow.sandbox.sandbox import Sandbox
from deerflow.sandbox.sandbox_provider import get_sandbox_provider
from deerflow.sandbox.search import GrepMatch
from deerflow.sandbox.security import LOCAL_HOST_BASH_DISABLED_MESSAGE, is_host_bash_allowed
_ABSOLUTE_PATH_PATTERN = re.compile(r"(?<![:\w])(?<!:/)/(?:[^\s\"'`;&|<>()]+)")
@@ -31,6 +33,10 @@ _LOCAL_BASH_SYSTEM_PATH_PREFIXES = (
_DEFAULT_SKILLS_CONTAINER_PATH = "/mnt/skills"
_ACP_WORKSPACE_VIRTUAL_PATH = "/mnt/acp-workspace"
_DEFAULT_GLOB_MAX_RESULTS = 200
_MAX_GLOB_MAX_RESULTS = 1000
_DEFAULT_GREP_MAX_RESULTS = 100
_MAX_GREP_MAX_RESULTS = 500
def _get_skills_container_path() -> str:
@@ -245,6 +251,69 @@ def _get_mcp_allowed_paths() -> list[str]:
return allowed_paths
def _get_tool_config_int(name: str, key: str, default: int) -> int:
try:
tool_config = get_app_config().get_tool_config(name)
if tool_config is not None and key in tool_config.model_extra:
value = tool_config.model_extra.get(key)
if isinstance(value, int):
return value
except Exception:
pass
return default
def _clamp_max_results(value: int, *, default: int, upper_bound: int) -> int:
if value <= 0:
return default
return min(value, upper_bound)
def _resolve_max_results(name: str, requested: int, *, default: int, upper_bound: int) -> int:
requested_max_results = _clamp_max_results(requested, default=default, upper_bound=upper_bound)
configured_max_results = _clamp_max_results(
_get_tool_config_int(name, "max_results", default),
default=default,
upper_bound=upper_bound,
)
return min(requested_max_results, configured_max_results)
def _resolve_local_read_path(path: str, thread_data: ThreadDataState) -> str:
validate_local_tool_path(path, thread_data, read_only=True)
if _is_skills_path(path):
return _resolve_skills_path(path)
if _is_acp_workspace_path(path):
return _resolve_acp_workspace_path(path, _extract_thread_id_from_thread_data(thread_data))
return _resolve_and_validate_user_data_path(path, thread_data)
def _format_glob_results(root_path: str, matches: list[str], truncated: bool) -> str:
if not matches:
return f"No files matched under {root_path}"
lines = [f"Found {len(matches)} paths under {root_path}"]
if truncated:
lines[0] += f" (showing first {len(matches)})"
lines.extend(f"{index}. {path}" for index, path in enumerate(matches, start=1))
if truncated:
lines.append("Results truncated. Narrow the path or pattern to see fewer matches.")
return "\n".join(lines)
def _format_grep_results(root_path: str, matches: list[GrepMatch], truncated: bool) -> str:
if not matches:
return f"No matches found under {root_path}"
lines = [f"Found {len(matches)} matches under {root_path}"]
if truncated:
lines[0] += f" (showing first {len(matches)})"
lines.extend(f"{match.path}:{match.line_number}: {match.line}" for match in matches)
if truncated:
lines.append("Results truncated. Narrow the path or add a glob filter.")
return "\n".join(lines)
def _path_variants(path: str) -> set[str]:
return {path, path.replace("\\", "/"), path.replace("/", "\\")}
@@ -901,6 +970,126 @@ def ls_tool(runtime: ToolRuntime[ContextT, ThreadState], description: str, path:
return f"Error: Unexpected error listing directory: {_sanitize_error(e, runtime)}"
@tool("glob", parse_docstring=True)
def glob_tool(
runtime: ToolRuntime[ContextT, ThreadState],
description: str,
pattern: str,
path: str,
include_dirs: bool = False,
max_results: int = _DEFAULT_GLOB_MAX_RESULTS,
) -> str:
"""Find files or directories that match a glob pattern under a root directory.
Args:
description: Explain why you are searching for these paths in short words. ALWAYS PROVIDE THIS PARAMETER FIRST.
pattern: The glob pattern to match relative to the root path, for example `**/*.py`.
path: The **absolute** root directory to search under.
include_dirs: Whether matching directories should also be returned. Default is False.
max_results: Maximum number of paths to return. Default is 200.
"""
try:
sandbox = ensure_sandbox_initialized(runtime)
ensure_thread_directories_exist(runtime)
requested_path = path
effective_max_results = _resolve_max_results(
"glob",
max_results,
default=_DEFAULT_GLOB_MAX_RESULTS,
upper_bound=_MAX_GLOB_MAX_RESULTS,
)
thread_data = None
if is_local_sandbox(runtime):
thread_data = get_thread_data(runtime)
if thread_data is None:
raise SandboxRuntimeError("Thread data not available for local sandbox")
path = _resolve_local_read_path(path, thread_data)
matches, truncated = sandbox.glob(path, pattern, include_dirs=include_dirs, max_results=effective_max_results)
if thread_data is not None:
matches = [mask_local_paths_in_output(match, thread_data) for match in matches]
return _format_glob_results(requested_path, matches, truncated)
except SandboxError as e:
return f"Error: {e}"
except FileNotFoundError:
return f"Error: Directory not found: {requested_path}"
except NotADirectoryError:
return f"Error: Path is not a directory: {requested_path}"
except PermissionError:
return f"Error: Permission denied: {requested_path}"
except Exception as e:
return f"Error: Unexpected error searching paths: {_sanitize_error(e, runtime)}"
@tool("grep", parse_docstring=True)
def grep_tool(
runtime: ToolRuntime[ContextT, ThreadState],
description: str,
pattern: str,
path: str,
glob: str | None = None,
literal: bool = False,
case_sensitive: bool = False,
max_results: int = _DEFAULT_GREP_MAX_RESULTS,
) -> str:
"""Search for matching lines inside text files under a root directory.
Args:
description: Explain why you are searching file contents in short words. ALWAYS PROVIDE THIS PARAMETER FIRST.
pattern: The string or regex pattern to search for.
path: The **absolute** root directory to search under.
glob: Optional glob filter for candidate files, for example `**/*.py`.
literal: Whether to treat `pattern` as a plain string. Default is False.
case_sensitive: Whether matching is case-sensitive. Default is False.
max_results: Maximum number of matching lines to return. Default is 100.
"""
try:
sandbox = ensure_sandbox_initialized(runtime)
ensure_thread_directories_exist(runtime)
requested_path = path
effective_max_results = _resolve_max_results(
"grep",
max_results,
default=_DEFAULT_GREP_MAX_RESULTS,
upper_bound=_MAX_GREP_MAX_RESULTS,
)
thread_data = None
if is_local_sandbox(runtime):
thread_data = get_thread_data(runtime)
if thread_data is None:
raise SandboxRuntimeError("Thread data not available for local sandbox")
path = _resolve_local_read_path(path, thread_data)
matches, truncated = sandbox.grep(
path,
pattern,
glob=glob,
literal=literal,
case_sensitive=case_sensitive,
max_results=effective_max_results,
)
if thread_data is not None:
matches = [
GrepMatch(
path=mask_local_paths_in_output(match.path, thread_data),
line_number=match.line_number,
line=match.line,
)
for match in matches
]
return _format_grep_results(requested_path, matches, truncated)
except SandboxError as e:
return f"Error: {e}"
except FileNotFoundError:
return f"Error: Directory not found: {requested_path}"
except NotADirectoryError:
return f"Error: Path is not a directory: {requested_path}"
except re.error as e:
return f"Error: Invalid regex pattern: {e}"
except PermissionError:
return f"Error: Permission denied: {requested_path}"
except Exception as e:
return f"Error: Unexpected error searching file contents: {_sanitize_error(e, runtime)}"
@tool("read_file", parse_docstring=True)
def read_file_tool(
runtime: ToolRuntime[ContextT, ThreadState],