feat(sandbox): add built-in grep and glob tools (#1784)

* feat(sandbox): add grep and glob tools

* refactor(aio-sandbox): use native file search APIs

* fix(sandbox): address review issues in grep/glob tools

- aio_sandbox: use should_ignore_path() instead of should_ignore_name()
  for include_dirs=True branch to filter nested ignored paths correctly
- aio_sandbox: add early exit when max_results reached in glob loop
- aio_sandbox: guard entry.path.startswith(path) before stripping prefix
- aio_sandbox: validate regex locally before sending to remote API
- search: skip lines exceeding max_line_chars to prevent ReDoS
- search: remove resolve() syscall in os.walk loop
- tools: avoid double get_thread_data() call in glob_tool/grep_tool
- tests: add 6 new cases covering the above code paths
- tests: patch get_app_config in truncation test to isolate config

* Fix sandbox grep/glob review feedback

* Remove unrelated Langfuse RFC from PR
This commit is contained in:
DanielWalnut
2026-04-03 16:03:06 +08:00
committed by GitHub
parent 9735d73b83
commit c6cdf200ce
10 changed files with 1388 additions and 69 deletions
@@ -1,72 +1,6 @@
import fnmatch
from pathlib import Path
IGNORE_PATTERNS = [
# Version Control
".git",
".svn",
".hg",
".bzr",
# Dependencies
"node_modules",
"__pycache__",
".venv",
"venv",
".env",
"env",
".tox",
".nox",
".eggs",
"*.egg-info",
"site-packages",
# Build outputs
"dist",
"build",
".next",
".nuxt",
".output",
".turbo",
"target",
"out",
# IDE & Editor
".idea",
".vscode",
"*.swp",
"*.swo",
"*~",
".project",
".classpath",
".settings",
# OS generated
".DS_Store",
"Thumbs.db",
"desktop.ini",
"*.lnk",
# Logs & temp files
"*.log",
"*.tmp",
"*.temp",
"*.bak",
"*.cache",
".cache",
"logs",
# Coverage & test artifacts
".coverage",
"coverage",
".nyc_output",
"htmlcov",
".pytest_cache",
".mypy_cache",
".ruff_cache",
]
def _should_ignore(name: str) -> bool:
"""Check if a file/directory name matches any ignore pattern."""
for pattern in IGNORE_PATTERNS:
if fnmatch.fnmatch(name, pattern):
return True
return False
from deerflow.sandbox.search import should_ignore_name
def list_dir(path: str, max_depth: int = 2) -> list[str]:
@@ -95,7 +29,7 @@ def list_dir(path: str, max_depth: int = 2) -> list[str]:
try:
for item in current_path.iterdir():
if _should_ignore(item.name):
if should_ignore_name(item.name):
continue
post_fix = "/" if item.is_dir() else ""
@@ -6,6 +6,7 @@ from pathlib import Path
from deerflow.sandbox.local.list_dir import list_dir
from deerflow.sandbox.sandbox import Sandbox
from deerflow.sandbox.search import GrepMatch, find_glob_matches, find_grep_matches
class LocalSandbox(Sandbox):
@@ -259,6 +260,39 @@ class LocalSandbox(Sandbox):
# Re-raise with the original path for clearer error messages, hiding internal resolved paths
raise type(e)(e.errno, e.strerror, path) from None
def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
resolved_path = Path(self._resolve_path(path))
matches, truncated = find_glob_matches(resolved_path, pattern, include_dirs=include_dirs, max_results=max_results)
return [self._reverse_resolve_path(match) for match in matches], truncated
def grep(
self,
path: str,
pattern: str,
*,
glob: str | None = None,
literal: bool = False,
case_sensitive: bool = False,
max_results: int = 100,
) -> tuple[list[GrepMatch], bool]:
resolved_path = Path(self._resolve_path(path))
matches, truncated = find_grep_matches(
resolved_path,
pattern,
glob_pattern=glob,
literal=literal,
case_sensitive=case_sensitive,
max_results=max_results,
)
return [
GrepMatch(
path=self._reverse_resolve_path(match.path),
line_number=match.line_number,
line=match.line,
)
for match in matches
], truncated
def update_file(self, path: str, content: bytes) -> None:
resolved_path = self._resolve_path(path)
try: