feat(sandbox): add built-in grep and glob tools (#1784)

* feat(sandbox): add grep and glob tools
* refactor(aio-sandbox): use native file search APIs
* fix(sandbox): address review issues in grep/glob tools
  - aio_sandbox: use should_ignore_path() instead of should_ignore_name() for include_dirs=True branch to filter nested ignored paths correctly
  - aio_sandbox: add early exit when max_results reached in glob loop
  - aio_sandbox: guard entry.path.startswith(path) before stripping prefix
  - aio_sandbox: validate regex locally before sending to remote API
  - search: skip lines exceeding max_line_chars to prevent ReDoS
  - search: remove resolve() syscall in os.walk loop
  - tools: avoid double get_thread_data() call in glob_tool/grep_tool
  - tests: add 6 new cases covering the above code paths
  - tests: patch get_app_config in truncation test to isolate config
* Fix sandbox grep/glob review feedback
* Remove unrelated Langfuse RFC from PR
2026-05-24 17:06:00 +00:00 · 2026-04-03 16:03:06 +08:00
parent 9735d73b83
commit c6cdf200ce
10 changed files with 1388 additions and 69 deletions
@@ -1,72 +1,6 @@
-import fnmatch
 from pathlib import Path

-IGNORE_PATTERNS = [
-    # Version Control
-    ".git",
-    ".svn",
-    ".hg",
-    ".bzr",
-    # Dependencies
-    "node_modules",
-    "__pycache__",
-    ".venv",
-    "venv",
-    ".env",
-    "env",
-    ".tox",
-    ".nox",
-    ".eggs",
-    "*.egg-info",
-    "site-packages",
-    # Build outputs
-    "dist",
-    "build",
-    ".next",
-    ".nuxt",
-    ".output",
-    ".turbo",
-    "target",
-    "out",
-    # IDE & Editor
-    ".idea",
-    ".vscode",
-    "*.swp",
-    "*.swo",
-    "*~",
-    ".project",
-    ".classpath",
-    ".settings",
-    # OS generated
-    ".DS_Store",
-    "Thumbs.db",
-    "desktop.ini",
-    "*.lnk",
-    # Logs & temp files
-    "*.log",
-    "*.tmp",
-    "*.temp",
-    "*.bak",
-    "*.cache",
-    ".cache",
-    "logs",
-    # Coverage & test artifacts
-    ".coverage",
-    "coverage",
-    ".nyc_output",
-    "htmlcov",
-    ".pytest_cache",
-    ".mypy_cache",
-    ".ruff_cache",
-]
-
-
-def _should_ignore(name: str) -> bool:
-    """Check if a file/directory name matches any ignore pattern."""
-    for pattern in IGNORE_PATTERNS:
-        if fnmatch.fnmatch(name, pattern):
-            return True
-    return False
+from deerflow.sandbox.search import should_ignore_name


 def list_dir(path: str, max_depth: int = 2) -> list[str]:
@@ -95,7 +29,7 @@ def list_dir(path: str, max_depth: int = 2) -> list[str]:

        try:
            for item in current_path.iterdir():
-                if _should_ignore(item.name):
+                if should_ignore_name(item.name):
                    continue

                post_fix = "/" if item.is_dir() else ""
@@ -6,6 +6,7 @@ from pathlib import Path

 from deerflow.sandbox.local.list_dir import list_dir
 from deerflow.sandbox.sandbox import Sandbox
+from deerflow.sandbox.search import GrepMatch, find_glob_matches, find_grep_matches


 class LocalSandbox(Sandbox):
@@ -259,6 +260,39 @@ class LocalSandbox(Sandbox):
            # Re-raise with the original path for clearer error messages, hiding internal resolved paths
            raise type(e)(e.errno, e.strerror, path) from None

+    def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
+        resolved_path = Path(self._resolve_path(path))
+        matches, truncated = find_glob_matches(resolved_path, pattern, include_dirs=include_dirs, max_results=max_results)
+        return [self._reverse_resolve_path(match) for match in matches], truncated
+
+    def grep(
+        self,
+        path: str,
+        pattern: str,
+        *,
+        glob: str | None = None,
+        literal: bool = False,
+        case_sensitive: bool = False,
+        max_results: int = 100,
+    ) -> tuple[list[GrepMatch], bool]:
+        resolved_path = Path(self._resolve_path(path))
+        matches, truncated = find_grep_matches(
+            resolved_path,
+            pattern,
+            glob_pattern=glob,
+            literal=literal,
+            case_sensitive=case_sensitive,
+            max_results=max_results,
+        )
+        return [
+            GrepMatch(
+                path=self._reverse_resolve_path(match.path),
+                line_number=match.line_number,
+                line=match.line,
+            )
+            for match in matches
+        ], truncated
+
    def update_file(self, path: str, content: bytes) -> None:
        resolved_path = self._resolve_path(path)
        try: