mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-24 08:55:59 +00:00
feat(sandbox): add built-in grep and glob tools (#1784)
* feat(sandbox): add grep and glob tools
* refactor(aio-sandbox): use native file search APIs
* fix(sandbox): address review issues in grep/glob tools
  - aio_sandbox: use should_ignore_path() instead of should_ignore_name() for include_dirs=True branch to filter nested ignored paths correctly
  - aio_sandbox: add early exit when max_results reached in glob loop
  - aio_sandbox: guard entry.path.startswith(path) before stripping prefix
  - aio_sandbox: validate regex locally before sending to remote API
  - search: skip lines exceeding max_line_chars to prevent ReDoS
  - search: remove resolve() syscall in os.walk loop
  - tools: avoid double get_thread_data() call in glob_tool/grep_tool
  - tests: add 6 new cases covering the above code paths
  - tests: patch get_app_config in truncation test to isolate config
* Fix sandbox grep/glob review feedback
* Remove unrelated Langfuse RFC from PR
This commit is contained in:
@@ -7,6 +7,7 @@ from langchain.tools import ToolRuntime, tool
|
||||
from langgraph.typing import ContextT
|
||||
|
||||
from deerflow.agents.thread_state import ThreadDataState, ThreadState
|
||||
from deerflow.config import get_app_config
|
||||
from deerflow.config.paths import VIRTUAL_PATH_PREFIX
|
||||
from deerflow.sandbox.exceptions import (
|
||||
SandboxError,
|
||||
@@ -16,6 +17,7 @@ from deerflow.sandbox.exceptions import (
|
||||
from deerflow.sandbox.file_operation_lock import get_file_operation_lock
|
||||
from deerflow.sandbox.sandbox import Sandbox
|
||||
from deerflow.sandbox.sandbox_provider import get_sandbox_provider
|
||||
from deerflow.sandbox.search import GrepMatch
|
||||
from deerflow.sandbox.security import LOCAL_HOST_BASH_DISABLED_MESSAGE, is_host_bash_allowed
|
||||
|
||||
_ABSOLUTE_PATH_PATTERN = re.compile(r"(?<![:\w])(?<!:/)/(?:[^\s\"'`;&|<>()]+)")
|
||||
@@ -31,6 +33,10 @@ _LOCAL_BASH_SYSTEM_PATH_PREFIXES = (
|
||||
|
||||
_DEFAULT_SKILLS_CONTAINER_PATH = "/mnt/skills"
|
||||
_ACP_WORKSPACE_VIRTUAL_PATH = "/mnt/acp-workspace"
|
||||
# Result limit used by the glob tool when the caller does not supply one
# (it is the default of glob_tool's max_results parameter and the fallback
# passed to _resolve_max_results).
_DEFAULT_GLOB_MAX_RESULTS = 200
|
||||
# Upper bound passed to _resolve_max_results for glob; both the requested and
# the configured limit are capped at this value.
_MAX_GLOB_MAX_RESULTS = 1000
|
||||
# Result limit used by the grep tool when the caller does not supply one.
_DEFAULT_GREP_MAX_RESULTS = 100
|
||||
# Upper bound passed to _resolve_max_results for grep.
_MAX_GREP_MAX_RESULTS = 500
|
||||
|
||||
|
||||
def _get_skills_container_path() -> str:
|
||||
@@ -245,6 +251,69 @@ def _get_mcp_allowed_paths() -> list[str]:
|
||||
return allowed_paths
|
||||
|
||||
|
||||
def _get_tool_config_int(name: str, key: str, default: int) -> int:
    """Read an integer option from a tool's extra configuration fields.

    The tool entry called ``name`` is fetched through ``get_app_config()`` and
    ``key`` is looked up in its ``model_extra`` mapping. The lookup is
    best-effort: a missing tool entry, a missing key, a value that is not a
    real integer, or any error raised while reading the configuration all
    result in ``default`` being returned.

    Args:
        name: Tool name passed to ``get_tool_config``.
        key: Option name to look up among the tool's extra fields.
        default: Value returned when no usable integer is configured.

    Returns:
        The configured integer, or ``default``.
    """
    try:
        tool_config = get_app_config().get_tool_config(name)
        # NOTE(review): model_extra looks like the pydantic attribute, which is
        # None (not a dict) when the model does not collect extra fields, so
        # guard against it explicitly instead of relying on the except below.
        extra = tool_config.model_extra if tool_config is not None else None
        if extra and key in extra:
            value = extra.get(key)
            # bool is a subclass of int; a setting such as ``max_results: true``
            # must not be interpreted as a limit of 1.
            if isinstance(value, int) and not isinstance(value, bool):
                return value
    except Exception:
        # Deliberately broad: a broken or unavailable configuration must never
        # make the calling tool fail; fall through to the default instead.
        pass
    return default
|
||||
|
||||
|
||||
def _clamp_max_results(value: int, *, default: int, upper_bound: int) -> int:
    """Normalise a result limit.

    A non-positive ``value`` is replaced by ``default`` (returned as-is, it is
    not capped); a positive ``value`` is capped at ``upper_bound``.
    """
    return default if value <= 0 else min(value, upper_bound)
|
||||
|
||||
|
||||
def _resolve_max_results(name: str, requested: int, *, default: int, upper_bound: int) -> int:
    """Combine the caller's limit with the configured limit for tool ``name``.

    Both the ``requested`` value and the tool's configured ``max_results``
    option (falling back to ``default`` when not configured) are normalised
    with ``_clamp_max_results``; the smaller of the two is returned.
    """
    caller_limit = _clamp_max_results(requested, default=default, upper_bound=upper_bound)
    configured_value = _get_tool_config_int(name, "max_results", default)
    config_limit = _clamp_max_results(configured_value, default=default, upper_bound=upper_bound)
    # The stricter of the two limits wins.
    return caller_limit if caller_limit <= config_limit else config_limit
|
||||
|
||||
|
||||
def _resolve_local_read_path(path: str, thread_data: ThreadDataState) -> str:
    """Validate ``path`` for read-only access and resolve it for the local sandbox.

    The path is first checked with ``validate_local_tool_path`` in read-only
    mode. It is then resolved by the first matching resolver: the skills
    resolver, the ACP workspace resolver (which also receives the thread id
    extracted from ``thread_data``), or the user-data resolver otherwise.
    """
    # Validation runs before any resolution (presumably it raises for
    # disallowed paths; confirm against validate_local_tool_path).
    validate_local_tool_path(path, thread_data, read_only=True)

    if _is_skills_path(path):
        resolved = _resolve_skills_path(path)
    elif _is_acp_workspace_path(path):
        thread_id = _extract_thread_id_from_thread_data(thread_data)
        resolved = _resolve_acp_workspace_path(path, thread_id)
    else:
        resolved = _resolve_and_validate_user_data_path(path, thread_data)
    return resolved
|
||||
|
||||
|
||||
def _format_glob_results(root_path: str, matches: list[str], truncated: bool) -> str:
    """Render glob matches as a numbered, newline-separated report.

    An empty ``matches`` yields a single "no files matched" sentence.
    Otherwise the report is a header with the match count, one numbered line
    per match (starting at 1) and, when ``truncated`` is set, a note in the
    header plus a closing hint to narrow the search.
    """
    if not matches:
        return f"No files matched under {root_path}"

    total = len(matches)
    header = f"Found {total} paths under {root_path}"
    if truncated:
        header = header + f" (showing first {total})"

    report = [header]
    for position, entry in enumerate(matches, start=1):
        report.append(f"{position}. {entry}")
    if truncated:
        report.append("Results truncated. Narrow the path or pattern to see fewer matches.")
    return "\n".join(report)
|
||||
|
||||
|
||||
def _format_grep_results(root_path: str, matches: list[GrepMatch], truncated: bool) -> str:
    """Render grep matches as a newline-separated report.

    An empty ``matches`` yields a single "no matches found" sentence.
    Otherwise the report is a header with the match count, one
    ``path:line_number: line`` entry per match and, when ``truncated`` is
    set, a note in the header plus a closing hint to narrow the search.
    """
    if not matches:
        return f"No matches found under {root_path}"

    total = len(matches)
    header = f"Found {total} matches under {root_path}"
    if truncated:
        header = header + f" (showing first {total})"

    report = [header]
    for hit in matches:
        report.append(f"{hit.path}:{hit.line_number}: {hit.line}")
    if truncated:
        report.append("Results truncated. Narrow the path or add a glob filter.")
    return "\n".join(report)
|
||||
|
||||
|
||||
def _path_variants(path: str) -> set[str]:
    """Return ``path`` together with its all-forward-slash and all-backslash spellings."""
    forward_slashed = path.replace("\\", "/")
    back_slashed = path.replace("/", "\\")
    return {path, forward_slashed, back_slashed}
|
||||
|
||||
@@ -901,6 +970,126 @@ def ls_tool(runtime: ToolRuntime[ContextT, ThreadState], description: str, path:
|
||||
return f"Error: Unexpected error listing directory: {_sanitize_error(e, runtime)}"
|
||||
|
||||
|
||||
@tool("glob", parse_docstring=True)
def glob_tool(
    runtime: ToolRuntime[ContextT, ThreadState],
    description: str,
    pattern: str,
    path: str,
    include_dirs: bool = False,
    max_results: int = _DEFAULT_GLOB_MAX_RESULTS,
) -> str:
    """Find files or directories that match a glob pattern under a root directory.

    Args:
        description: Explain why you are searching for these paths in short words. ALWAYS PROVIDE THIS PARAMETER FIRST.
        pattern: The glob pattern to match relative to the root path, for example `**/*.py`.
        path: The **absolute** root directory to search under.
        include_dirs: Whether matching directories should also be returned. Default is False.
        max_results: Maximum number of paths to return. Default is 200.
    """
    # The docstring above is parsed by the tool decorator (parse_docstring=True),
    # so implementation notes live in comments instead.
    #
    # Returns the report built by _format_glob_results, or an "Error: ..."
    # string; every Exception subclass is converted into such a string.

    # Remember the caller-supplied path before anything can fail: the error
    # handlers below report it, and `path` itself may be rewritten to a
    # resolved local path further down.
    requested_path = path
    try:
        sandbox = ensure_sandbox_initialized(runtime)
        ensure_thread_directories_exist(runtime)
        effective_max_results = _resolve_max_results(
            "glob",
            max_results,
            default=_DEFAULT_GLOB_MAX_RESULTS,
            upper_bound=_MAX_GLOB_MAX_RESULTS,
        )
        # thread_data stays None for non-local sandboxes; it doubles as the
        # flag that decides whether result paths need masking afterwards.
        thread_data = None
        if is_local_sandbox(runtime):
            thread_data = get_thread_data(runtime)
            if thread_data is None:
                raise SandboxRuntimeError("Thread data not available for local sandbox")
            path = _resolve_local_read_path(path, thread_data)
        matches, truncated = sandbox.glob(path, pattern, include_dirs=include_dirs, max_results=effective_max_results)
        if thread_data is not None:
            matches = [mask_local_paths_in_output(match, thread_data) for match in matches]
        return _format_glob_results(requested_path, matches, truncated)
    except SandboxError as e:
        return f"Error: {e}"
    except FileNotFoundError:
        return f"Error: Directory not found: {requested_path}"
    except NotADirectoryError:
        return f"Error: Path is not a directory: {requested_path}"
    except PermissionError:
        return f"Error: Permission denied: {requested_path}"
    except Exception as e:
        return f"Error: Unexpected error searching paths: {_sanitize_error(e, runtime)}"
|
||||
|
||||
|
||||
@tool("grep", parse_docstring=True)
def grep_tool(
    runtime: ToolRuntime[ContextT, ThreadState],
    description: str,
    pattern: str,
    path: str,
    glob: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = _DEFAULT_GREP_MAX_RESULTS,
) -> str:
    """Search for matching lines inside text files under a root directory.

    Args:
        description: Explain why you are searching file contents in short words. ALWAYS PROVIDE THIS PARAMETER FIRST.
        pattern: The string or regex pattern to search for.
        path: The **absolute** root directory to search under.
        glob: Optional glob filter for candidate files, for example `**/*.py`.
        literal: Whether to treat `pattern` as a plain string. Default is False.
        case_sensitive: Whether matching is case-sensitive. Default is False.
        max_results: Maximum number of matching lines to return. Default is 100.
    """
    # The docstring above is parsed by the tool decorator (parse_docstring=True),
    # so implementation notes live in comments instead.
    #
    # Returns the report built by _format_grep_results, or an "Error: ..."
    # string; every Exception subclass is converted into such a string.

    # Remember the caller-supplied path before anything can fail: the error
    # handlers below report it, and `path` itself may be rewritten to a
    # resolved local path further down.
    requested_path = path
    try:
        sandbox = ensure_sandbox_initialized(runtime)
        ensure_thread_directories_exist(runtime)
        effective_max_results = _resolve_max_results(
            "grep",
            max_results,
            default=_DEFAULT_GREP_MAX_RESULTS,
            upper_bound=_MAX_GREP_MAX_RESULTS,
        )
        # thread_data stays None for non-local sandboxes; it doubles as the
        # flag that decides whether result paths need masking afterwards.
        thread_data = None
        if is_local_sandbox(runtime):
            thread_data = get_thread_data(runtime)
            if thread_data is None:
                raise SandboxRuntimeError("Thread data not available for local sandbox")
            path = _resolve_local_read_path(path, thread_data)
        matches, truncated = sandbox.grep(
            path,
            pattern,
            glob=glob,
            literal=literal,
            case_sensitive=case_sensitive,
            max_results=effective_max_results,
        )
        if thread_data is not None:
            # Only the path of each match is masked; line number and line text
            # are carried over unchanged.
            matches = [
                GrepMatch(
                    path=mask_local_paths_in_output(match.path, thread_data),
                    line_number=match.line_number,
                    line=match.line,
                )
                for match in matches
            ]
        return _format_grep_results(requested_path, matches, truncated)
    except SandboxError as e:
        return f"Error: {e}"
    except FileNotFoundError:
        return f"Error: Directory not found: {requested_path}"
    except NotADirectoryError:
        return f"Error: Path is not a directory: {requested_path}"
    except re.error as e:
        return f"Error: Invalid regex pattern: {e}"
    except PermissionError:
        return f"Error: Permission denied: {requested_path}"
    except Exception as e:
        return f"Error: Unexpected error searching file contents: {_sanitize_error(e, runtime)}"
|
||||
|
||||
|
||||
@tool("read_file", parse_docstring=True)
|
||||
def read_file_tool(
|
||||
runtime: ToolRuntime[ContextT, ThreadState],
|
||||
|
||||
Reference in New Issue
Block a user