Files
deer-flow/backend/packages/harness/deerflow/tools/builtins/tool_search.py
T
AochenShen99 2bbc7879fa refactor(tool-search): consolidate MCP metadata tag and harden deferred-tool setup (#3370)
Follow-up to #3342 (deferred MCP tool loading). Maintainability cleanup plus
hardening of malformed/empty tool_search queries; no change to the deferral
mechanism or search ranking.

- Add deerflow/tools/mcp_metadata.py as the single source of truth for the
  "deerflow_mcp" tag (MCP_TOOL_METADATA_KEY + tag_mcp_tool + public
  is_mcp_tool). Removes the duplicated magic string and the private,
  cross-module _is_mcp_tool import.
- tool_search.search: never raise on model-generated input. Extract
  _compile_catalog_regex (shared compile-with-literal-fallback); return empty
  for empty/whitespace queries and a bare "+" instead of matching everything
  or raising IndexError.
- DeferredToolSetup: document the empty-vs-populated invariant.
- build_deferred_tool_setup: comment the two distinct empty-return branches.
- _assemble_deferred: add return type, rename local to deferred_setup, build
  the final list with an explicit append.
- Tests: use tag_mcp_tool instead of per-file tag helpers; cover empty and
  bare-"+" queries.
2026-06-05 15:21:41 +08:00

182 lines
7.1 KiB
Python

"""Tool search — deferred tool discovery at runtime.
Contains:
- DeferredToolCatalog: immutable, searchable catalog of deferred tools.
- build_tool_search_tool: builds the `tool_search` tool as a closure over a
catalog; it records promotions into graph state via ``Command``.
- build_deferred_tool_setup: assembles the catalog + tool from a
policy-filtered tool list (call AFTER tool-policy filtering).
The agent sees deferred tool names in <available-deferred-tools> but cannot
call them until it fetches their full schema via the tool_search tool. The
deferred set rides on a build-time closure and promotion lives in per-thread
graph state — there is no ContextVar. Source-agnostic: a tool is "deferred"
when it carries the ``deerflow_mcp`` metadata tag.
"""
import hashlib
import json
import logging
import re
from dataclasses import dataclass
from functools import cached_property
from typing import Annotated
from langchain.tools import BaseTool
from langchain_core.messages import ToolMessage
from langchain_core.tools import InjectedToolCallId, tool
from langchain_core.utils.function_calling import convert_to_openai_function
from langgraph.types import Command
from deerflow.tools.mcp_metadata import is_mcp_tool
logger = logging.getLogger(__name__)
MAX_RESULTS = 5 # Max tools returned per search
def _compile_catalog_regex(pattern: str) -> re.Pattern[str]:
"""Compile ``pattern`` case-insensitively, falling back to a literal match.
Search queries come from the model, so an invalid regex (e.g. an unbalanced
paren) must degrade to a literal substring match rather than raise.
"""
try:
return re.compile(pattern, re.IGNORECASE)
except re.error:
return re.compile(re.escape(pattern), re.IGNORECASE)
# ── Catalog ──
# NOTE: frozen=True without slots=True keeps __dict__, which is what lets the
# @cached_property fields below cache (they write to instance.__dict__, bypassing
# the frozen __setattr__). Do NOT add slots=True or hash/names break at runtime.
@dataclass(frozen=True)
class DeferredToolCatalog:
"""Immutable catalog of deferred tools. Pure search, no mutation."""
tools: tuple[BaseTool, ...]
@cached_property
def names(self) -> frozenset[str]:
return frozenset(t.name for t in self.tools)
@cached_property
def hash(self) -> str:
canon = [{"name": t.name, "schema": convert_to_openai_function(t)} for t in sorted(self.tools, key=lambda t: t.name)]
blob = json.dumps(canon, sort_keys=True, ensure_ascii=False, default=str)
return hashlib.sha256(blob.encode("utf-8")).hexdigest()[:16]
def search(self, query: str) -> list[BaseTool]:
query = query.strip()
if not query:
return []
if query.startswith("select:"):
wanted = {n.strip() for n in query[7:].split(",")}
return [t for t in self.tools if t.name in wanted][:MAX_RESULTS]
if query.startswith("+"):
parts = query[1:].split(None, 1)
if not parts:
return [] # bare "+" with no required token — nothing to require
required = parts[0].lower()
candidates = [t for t in self.tools if required in t.name.lower()]
if len(parts) > 1:
candidates.sort(key=lambda t: _catalog_regex_score(parts[1], t), reverse=True)
return candidates[:MAX_RESULTS]
regex = _compile_catalog_regex(query)
scored: list[tuple[int, BaseTool]] = []
for t in self.tools:
searchable = f"{t.name} {t.description or ''}"
if regex.search(searchable):
scored.append((2 if regex.search(t.name) else 1, t))
scored.sort(key=lambda x: x[0], reverse=True)
return [t for _, t in scored][:MAX_RESULTS]
def _catalog_regex_score(pattern: str, t: BaseTool) -> int:
regex = _compile_catalog_regex(pattern)
return len(regex.findall(f"{t.name} {t.description or ''}"))
# ── Setup / tool ──
@dataclass(frozen=True)
class DeferredToolSetup:
"""Result of assembling deferred-tool support for one agent build.
The three fields move as a unit, so callers branch on ``tool_search_tool``:
- **Empty** ``(None, frozenset(), None)``: deferral is disabled, or no MCP
tool survived policy filtering. Nothing is deferred — bind tools as-is.
- **Populated**: ``tool_search_tool`` is appended to the agent's tools,
``deferred_names`` are withheld from the model until promoted, and
``catalog_hash`` scopes those promotions in graph state.
Invariant: ``tool_search_tool is None`` ⟺ ``deferred_names`` is empty ⟺
``catalog_hash is None``.
"""
tool_search_tool: BaseTool | None
deferred_names: frozenset[str]
catalog_hash: str | None
def build_tool_search_tool(catalog: DeferredToolCatalog) -> BaseTool:
catalog_hash = catalog.hash
@tool
def tool_search(query: str, tool_call_id: Annotated[str, InjectedToolCallId]) -> Command:
"""Fetches full schema definitions for deferred tools so they can be called.
Deferred tools appear by name in <available-deferred-tools> in the system
prompt. Until fetched, only the name is known. This tool matches a query
against the deferred tools and returns the matched tools complete schemas;
once returned, a tool becomes callable.
Query forms:
- "select:Read,Edit" -- fetch these exact tools by name
- "notebook jupyter" -- keyword search, up to max_results best matches
- "+slack send" -- require "slack" in the name, rank by remaining terms
"""
matched = catalog.search(query)[:MAX_RESULTS]
if not matched:
content, names = f"No tools found matching: {query}", []
else:
content = json.dumps([convert_to_openai_function(t) for t in matched], indent=2, ensure_ascii=False)
names = [t.name for t in matched]
return Command(
update={
"promoted": {"catalog_hash": catalog_hash, "names": names},
"messages": [ToolMessage(content=content, tool_call_id=tool_call_id, name="tool_search")],
}
)
return tool_search
def build_deferred_tool_setup(filtered_tools: list[BaseTool], *, enabled: bool) -> DeferredToolSetup:
"""Build the deferred-tool setup from a POLICY-FILTERED tool list.
Must be called after skill/agent tool-policy filtering so the catalog never
exposes a tool the current agent is not allowed to use.
Returns an empty setup (see :class:`DeferredToolSetup`) in two distinct
cases: deferral is disabled, or it is enabled but no MCP tool survived
filtering.
"""
if not enabled:
# Deferral disabled: defer nothing; the model binds every tool as before.
return DeferredToolSetup(None, frozenset(), None)
deferred = [t for t in filtered_tools if is_mcp_tool(t)]
if not deferred:
# Enabled, but no MCP tool to defer: same empty result, different reason.
return DeferredToolSetup(None, frozenset(), None)
catalog = DeferredToolCatalog(tuple(deferred))
return DeferredToolSetup(build_tool_search_tool(catalog), catalog.names, catalog.hash)