mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-22 16:06:50 +00:00
fix(tools): preserve tool_search promotions across re-entrant get_available_tools (#2885)
* fix(tools): preserve tool_search promotions across re-entrant get_available_tools Closes #2884. ``get_available_tools`` used to unconditionally call ``reset_deferred_registry()`` and rebuild a fresh ``DeferredToolRegistry`` on every invocation. That works for the first call of a request (the ContextVar starts at its default of ``None``), but any RE-ENTRANT call during the same async context — e.g. ``task_tool`` building a subagent's toolset, or a custom middleware that rebuilds tools mid-run — wiped any ``tool_search`` promotions the parent agent had already made. The ``DeferredToolFilterMiddleware`` would then re-hide those tools from the next model call, leaving the agent able to see a tool's name (via the prior ``tool_search`` result that's still in conversation history) but unable to invoke it. Fix: when the ContextVar already holds a registry, reuse it instead of rebuilding. Fresh requests still get a fresh registry because each new graph run starts in a new asyncio task with the ContextVar at ``None``. ## Verification - Unit-level reproduction (``test_get_available_tools_resets_registry_wiping_promotion``): promote a tool in the registry, call ``get_available_tools`` again, assert the promotion is preserved. Fails on main, passes on this branch. - Graph-execution reproduction (two tests): drive a real ``langchain.agents.create_agent`` graph with the real ``DeferredToolFilterMiddleware`` through two model turns, including one that issues a re-entrant ``get_available_tools`` call to simulate the task_tool subagent path. - Real-LLM end-to-end (``test_deferred_tool_promotion_real_llm.py``, opt-in via ``ONEAPI_E2E=1``): drives the same flow against a real OpenAI-compatible model (verified on GPT-5.4-mini through the one-api gateway), watches the model call the promoted ``fake_calculator`` through the deferred-filter middleware, and asserts the right arithmetic result. Passes against the fixed branch. - Companion update to ``test_tool_deduplication.py``: dropped the ``@patch("deerflow.tools.tools.reset_deferred_registry")`` decorators because the symbol is no longer imported there. - Test fixtures in the new files patch ``deerflow.tools.tools.get_app_config`` with a minimal ``model_construct``-ed ``AppConfig`` instead of calling the real loader, so they never trigger ``_apply_singleton_configs`` and never leak ``_memory_config``/``_title_config``/… mutations into the rest of the suite. Full backend suite: 3208 passed / 14 skipped / 0 failed. ruff check + format clean. * fix(tools): address Copilot review on #2885 - tools.py: rewrite the reuse-path comment to spell out (a) why we don't reconcile the registry against the current ``mcp_tools`` snapshot — the MCP cache doesn't refresh mid-graph-run, the lead agent's ``ToolNode`` is already bound to the previous tool set anyway, and ``promote()`` drops the entry so a naive re-sync misclassifies promotions as new tools — and (b) why the log uses ``max(0, …)`` to avoid negative counts when the cache shrinks between snapshots. - Replace direct ``ts_mod._registry_var.set(None)`` in test fixtures with the public ``reset_deferred_registry()`` helper so tests don't couple to module internals. - Correct the docstring path in ``test_deferred_tool_registry_promotion.py`` to match the actual monkeypatch target (``deerflow.mcp.cache.get_cached_mcp_tools``). - Rename ``test_get_available_tools_resets_registry_wiping_promotion`` to ``test_get_available_tools_preserves_promotions_across_reentrant_calls`` so the test name describes the contract being asserted, not the bug it originally reproduced. Full backend suite: 3208 passed / 14 skipped. Real-LLM e2e: 1 passed.
This commit is contained in:
@@ -7,7 +7,7 @@ from deerflow.config.app_config import AppConfig
|
|||||||
from deerflow.reflection import resolve_variable
|
from deerflow.reflection import resolve_variable
|
||||||
from deerflow.sandbox.security import is_host_bash_allowed
|
from deerflow.sandbox.security import is_host_bash_allowed
|
||||||
from deerflow.tools.builtins import ask_clarification_tool, present_file_tool, task_tool, view_image_tool
|
from deerflow.tools.builtins import ask_clarification_tool, present_file_tool, task_tool, view_image_tool
|
||||||
from deerflow.tools.builtins.tool_search import reset_deferred_registry
|
from deerflow.tools.builtins.tool_search import get_deferred_registry
|
||||||
from deerflow.tools.sync import make_sync_tool_wrapper
|
from deerflow.tools.sync import make_sync_tool_wrapper
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -116,8 +116,6 @@ def get_available_tools(
|
|||||||
# made through the Gateway API (which runs in a separate process) are immediately
|
# made through the Gateway API (which runs in a separate process) are immediately
|
||||||
# reflected when loading MCP tools.
|
# reflected when loading MCP tools.
|
||||||
mcp_tools = []
|
mcp_tools = []
|
||||||
# Reset deferred registry upfront to prevent stale state from previous calls
|
|
||||||
reset_deferred_registry()
|
|
||||||
if include_mcp:
|
if include_mcp:
|
||||||
try:
|
try:
|
||||||
from deerflow.config.extensions_config import ExtensionsConfig
|
from deerflow.config.extensions_config import ExtensionsConfig
|
||||||
@@ -135,12 +133,51 @@ def get_available_tools(
|
|||||||
from deerflow.tools.builtins.tool_search import DeferredToolRegistry, set_deferred_registry
|
from deerflow.tools.builtins.tool_search import DeferredToolRegistry, set_deferred_registry
|
||||||
from deerflow.tools.builtins.tool_search import tool_search as tool_search_tool
|
from deerflow.tools.builtins.tool_search import tool_search as tool_search_tool
|
||||||
|
|
||||||
registry = DeferredToolRegistry()
|
# Reuse the existing registry if one is already set for
|
||||||
for t in mcp_tools:
|
# this async context. ``get_available_tools`` is
|
||||||
registry.register(t)
|
# re-entered whenever a subagent is spawned
|
||||||
set_deferred_registry(registry)
|
# (``task_tool`` calls it to build the child agent's
|
||||||
|
# toolset), and previously we used to unconditionally
|
||||||
|
# rebuild the registry — wiping out the parent agent's
|
||||||
|
# tool_search promotions. The
|
||||||
|
# ``DeferredToolFilterMiddleware`` then re-hid those
|
||||||
|
# tools from subsequent model calls, leaving the agent
|
||||||
|
# able to see a tool's name but unable to invoke it
|
||||||
|
# (issue #2884). ``contextvars`` already gives us the
|
||||||
|
# lifetime semantics we want: a fresh request / graph
|
||||||
|
# run starts in a new asyncio task with the
|
||||||
|
# ContextVar at its default of ``None``, so reuse is
|
||||||
|
# only triggered for re-entrant calls inside one run.
|
||||||
|
#
|
||||||
|
# Intentionally NOT reconciling against the current
|
||||||
|
# ``mcp_tools`` snapshot. The MCP cache only refreshes
|
||||||
|
# on ``extensions_config.json`` mtime changes, which
|
||||||
|
# in practice happens between graph runs — not inside
|
||||||
|
# one. And even if a refresh did happen mid-run, the
|
||||||
|
# already-built lead agent's ``ToolNode`` still holds
|
||||||
|
# the *previous* tool set (LangGraph binds tools at
|
||||||
|
# graph construction time), so a brand-new MCP tool
|
||||||
|
# couldn't actually be invoked anyway. The
|
||||||
|
# ``DeferredToolRegistry`` doesn't retain the names
|
||||||
|
# of previously-promoted tools (``promote()`` drops
|
||||||
|
# the entry entirely), so re-syncing the registry
|
||||||
|
# against a fresh ``mcp_tools`` list would
|
||||||
|
# mis-classify those promotions as new tools and
|
||||||
|
# re-register them as deferred — exactly the bug
|
||||||
|
# this fix exists to prevent.
|
||||||
|
existing_registry = get_deferred_registry()
|
||||||
|
if existing_registry is None:
|
||||||
|
registry = DeferredToolRegistry()
|
||||||
|
for t in mcp_tools:
|
||||||
|
registry.register(t)
|
||||||
|
set_deferred_registry(registry)
|
||||||
|
logger.info(f"Tool search active: {len(mcp_tools)} tools deferred")
|
||||||
|
else:
|
||||||
|
mcp_tool_names = {t.name for t in mcp_tools}
|
||||||
|
still_deferred = len(existing_registry)
|
||||||
|
promoted_count = max(0, len(mcp_tool_names) - still_deferred)
|
||||||
|
logger.info(f"Tool search active (preserved promotions): {still_deferred} tools deferred, {promoted_count} already promoted")
|
||||||
builtin_tools.append(tool_search_tool)
|
builtin_tools.append(tool_search_tool)
|
||||||
logger.info(f"Tool search active: {len(mcp_tools)} tools deferred")
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning("MCP module not available. Install 'langchain-mcp-adapters' package to enable MCP tools.")
|
logger.warning("MCP module not available. Install 'langchain-mcp-adapters' package to enable MCP tools.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -0,0 +1,222 @@
|
|||||||
|
"""Real-LLM end-to-end verification for issue #2884.
|
||||||
|
|
||||||
|
Drives a real ``langchain.agents.create_agent`` graph against a real OpenAI-
|
||||||
|
compatible LLM (one-api gateway), bound through ``DeferredToolFilterMiddleware``
|
||||||
|
and the production ``get_available_tools`` pipeline. The only thing we mock is
|
||||||
|
the MCP tool source — we hand-roll two ``@tool``s and inject them through
|
||||||
|
``deerflow.mcp.cache.get_cached_mcp_tools``.
|
||||||
|
|
||||||
|
The flow exercised:
|
||||||
|
1. Turn 1: agent sees ``tool_search`` (plus a ``fake_subagent_trigger``
|
||||||
|
that re-enters ``get_available_tools`` on the same task — this is the
|
||||||
|
code path issue #2884 reports). It must call ``tool_search`` to
|
||||||
|
discover the deferred ``fake_calculator`` tool.
|
||||||
|
2. Tool batch: ``tool_search`` promotes ``fake_calculator``;
|
||||||
|
``fake_subagent_trigger`` re-enters ``get_available_tools``.
|
||||||
|
3. Turn 2: the promoted ``fake_calculator`` schema must reach the model
|
||||||
|
so it can actually call it. Without this PR's fix, the re-entry wipes
|
||||||
|
the promotion and the model can no longer invoke the tool.
|
||||||
|
|
||||||
|
Skipped unless ``ONEAPI_E2E=1`` is set so this doesn't burn credits on every
|
||||||
|
test run. Run with::
|
||||||
|
|
||||||
|
ONEAPI_E2E=1 OPENAI_API_KEY=... OPENAI_API_BASE=... \
|
||||||
|
PYTHONPATH=. uv run pytest \
|
||||||
|
tests/test_deferred_tool_promotion_real_llm.py -v -s
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from langchain_core.messages import HumanMessage
|
||||||
|
from langchain_core.tools import tool as as_tool
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Skip control: only run when explicitly opted in.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.skipif(
|
||||||
|
os.getenv("ONEAPI_E2E") != "1",
|
||||||
|
reason="Real-LLM e2e: opt in with ONEAPI_E2E=1 (requires OPENAI_API_KEY + OPENAI_API_BASE)",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fake "MCP" tools the agent should discover via tool_search.
|
||||||
|
# Keep them obviously synthetic so the model can pattern-match the search.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
_calls: list[str] = []
|
||||||
|
|
||||||
|
|
||||||
|
@as_tool
|
||||||
|
def fake_calculator(expression: str) -> str:
|
||||||
|
"""Evaluate a tiny arithmetic expression like '2 + 2'.
|
||||||
|
|
||||||
|
Reserved for the user — only call this if the user asks for arithmetic.
|
||||||
|
"""
|
||||||
|
_calls.append(f"fake_calculator:{expression}")
|
||||||
|
try:
|
||||||
|
# Trivially safe-eval just for the e2e check
|
||||||
|
allowed = set("0123456789+-*/() .")
|
||||||
|
if not set(expression) <= allowed:
|
||||||
|
return "expression contains disallowed characters"
|
||||||
|
return str(eval(expression, {"__builtins__": {}}, {})) # noqa: S307
|
||||||
|
except Exception as e:
|
||||||
|
return f"error: {e}"
|
||||||
|
|
||||||
|
|
||||||
|
@as_tool
|
||||||
|
def fake_translator(text: str, target_lang: str) -> str:
|
||||||
|
"""Translate text into the given language code. Decorative — not used."""
|
||||||
|
_calls.append(f"fake_translator:{text}:{target_lang}")
|
||||||
|
return f"[{target_lang}] {text}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pipeline wiring (same shape as the in-process tests).
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _reset_registry_between_tests():
|
||||||
|
from deerflow.tools.builtins.tool_search import reset_deferred_registry
|
||||||
|
|
||||||
|
reset_deferred_registry()
|
||||||
|
yield
|
||||||
|
reset_deferred_registry()
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_mcp_pipeline(monkeypatch: pytest.MonkeyPatch, mcp_tools: list) -> None:
|
||||||
|
from deerflow.config.extensions_config import ExtensionsConfig, McpServerConfig
|
||||||
|
|
||||||
|
real_ext = ExtensionsConfig(
|
||||||
|
mcpServers={"fake-server": McpServerConfig(type="stdio", command="echo", enabled=True)},
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"deerflow.config.extensions_config.ExtensionsConfig.from_file",
|
||||||
|
classmethod(lambda cls: real_ext),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr("deerflow.mcp.cache.get_cached_mcp_tools", lambda: list(mcp_tools))
|
||||||
|
|
||||||
|
|
||||||
|
def _force_tool_search_enabled(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
"""Build a minimal mock AppConfig and patch the symbol — never call the
|
||||||
|
real loader, which would trigger ``_apply_singleton_configs`` and
|
||||||
|
permanently mutate cross-test singletons (memory, title, …)."""
|
||||||
|
from deerflow.config.app_config import AppConfig
|
||||||
|
from deerflow.config.tool_search_config import ToolSearchConfig
|
||||||
|
|
||||||
|
mock_cfg = AppConfig.model_construct(
|
||||||
|
log_level="info",
|
||||||
|
models=[],
|
||||||
|
tools=[],
|
||||||
|
tool_groups=[],
|
||||||
|
sandbox=AppConfig.model_fields["sandbox"].annotation.model_construct(use="x"),
|
||||||
|
tool_search=ToolSearchConfig(enabled=True),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr("deerflow.tools.tools.get_app_config", lambda: mock_cfg)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Real-LLM e2e test
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_real_llm_promotes_then_invokes_with_subagent_reentry(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
"""End-to-end against a real OpenAI-compatible LLM.
|
||||||
|
|
||||||
|
The model must:
|
||||||
|
Turn 1 — see ``tool_search`` (deferred tools aren't bound yet) and
|
||||||
|
batch-call BOTH ``tool_search(select:fake_calculator)`` AND
|
||||||
|
``fake_subagent_trigger(...)``.
|
||||||
|
Turn 2 — call ``fake_calculator`` and finish.
|
||||||
|
|
||||||
|
Pass criterion: ``fake_calculator`` actually gets invoked at the tool
|
||||||
|
layer — recorded in ``_calls`` — which proves the model received the
|
||||||
|
promoted schema after the re-entrant ``get_available_tools`` call.
|
||||||
|
"""
|
||||||
|
from langchain.agents import create_agent
|
||||||
|
from langchain_openai import ChatOpenAI
|
||||||
|
|
||||||
|
from deerflow.agents.middlewares.deferred_tool_filter_middleware import DeferredToolFilterMiddleware
|
||||||
|
from deerflow.tools.tools import get_available_tools
|
||||||
|
|
||||||
|
_patch_mcp_pipeline(monkeypatch, [fake_calculator, fake_translator])
|
||||||
|
_force_tool_search_enabled(monkeypatch)
|
||||||
|
_calls.clear()
|
||||||
|
|
||||||
|
@as_tool
|
||||||
|
async def fake_subagent_trigger(prompt: str) -> str:
|
||||||
|
"""Pretend to spawn a subagent. Internally rebuilds the toolset.
|
||||||
|
|
||||||
|
Use this whenever the user asks you to delegate work — pass a short
|
||||||
|
description as ``prompt``.
|
||||||
|
"""
|
||||||
|
# ``task_tool`` does this internally. Whether the registry-reset that
|
||||||
|
# used to happen here actually leaks back to the parent task depends
|
||||||
|
# on asyncio's implicit context-copying semantics (gather creates
|
||||||
|
# child tasks with copied contexts, so reset_deferred_registry is
|
||||||
|
# task-local) — but the fix in this PR is what GUARANTEES the
|
||||||
|
# promotion sticks regardless of which integration path triggers a
|
||||||
|
# re-entrant ``get_available_tools`` call.
|
||||||
|
get_available_tools(subagent_enabled=False)
|
||||||
|
_calls.append(f"fake_subagent_trigger:{prompt}")
|
||||||
|
return "subagent completed"
|
||||||
|
|
||||||
|
tools = get_available_tools() + [fake_subagent_trigger]
|
||||||
|
|
||||||
|
model = ChatOpenAI(
|
||||||
|
model=os.environ.get("ONEAPI_MODEL", "claude-sonnet-4-6"),
|
||||||
|
api_key=os.environ["OPENAI_API_KEY"],
|
||||||
|
base_url=os.environ["OPENAI_API_BASE"],
|
||||||
|
temperature=0,
|
||||||
|
max_retries=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_prompt = (
|
||||||
|
"You are a meticulous assistant. Available deferred tools include a "
|
||||||
|
"calculator and a translator — their schemas are hidden until you "
|
||||||
|
"search for them via tool_search.\n\n"
|
||||||
|
"Procedure for the user's request:\n"
|
||||||
|
" 1. Call tool_search with query 'select:fake_calculator' AND "
|
||||||
|
"in the SAME tool batch also call fake_subagent_trigger(prompt='go') "
|
||||||
|
"to delegate the side work. Put both tool_calls in your first response.\n"
|
||||||
|
" 2. After both tool messages come back, call fake_calculator with "
|
||||||
|
"the user's expression.\n"
|
||||||
|
" 3. Reply with just the numeric result."
|
||||||
|
)
|
||||||
|
|
||||||
|
graph = create_agent(
|
||||||
|
model=model,
|
||||||
|
tools=tools,
|
||||||
|
middleware=[DeferredToolFilterMiddleware()],
|
||||||
|
system_prompt=system_prompt,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = await graph.ainvoke(
|
||||||
|
{"messages": [HumanMessage(content="What is 17 * 23? Use the deferred calculator tool.")]},
|
||||||
|
config={"recursion_limit": 12},
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n=== tool calls recorded ===")
|
||||||
|
for c in _calls:
|
||||||
|
print(f" {c}")
|
||||||
|
print("\n=== final message ===")
|
||||||
|
final_text = result["messages"][-1].content if result["messages"] else "(none)"
|
||||||
|
print(f" {final_text!r}")
|
||||||
|
|
||||||
|
# The smoking-gun assertion: fake_calculator was actually invoked at the
|
||||||
|
# tool layer. This is only possible if the promoted schema reached the
|
||||||
|
# model in turn 2, despite the subagent-style re-entry in turn 1.
|
||||||
|
calc_calls = [c for c in _calls if c.startswith("fake_calculator:")]
|
||||||
|
assert calc_calls, f"REGRESSION (#2884): the model never managed to call fake_calculator. All recorded tool calls: {_calls!r}. Final text: {final_text!r}"
|
||||||
|
|
||||||
|
# And the math should actually be done correctly (sanity that the LLM
|
||||||
|
# really used the result, not just hallucinated the answer).
|
||||||
|
assert "391" in str(final_text), f"Model didn't surface 17*23=391. Final text: {final_text!r}"
|
||||||
@@ -0,0 +1,390 @@
|
|||||||
|
"""Reproduce + regression-guard issue #2884.
|
||||||
|
|
||||||
|
Hypothesis from the issue:
|
||||||
|
``tools.tools.get_available_tools`` unconditionally calls
|
||||||
|
``reset_deferred_registry()`` and constructs a fresh ``DeferredToolRegistry``
|
||||||
|
every time it is invoked. If anything calls ``get_available_tools`` again
|
||||||
|
during the same async context (after the agent has promoted tools via
|
||||||
|
``tool_search``), the promotion is wiped and the next model call hides the
|
||||||
|
tool's schema again.
|
||||||
|
|
||||||
|
These tests pin two things:
|
||||||
|
|
||||||
|
A. **At the unit boundary** — verify the failure mode directly. Promote a
|
||||||
|
tool in the registry, then call ``get_available_tools`` again and observe
|
||||||
|
that the ContextVar registry is reset and the promotion is lost.
|
||||||
|
|
||||||
|
B. **At the graph-execution boundary** — drive a real ``create_agent`` graph
|
||||||
|
with the real ``DeferredToolFilterMiddleware`` through two model turns.
|
||||||
|
The first turn calls ``tool_search`` which promotes a tool. The second
|
||||||
|
turn must see that tool's schema in ``request.tools``. If
|
||||||
|
``get_available_tools`` were to run again between the two turns and reset
|
||||||
|
the registry, the second turn's filter would strip the tool.
|
||||||
|
|
||||||
|
Strategy: use the production ``deerflow.tools.tools.get_available_tools``
|
||||||
|
unmodified; mock only the LLM and the MCP tool source. Patch
|
||||||
|
``deerflow.mcp.cache.get_cached_mcp_tools`` (the symbol that
|
||||||
|
``get_available_tools`` resolves via lazy import) to return our fixture
|
||||||
|
tools so we don't need a real MCP server.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from langchain_core.language_models.fake_chat_models import FakeMessagesListChatModel
|
||||||
|
from langchain_core.messages import AIMessage, HumanMessage
|
||||||
|
from langchain_core.runnables import Runnable
|
||||||
|
from langchain_core.tools import tool as as_tool
|
||||||
|
|
||||||
|
|
||||||
|
class FakeToolCallingModel(FakeMessagesListChatModel):
|
||||||
|
"""FakeMessagesListChatModel + no-op bind_tools so create_agent works."""
|
||||||
|
|
||||||
|
def bind_tools( # type: ignore[override]
|
||||||
|
self,
|
||||||
|
tools: Any,
|
||||||
|
*,
|
||||||
|
tool_choice: Any = None,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> Runnable:
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures: a fake MCP tool source + a way to force config.tool_search.enabled
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@as_tool
|
||||||
|
def fake_mcp_search(query: str) -> str:
|
||||||
|
"""Pretend to search a knowledge base for the given query."""
|
||||||
|
return f"results for {query}"
|
||||||
|
|
||||||
|
|
||||||
|
@as_tool
|
||||||
|
def fake_mcp_fetch(url: str) -> str:
|
||||||
|
"""Pretend to fetch a page at the given URL."""
|
||||||
|
return f"content of {url}"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _supply_env(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
"""config.yaml references $OPENAI_API_KEY at parse time; supply a placeholder."""
|
||||||
|
monkeypatch.setenv("OPENAI_API_KEY", "sk-fake-not-used")
|
||||||
|
monkeypatch.setenv("OPENAI_API_BASE", "https://example.invalid")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _reset_deferred_registry_between_tests():
|
||||||
|
"""Each test must start with a clean ContextVar.
|
||||||
|
|
||||||
|
The registry lives in a module-level ContextVar with no per-task isolation
|
||||||
|
in a synchronous test runner, so one test's promotion can leak into the
|
||||||
|
next and silently break filter assertions.
|
||||||
|
"""
|
||||||
|
from deerflow.tools.builtins.tool_search import reset_deferred_registry
|
||||||
|
|
||||||
|
reset_deferred_registry()
|
||||||
|
yield
|
||||||
|
reset_deferred_registry()
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_mcp_pipeline(monkeypatch: pytest.MonkeyPatch, mcp_tools: list) -> None:
|
||||||
|
"""Make get_available_tools believe an MCP server is registered.
|
||||||
|
|
||||||
|
Build a real ``ExtensionsConfig`` with one enabled MCP server entry so
|
||||||
|
that both ``AppConfig.from_file`` (which calls
|
||||||
|
``ExtensionsConfig.from_file().model_dump()``) and ``tools.get_available_tools``
|
||||||
|
(which calls ``ExtensionsConfig.from_file().get_enabled_mcp_servers()``)
|
||||||
|
see a valid instance. Then point the MCP tool cache at our fixture tools.
|
||||||
|
"""
|
||||||
|
from deerflow.config.extensions_config import ExtensionsConfig, McpServerConfig
|
||||||
|
|
||||||
|
real_ext = ExtensionsConfig(
|
||||||
|
mcpServers={"fake-server": McpServerConfig(type="stdio", command="echo", enabled=True)},
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"deerflow.config.extensions_config.ExtensionsConfig.from_file",
|
||||||
|
classmethod(lambda cls: real_ext),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr("deerflow.mcp.cache.get_cached_mcp_tools", lambda: list(mcp_tools))
|
||||||
|
|
||||||
|
|
||||||
|
def _force_tool_search_enabled(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
"""Force config.tool_search.enabled=True without touching the yaml.
|
||||||
|
|
||||||
|
Calling the real ``get_app_config()`` would trigger ``_apply_singleton_configs``
|
||||||
|
which permanently mutates module-level singletons (``_memory_config``,
|
||||||
|
``_title_config``, …) to match the developer's ``config.yaml`` — even
|
||||||
|
after pytest restores our patch. That leaks across tests later in the
|
||||||
|
run that rely on those singletons' DEFAULTS (e.g. memory queue tests
|
||||||
|
require ``_memory_config.enabled = True``, which is the dataclass default
|
||||||
|
but FALSE in the actual yaml).
|
||||||
|
|
||||||
|
Build a minimal mock AppConfig instead and never call the real loader.
|
||||||
|
"""
|
||||||
|
from deerflow.config.app_config import AppConfig
|
||||||
|
from deerflow.config.tool_search_config import ToolSearchConfig
|
||||||
|
|
||||||
|
mock_cfg = AppConfig.model_construct(
|
||||||
|
log_level="info",
|
||||||
|
models=[],
|
||||||
|
tools=[],
|
||||||
|
tool_groups=[],
|
||||||
|
sandbox=AppConfig.model_fields["sandbox"].annotation.model_construct(use="x"),
|
||||||
|
tool_search=ToolSearchConfig(enabled=True),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr("deerflow.tools.tools.get_app_config", lambda: mock_cfg)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Section A — direct unit-level reproduction
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_available_tools_preserves_promotions_across_reentrant_calls(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
"""Re-entrant ``get_available_tools()`` must preserve prior promotions.
|
||||||
|
|
||||||
|
Step 1: call get_available_tools() — registers MCP tools as deferred.
|
||||||
|
Step 2: simulate the agent calling tool_search by promoting one tool.
|
||||||
|
Step 3: call get_available_tools() again (the same code path
|
||||||
|
``task_tool`` exercises mid-run).
|
||||||
|
|
||||||
|
Assertion: after step 3, the promoted tool is STILL promoted (not
|
||||||
|
re-deferred). On ``main`` before the fix, step 3's
|
||||||
|
``reset_deferred_registry()`` wiped the promotion and re-registered
|
||||||
|
every MCP tool as deferred — this assertion fired with
|
||||||
|
``REGRESSION (#2884)``.
|
||||||
|
"""
|
||||||
|
from deerflow.tools.builtins.tool_search import get_deferred_registry
|
||||||
|
from deerflow.tools.tools import get_available_tools
|
||||||
|
|
||||||
|
_patch_mcp_pipeline(monkeypatch, [fake_mcp_search, fake_mcp_fetch])
|
||||||
|
_force_tool_search_enabled(monkeypatch)
|
||||||
|
|
||||||
|
# Step 1: first call — both MCP tools start deferred
|
||||||
|
get_available_tools()
|
||||||
|
reg1 = get_deferred_registry()
|
||||||
|
assert reg1 is not None
|
||||||
|
assert {e.name for e in reg1.entries} == {"fake_mcp_search", "fake_mcp_fetch"}
|
||||||
|
|
||||||
|
# Step 2: simulate tool_search promoting one of them
|
||||||
|
reg1.promote({"fake_mcp_search"})
|
||||||
|
assert {e.name for e in reg1.entries} == {"fake_mcp_fetch"}, "Sanity: promote should remove fake_mcp_search"
|
||||||
|
|
||||||
|
# Step 3: second call — registry must NOT silently undo the promotion
|
||||||
|
get_available_tools()
|
||||||
|
reg2 = get_deferred_registry()
|
||||||
|
assert reg2 is not None
|
||||||
|
deferred_after = {e.name for e in reg2.entries}
|
||||||
|
assert "fake_mcp_search" not in deferred_after, f"REGRESSION (#2884): get_available_tools wiped the deferred registry, re-deferring a tool that was already promoted by tool_search. deferred_after_second_call={deferred_after!r}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Section B — graph-execution reproduction
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class _ToolSearchPromotingModel(FakeToolCallingModel):
|
||||||
|
"""Two-turn model that:
|
||||||
|
|
||||||
|
Turn 1 → emit a tool_call for ``tool_search`` (the real one)
|
||||||
|
Turn 2 → emit a tool_call for ``fake_mcp_search`` (the promoted tool)
|
||||||
|
|
||||||
|
Records the tools it received on each turn so the test can inspect what
|
||||||
|
DeferredToolFilterMiddleware actually fed to ``bind_tools``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
bound_tools_per_turn: list[list[str]] = []
|
||||||
|
|
||||||
|
def bind_tools( # type: ignore[override]
|
||||||
|
self,
|
||||||
|
tools: Any,
|
||||||
|
*,
|
||||||
|
tool_choice: Any = None,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> Runnable:
|
||||||
|
# Record the tool names the model would see in this turn
|
||||||
|
names = [getattr(t, "name", getattr(t, "__name__", repr(t))) for t in tools]
|
||||||
|
self.bound_tools_per_turn.append(names)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def _build_promoting_model() -> _ToolSearchPromotingModel:
|
||||||
|
return _ToolSearchPromotingModel(
|
||||||
|
responses=[
|
||||||
|
AIMessage(
|
||||||
|
content="",
|
||||||
|
tool_calls=[
|
||||||
|
{
|
||||||
|
"name": "tool_search",
|
||||||
|
"args": {"query": "select:fake_mcp_search"},
|
||||||
|
"id": "call_search_1",
|
||||||
|
"type": "tool_call",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
),
|
||||||
|
AIMessage(
|
||||||
|
content="",
|
||||||
|
tool_calls=[
|
||||||
|
{
|
||||||
|
"name": "fake_mcp_search",
|
||||||
|
"args": {"query": "hello"},
|
||||||
|
"id": "call_mcp_1",
|
||||||
|
"type": "tool_call",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
),
|
||||||
|
AIMessage(content="all done"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_promoted_tool_is_visible_to_model_on_second_turn(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
"""End-to-end: drive a real create_agent graph through two turns.
|
||||||
|
|
||||||
|
Without the fix, the second-turn bind_tools call should NOT contain
|
||||||
|
fake_mcp_search (because DeferredToolFilterMiddleware sees it in the
|
||||||
|
registry and strips it). With the fix, the model sees the schema and can
|
||||||
|
invoke it.
|
||||||
|
"""
|
||||||
|
from langchain.agents import create_agent
|
||||||
|
|
||||||
|
from deerflow.agents.middlewares.deferred_tool_filter_middleware import DeferredToolFilterMiddleware
|
||||||
|
from deerflow.tools.tools import get_available_tools
|
||||||
|
|
||||||
|
_patch_mcp_pipeline(monkeypatch, [fake_mcp_search, fake_mcp_fetch])
|
||||||
|
_force_tool_search_enabled(monkeypatch)
|
||||||
|
|
||||||
|
tools = get_available_tools()
|
||||||
|
# Sanity: the assembled tool list includes the deferred tools (they're in
|
||||||
|
# bind_tools but DeferredToolFilterMiddleware strips deferred ones before
|
||||||
|
# they reach the model)
|
||||||
|
tool_names = {getattr(t, "name", "") for t in tools}
|
||||||
|
assert {"tool_search", "fake_mcp_search", "fake_mcp_fetch"} <= tool_names
|
||||||
|
|
||||||
|
model = _build_promoting_model()
|
||||||
|
model.bound_tools_per_turn = [] # reset class-level recorder
|
||||||
|
|
||||||
|
graph = create_agent(
|
||||||
|
model=model,
|
||||||
|
tools=tools,
|
||||||
|
middleware=[DeferredToolFilterMiddleware()],
|
||||||
|
system_prompt="bug-2884-repro",
|
||||||
|
)
|
||||||
|
|
||||||
|
graph.invoke({"messages": [HumanMessage(content="use the search tool")]})
|
||||||
|
|
||||||
|
# Turn 1: model should NOT see fake_mcp_search (it's deferred)
|
||||||
|
turn1 = set(model.bound_tools_per_turn[0])
|
||||||
|
assert "fake_mcp_search" not in turn1, f"Turn 1 sanity: deferred tools must be hidden from the model. Saw: {turn1!r}"
|
||||||
|
assert "tool_search" in turn1, f"Turn 1 sanity: tool_search must be visible so the agent can discover. Saw: {turn1!r}"
|
||||||
|
|
||||||
|
# Turn 2: AFTER tool_search promotes fake_mcp_search, the model must see it.
|
||||||
|
# This is the load-bearing assertion for issue #2884.
|
||||||
|
assert len(model.bound_tools_per_turn) >= 2, f"Expected at least 2 model turns, got {len(model.bound_tools_per_turn)}"
|
||||||
|
turn2 = set(model.bound_tools_per_turn[1])
|
||||||
|
assert "fake_mcp_search" in turn2, f"REGRESSION (#2884): tool_search promoted fake_mcp_search in turn 1, but the deferred-tool filter still hid it from the model in turn 2. Turn 2 bound tools: {turn2!r}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Section C — the actual issue #2884 trigger: a re-entrant
|
||||||
|
# get_available_tools call (e.g. when task_tool spawns a subagent) must not
|
||||||
|
# wipe the parent's promotion.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_reentrant_get_available_tools_preserves_promotion(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
"""Issue #2884 in its real shape: a re-entrant get_available_tools call
|
||||||
|
(the same pattern that happens when ``task_tool`` builds a subagent's
|
||||||
|
toolset mid-run) must not wipe the parent agent's tool_search promotions.
|
||||||
|
|
||||||
|
Turn 1's tool batch contains BOTH ``tool_search`` (which promotes
|
||||||
|
``fake_mcp_search``) AND ``fake_subagent_trigger`` (which calls
|
||||||
|
``get_available_tools`` again — exactly what ``task_tool`` does when it
|
||||||
|
builds a subagent's toolset). With the fix, turn 2's bind_tools sees the
|
||||||
|
promoted tool. Without the fix, the re-entry wipes the registry and
|
||||||
|
the filter re-hides it.
|
||||||
|
"""
|
||||||
|
from langchain.agents import create_agent
|
||||||
|
|
||||||
|
from deerflow.agents.middlewares.deferred_tool_filter_middleware import DeferredToolFilterMiddleware
|
||||||
|
from deerflow.tools.tools import get_available_tools
|
||||||
|
|
||||||
|
_patch_mcp_pipeline(monkeypatch, [fake_mcp_search, fake_mcp_fetch])
|
||||||
|
_force_tool_search_enabled(monkeypatch)
|
||||||
|
|
||||||
|
# The trigger tool simulates what task_tool does internally: rebuild the
|
||||||
|
# toolset by calling get_available_tools while the registry is live.
|
||||||
|
@as_tool
|
||||||
|
def fake_subagent_trigger(prompt: str) -> str:
|
||||||
|
"""Pretend to spawn a subagent. Internally rebuilds the toolset."""
|
||||||
|
get_available_tools(subagent_enabled=False)
|
||||||
|
return f"spawned subagent for: {prompt}"
|
||||||
|
|
||||||
|
tools = get_available_tools() + [fake_subagent_trigger]
|
||||||
|
|
||||||
|
bound_per_turn: list[list[str]] = []
|
||||||
|
|
||||||
|
class _Model(FakeToolCallingModel):
|
||||||
|
def bind_tools(self, tools_arg, **kwargs): # type: ignore[override]
|
||||||
|
bound_per_turn.append([getattr(t, "name", repr(t)) for t in tools_arg])
|
||||||
|
return self
|
||||||
|
|
||||||
|
model = _Model(
|
||||||
|
responses=[
|
||||||
|
# Turn 1: do both in one batch — promote AND trigger the
|
||||||
|
# subagent-style rebuild. LangGraph executes them in order in the
|
||||||
|
# same agent step.
|
||||||
|
AIMessage(
|
||||||
|
content="",
|
||||||
|
tool_calls=[
|
||||||
|
{
|
||||||
|
"name": "tool_search",
|
||||||
|
"args": {"query": "select:fake_mcp_search"},
|
||||||
|
"id": "call_search_1",
|
||||||
|
"type": "tool_call",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "fake_subagent_trigger",
|
||||||
|
"args": {"prompt": "go"},
|
||||||
|
"id": "call_trigger_1",
|
||||||
|
"type": "tool_call",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
),
|
||||||
|
# Turn 2: try to invoke the promoted tool. The model gets this
|
||||||
|
# turn only if turn 1's bind_tools recorded what the filter sent.
|
||||||
|
AIMessage(
|
||||||
|
content="",
|
||||||
|
tool_calls=[
|
||||||
|
{
|
||||||
|
"name": "fake_mcp_search",
|
||||||
|
"args": {"query": "hello"},
|
||||||
|
"id": "call_mcp_1",
|
||||||
|
"type": "tool_call",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
),
|
||||||
|
AIMessage(content="all done"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
graph = create_agent(
|
||||||
|
model=model,
|
||||||
|
tools=tools,
|
||||||
|
middleware=[DeferredToolFilterMiddleware()],
|
||||||
|
system_prompt="bug-2884-subagent-repro",
|
||||||
|
)
|
||||||
|
graph.invoke({"messages": [HumanMessage(content="use the search tool")]})
|
||||||
|
|
||||||
|
# Turn 1 sanity: deferred tool not visible yet
|
||||||
|
assert "fake_mcp_search" not in set(bound_per_turn[0]), bound_per_turn[0]
|
||||||
|
|
||||||
|
# The smoking-gun assertion: turn 2 sees the promoted tool DESPITE the
|
||||||
|
# re-entrant get_available_tools call that happened in turn 1's tool batch.
|
||||||
|
assert len(bound_per_turn) >= 2, f"Expected ≥2 turns, got {len(bound_per_turn)}"
|
||||||
|
turn2 = set(bound_per_turn[1])
|
||||||
|
assert "fake_mcp_search" in turn2, f"REGRESSION (#2884): a re-entrant get_available_tools call (e.g. task_tool spawning a subagent) wiped the parent agent's promotion. Turn 2 bound tools: {turn2!r}"
|
||||||
@@ -65,8 +65,7 @@ def _make_minimal_config(tools):
|
|||||||
|
|
||||||
@patch("deerflow.tools.tools.get_app_config")
|
@patch("deerflow.tools.tools.get_app_config")
|
||||||
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
||||||
@patch("deerflow.tools.tools.reset_deferred_registry")
|
def test_config_loaded_async_only_tool_gets_sync_wrapper(mock_bash, mock_cfg):
|
||||||
def test_config_loaded_async_only_tool_gets_sync_wrapper(mock_reset, mock_bash, mock_cfg):
|
|
||||||
"""Config-loaded async-only tools can still be invoked by sync clients."""
|
"""Config-loaded async-only tools can still be invoked by sync clients."""
|
||||||
|
|
||||||
async def async_tool_impl(x: int) -> str:
|
async def async_tool_impl(x: int) -> str:
|
||||||
@@ -98,8 +97,7 @@ def test_config_loaded_async_only_tool_gets_sync_wrapper(mock_reset, mock_bash,
|
|||||||
|
|
||||||
@patch("deerflow.tools.tools.get_app_config")
|
@patch("deerflow.tools.tools.get_app_config")
|
||||||
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
||||||
@patch("deerflow.tools.tools.reset_deferred_registry")
|
def test_no_duplicates_returned(mock_bash, mock_cfg):
|
||||||
def test_no_duplicates_returned(mock_reset, mock_bash, mock_cfg):
|
|
||||||
"""get_available_tools() never returns two tools with the same name."""
|
"""get_available_tools() never returns two tools with the same name."""
|
||||||
mock_cfg.return_value = _make_minimal_config([])
|
mock_cfg.return_value = _make_minimal_config([])
|
||||||
|
|
||||||
@@ -113,8 +111,7 @@ def test_no_duplicates_returned(mock_reset, mock_bash, mock_cfg):
|
|||||||
|
|
||||||
@patch("deerflow.tools.tools.get_app_config")
|
@patch("deerflow.tools.tools.get_app_config")
|
||||||
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
||||||
@patch("deerflow.tools.tools.reset_deferred_registry")
|
def test_first_occurrence_wins(mock_bash, mock_cfg):
|
||||||
def test_first_occurrence_wins(mock_reset, mock_bash, mock_cfg):
|
|
||||||
"""When duplicates exist, the first occurrence is kept."""
|
"""When duplicates exist, the first occurrence is kept."""
|
||||||
mock_cfg.return_value = _make_minimal_config([])
|
mock_cfg.return_value = _make_minimal_config([])
|
||||||
|
|
||||||
@@ -132,8 +129,7 @@ def test_first_occurrence_wins(mock_reset, mock_bash, mock_cfg):
|
|||||||
|
|
||||||
@patch("deerflow.tools.tools.get_app_config")
|
@patch("deerflow.tools.tools.get_app_config")
|
||||||
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
@patch("deerflow.tools.tools.is_host_bash_allowed", return_value=True)
|
||||||
@patch("deerflow.tools.tools.reset_deferred_registry")
|
def test_duplicate_triggers_warning(mock_bash, mock_cfg, caplog):
|
||||||
def test_duplicate_triggers_warning(mock_reset, mock_bash, mock_cfg, caplog):
|
|
||||||
"""A warning is logged for every skipped duplicate."""
|
"""A warning is logged for every skipped duplicate."""
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user