From 05be7ea688ff9bc827dea8469c3935dfcef2eee6 Mon Sep 17 00:00:00 2001 From: DanielWalnut <45447813+hetaoBackend@users.noreply.github.com> Date: Tue, 16 Jun 2026 19:55:04 +0800 Subject: [PATCH] fix(subagents): raise general-purpose max_turns to 150 and default timeout to 30min (#3610) * fix(subagents): raise general-purpose max_turns to 150 and default timeout to 30min Deep-research subtasks failed out of the box with GraphRecursionError (Recursion limit of 100 reached): the built-in general-purpose subagent caps at max_turns=100. Raise it to 150 and bump the default subagent timeout from 900s (15min) to 1800s (30min) so the extra turns have time to run instead of shifting the failure to a timeout. The lead agent recursion_limit (100) is unchanged; the failures are subagent-only. Co-Authored-By: Claude Opus 4.8 * docs(subagents): clarify lead recursion_limit is independent of subagent max_turns Add comments at both lead recursion_limit=100 sites (gateway services + channel manager) explaining the lead's LangGraph super-step budget is separate from subagent depth, so the two 100s are not conflated. Comment-only, no behavior change. Co-Authored-By: Claude Opus 4.8 * docs(subagents): clarify built-in vs custom timeout scope; pin bash max_turns in test Review follow-ups: (1) clarify SubagentConfig docstring + global timeout field/comment that the 1800 default applies to built-in subagents (custom agents keep their own timeout_seconds); (2) pin bash.max_turns==60 in the defaults regression test so the config.example.yaml doc cannot drift; (3) rename test_default_timeout_preserved_when_no_config -> test_explicit_global_timeout_propagates_to_general_purpose since it intentionally exercises an explicit non-default 900. No runtime behavior change. 
Co-Authored-By: Claude Opus 4.8 --------- Co-authored-by: Claude Opus 4.8 --- backend/CLAUDE.md | 2 +- backend/app/channels/manager.py | 5 ++++ backend/app/gateway/services.py | 5 ++++ .../deerflow/config/subagents_config.py | 4 +-- .../subagents/builtins/general_purpose.py | 2 +- .../harness/deerflow/subagents/config.py | 9 +++++-- backend/tests/test_app_config_reload.py | 2 +- backend/tests/test_subagent_timeout_config.py | 25 ++++++++++++++++--- config.example.yaml | 12 +++++---- 9 files changed, 50 insertions(+), 16 deletions(-) diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 82d750740..058a7ec24 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -311,7 +311,7 @@ Proxied through nginx: `/api/langgraph/*` → Gateway LangGraph-compatible runti **Built-in Agents**: `general-purpose` (all tools except `task`) and `bash` (command specialist) **Execution**: Dual thread pool - `_scheduler_pool` (3 workers) + `_execution_pool` (3 workers) -**Concurrency**: `MAX_CONCURRENT_SUBAGENTS = 3` enforced by `SubagentLimitMiddleware` (truncates excess tool calls in `after_model`), 15-minute timeout +**Concurrency**: `MAX_CONCURRENT_SUBAGENTS = 3` enforced by `SubagentLimitMiddleware` (truncates excess tool calls in `after_model`); default subagent timeout `subagents.timeout_seconds=1800` (30 min) and built-in `general-purpose` `max_turns=150` (raised from 100/15-min so deep-research subtasks stop hitting `GraphRecursionError` out of the box) **Flow**: `task()` tool → `SubagentExecutor` → background thread → poll 5s → SSE events → result **Events**: `task_started`, `task_running`, `task_completed`/`task_failed`/`task_timed_out` **Deferred MCP tools** (if `tool_search.enabled`): `SubagentExecutor._build_initial_state` assembles deferral after policy filtering via the shared `assemble_deferred_tools` (fail-closed), appends the `tool_search` tool, injects the `` section into the subagent's `SystemMessage`, and threads the setup to `_create_agent`, which attaches 
`DeferredToolFilterMiddleware` through `build_subagent_runtime_middlewares(deferred_setup=...)`. Subagents thus withhold full MCP schemas until promotion, same as the lead agent; each task run gets a fresh `ThreadState` so promotion is isolated per run diff --git a/backend/app/channels/manager.py b/backend/app/channels/manager.py index f7f6afaad..b129269f9 100644 --- a/backend/app/channels/manager.py +++ b/backend/app/channels/manager.py @@ -42,6 +42,11 @@ DEFAULT_GATEWAY_URL = "http://localhost:8001" DEFAULT_ASSISTANT_ID = "lead_agent" CUSTOM_AGENT_NAME_PATTERN = re.compile(r"^[A-Za-z0-9-]+$") +# Lead-agent recursion budget (LangGraph super-steps for the lead graph only). +# This is independent of subagent depth: a `task()` dispatch runs the whole +# subagent inside ONE lead tools-node step, and subagents enforce their own +# limit via `subagents.max_turns` (see SubagentExecutor). Do not conflate this +# 100 with the general-purpose subagent's max_turns. DEFAULT_RUN_CONFIG: dict[str, Any] = {"recursion_limit": 100} DEFAULT_RUN_CONTEXT: dict[str, Any] = { "thinking_enabled": True, diff --git a/backend/app/gateway/services.py b/backend/app/gateway/services.py index 77a9e8076..04a9f567b 100644 --- a/backend/app/gateway/services.py +++ b/backend/app/gateway/services.py @@ -224,6 +224,11 @@ def build_run_config( the LangGraph Platform-compatible HTTP API and the IM channel path behave identically. """ + # Lead-agent recursion budget (LangGraph super-steps for the lead graph + # only). Independent of subagent depth: a `task()` dispatch runs the whole + # subagent inside ONE lead tools-node step, and subagents enforce their own + # limit via `subagents.max_turns`. Do not conflate this 100 with the + # general-purpose subagent's max_turns. 
config: dict[str, Any] = {"recursion_limit": 100} if request_config: # LangGraph >= 0.6.0 introduced ``context`` as the preferred way to diff --git a/backend/packages/harness/deerflow/config/subagents_config.py b/backend/packages/harness/deerflow/config/subagents_config.py index 026016b21..738616884 100644 --- a/backend/packages/harness/deerflow/config/subagents_config.py +++ b/backend/packages/harness/deerflow/config/subagents_config.py @@ -72,9 +72,9 @@ class SubagentsAppConfig(BaseModel): """Configuration for the subagent system.""" timeout_seconds: int = Field( - default=900, + default=1800, ge=1, - description="Default timeout in seconds for all subagents (default: 900 = 15 minutes)", + description="Default timeout in seconds for built-in subagents (default: 1800 = 30 minutes); custom agents use their own timeout_seconds unless given a per-agent override", ) max_turns: int | None = Field( default=None, diff --git a/backend/packages/harness/deerflow/subagents/builtins/general_purpose.py b/backend/packages/harness/deerflow/subagents/builtins/general_purpose.py index 176194729..c5291d1b5 100644 --- a/backend/packages/harness/deerflow/subagents/builtins/general_purpose.py +++ b/backend/packages/harness/deerflow/subagents/builtins/general_purpose.py @@ -57,5 +57,5 @@ You have access to the same sandbox environment as the parent agent: tools=None, # Inherit all tools from parent disallowed_tools=["task", "ask_clarification", "present_files"], # Prevent nesting and clarification model="inherit", - max_turns=100, + max_turns=150, ) diff --git a/backend/packages/harness/deerflow/subagents/config.py b/backend/packages/harness/deerflow/subagents/config.py index 9081e2df9..a3ae6024d 100644 --- a/backend/packages/harness/deerflow/subagents/config.py +++ b/backend/packages/harness/deerflow/subagents/config.py @@ -20,8 +20,13 @@ class SubagentConfig: skills: Optional list of skill names to load. If None, inherits all enabled skills. If an empty list, no skills are loaded. 
model: Model to use - 'inherit' uses parent's model. - max_turns: Maximum number of agent turns before stopping. - timeout_seconds: Maximum execution time in seconds (default: 900 = 15 minutes). + max_turns: Maximum agent turns before stopping. Built-in agents use the + value set here (general-purpose=150, bash=60) unless the global + ``subagents.max_turns`` is set. + timeout_seconds: Bare fallback execution-time cap. For built-in agents the + effective limit is the global ``subagents.timeout_seconds`` (default + 1800 = 30 min), layered on by the registry; this field's own default + of 900 (15 min) only applies when no differing global value exists. """ name: str diff --git a/backend/tests/test_app_config_reload.py b/backend/tests/test_app_config_reload.py index c0bc00bff..7a7fd02df 100644 --- a/backend/tests/test_app_config_reload.py +++ b/backend/tests/test_app_config_reload.py @@ -316,7 +316,7 @@ def test_get_app_config_resets_singleton_configs_when_sections_removed(tmp_path, assert get_title_config().enabled is True assert get_summarization_config().enabled is False assert get_memory_config().enabled is True - assert get_subagents_app_config().timeout_seconds == 900 + assert get_subagents_app_config().timeout_seconds == 1800 assert get_tool_search_config().enabled is False assert get_guardrails_config().enabled is False assert get_checkpointer_config() is None diff --git a/backend/tests/test_subagent_timeout_config.py b/backend/tests/test_subagent_timeout_config.py index b20bbe7a9..d68e0f99f 100644 --- a/backend/tests/test_subagent_timeout_config.py +++ b/backend/tests/test_subagent_timeout_config.py @@ -97,7 +97,7 @@ class TestSubagentOverrideConfig: class TestSubagentsAppConfigDefaults: def test_default_timeout(self): config = SubagentsAppConfig() - assert config.timeout_seconds == 900 + assert config.timeout_seconds == 1800 def test_default_max_turns_override_is_none(self): config = SubagentsAppConfig() @@ -281,7 +281,7 @@ class TestLoadSubagentsConfig: def 
test_load_empty_dict_uses_defaults(self): load_subagents_config_from_dict({}) cfg = get_subagents_app_config() - assert cfg.timeout_seconds == 900 + assert cfg.timeout_seconds == 1800 assert cfg.max_turns is None assert cfg.agents == {} @@ -319,13 +319,30 @@ class TestRegistryGetSubagentConfig: assert get_subagent_config("general-purpose") is not None assert get_subagent_config("bash") is not None - def test_default_timeout_preserved_when_no_config(self): + def test_explicit_global_timeout_propagates_to_general_purpose(self): + """An explicit global timeout (here the non-default 900) propagates to a + built-in agent, while max_turns still comes from the builtin def (150). + """ from deerflow.subagents.registry import get_subagent_config _reset_subagents_config(timeout_seconds=900) config = get_subagent_config("general-purpose") assert config.timeout_seconds == 900 - assert config.max_turns == 100 + assert config.max_turns == 150 + + def test_builtin_defaults_have_research_headroom(self): + """Out-of-box defaults (no config.yaml subagents section) must give + general-purpose enough turns/time for deep research, which previously + failed with GraphRecursionError at the old max_turns=100 limit. + """ + from deerflow.subagents.registry import get_subagent_config + + load_subagents_config_from_dict({}) # no subagents config -> model defaults + config = get_subagent_config("general-purpose") + assert config.max_turns == 150 + assert config.timeout_seconds == 1800 + # Pin bash too so the config.example.yaml "bash=60" doc cannot drift. 
+ assert get_subagent_config("bash").max_turns == 60 def test_global_timeout_override_applied(self): from deerflow.subagents.registry import get_subagent_config diff --git a/config.example.yaml b/config.example.yaml index 107ee7166..d2affc98a 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -892,16 +892,18 @@ sandbox: # Subagents are background workers delegated tasks by the lead agent # subagents: -# # Default timeout in seconds for all subagents (default: 900 = 15 minutes) -# timeout_seconds: 900 -# # Optional global max-turn override for all subagents +# # Default timeout (seconds) for built-in subagents (default: 1800 = 30 min). +# # Custom agents use their own timeout_seconds (default 900) unless overridden. +# timeout_seconds: 1800 +# # Optional global max-turn override for all subagents. +# # Built-in defaults: general-purpose=150, bash=60. Leave unset to keep them. # # max_turns: 120 # # # Optional per-agent overrides (applies to both built-in and custom agents) # agents: # general-purpose: -# timeout_seconds: 1800 # 30 minutes for complex multi-step tasks -# max_turns: 160 +# timeout_seconds: 2700 # 45 minutes for very long deep-research tasks +# max_turns: 250 # raise above the 150 default for very deep tasks # # model: qwen3:32b # Use a specific model (default: inherit from lead agent) # # skills: # Skill whitelist (default: inherit all enabled skills) # # - web-search