mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-23 16:35:59 +00:00
fix: avoid temporary event loops in async subagent execution (#2414)
* fix: avoid temporary event loops in async subagent execution * Rename isolated subagent loop globals * Harden isolated subagent loop shutdown and logging * Sort subagent executor imports * Format subagent executor * Remove isolated loop pool from subagent executor * Format subagent executor cleanup --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
"""Subagent execution engine."""
|
||||
|
||||
import asyncio
|
||||
import atexit
|
||||
import logging
|
||||
import threading
|
||||
import uuid
|
||||
@@ -24,6 +25,12 @@ from deerflow.subagents.config import SubagentConfig
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_previous_shutdown_isolated_subagent_loop = globals().get("_shutdown_isolated_subagent_loop")
|
||||
if callable(_previous_shutdown_isolated_subagent_loop):
|
||||
atexit.unregister(_previous_shutdown_isolated_subagent_loop)
|
||||
_previous_shutdown_isolated_subagent_loop()
|
||||
|
||||
|
||||
class SubagentStatus(Enum):
|
||||
"""Status of a subagent execution."""
|
||||
|
||||
@@ -73,12 +80,92 @@ _background_tasks_lock = threading.Lock()
|
||||
# Thread pool for background task scheduling and orchestration
|
||||
_scheduler_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-scheduler-")
|
||||
|
||||
# Thread pool for actual subagent execution (with timeout support)
|
||||
# Larger pool to avoid blocking when scheduler submits execution tasks
|
||||
_execution_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-exec-")
|
||||
# Persistent event loop for isolated subagent executions triggered from an
|
||||
# already-running parent loop. Reusing one long-lived loop avoids creating a
|
||||
# fresh loop per execution and then closing async resources bound to it.
|
||||
_isolated_subagent_loop: asyncio.AbstractEventLoop | None = None
|
||||
_isolated_subagent_loop_thread: threading.Thread | None = None
|
||||
_isolated_subagent_loop_started: threading.Event | None = None
|
||||
_isolated_subagent_loop_lock = threading.Lock()
|
||||
|
||||
# Dedicated pool for sync execute() calls made from an already-running event loop.
|
||||
_isolated_loop_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-isolated-")
|
||||
|
||||
def _run_isolated_subagent_loop(
|
||||
loop: asyncio.AbstractEventLoop,
|
||||
started_event: threading.Event,
|
||||
) -> None:
|
||||
"""Run the persistent isolated subagent loop in a dedicated daemon thread."""
|
||||
asyncio.set_event_loop(loop)
|
||||
loop.call_soon(started_event.set)
|
||||
try:
|
||||
loop.run_forever()
|
||||
finally:
|
||||
started_event.clear()
|
||||
|
||||
|
||||
def _shutdown_isolated_subagent_loop() -> None:
|
||||
"""Stop and close the persistent isolated subagent loop."""
|
||||
global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started
|
||||
|
||||
with _isolated_subagent_loop_lock:
|
||||
loop = _isolated_subagent_loop
|
||||
thread = _isolated_subagent_loop_thread
|
||||
_isolated_subagent_loop = None
|
||||
_isolated_subagent_loop_thread = None
|
||||
_isolated_subagent_loop_started = None
|
||||
|
||||
if loop is None:
|
||||
return
|
||||
|
||||
if loop.is_running():
|
||||
loop.call_soon_threadsafe(loop.stop)
|
||||
|
||||
if thread is not None and thread.is_alive() and thread is not threading.current_thread():
|
||||
thread.join(timeout=1)
|
||||
|
||||
thread_stopped = thread is None or not thread.is_alive()
|
||||
loop_stopped = not loop.is_running()
|
||||
|
||||
if not loop.is_closed():
|
||||
if thread_stopped and loop_stopped:
|
||||
loop.close()
|
||||
else:
|
||||
logger.warning(
|
||||
"Skipping close of isolated subagent loop because shutdown did not complete within timeout (thread_alive=%s, loop_running=%s)",
|
||||
thread is not None and thread.is_alive(),
|
||||
loop.is_running(),
|
||||
)
|
||||
|
||||
|
||||
atexit.register(_shutdown_isolated_subagent_loop)
|
||||
|
||||
|
||||
def _get_isolated_subagent_loop() -> asyncio.AbstractEventLoop:
|
||||
"""Return the persistent event loop used by isolated subagent executions."""
|
||||
global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started
|
||||
with _isolated_subagent_loop_lock:
|
||||
thread_is_alive = _isolated_subagent_loop_thread is not None and _isolated_subagent_loop_thread.is_alive()
|
||||
loop_is_usable = _isolated_subagent_loop is not None and not _isolated_subagent_loop.is_closed() and _isolated_subagent_loop.is_running() and thread_is_alive
|
||||
|
||||
if not loop_is_usable:
|
||||
loop = asyncio.new_event_loop()
|
||||
started_event = threading.Event()
|
||||
thread = threading.Thread(
|
||||
target=_run_isolated_subagent_loop,
|
||||
args=(loop, started_event),
|
||||
name="subagent-persistent-loop",
|
||||
daemon=True,
|
||||
)
|
||||
thread.start()
|
||||
if not started_event.wait(timeout=5):
|
||||
loop.call_soon_threadsafe(loop.stop)
|
||||
thread.join(timeout=1)
|
||||
loop.close()
|
||||
raise RuntimeError("Timed out starting isolated subagent event loop")
|
||||
_isolated_subagent_loop = loop
|
||||
_isolated_subagent_loop_thread = thread
|
||||
_isolated_subagent_loop_started = started_event
|
||||
|
||||
return _isolated_subagent_loop
|
||||
|
||||
|
||||
def _filter_tools(
|
||||
@@ -453,42 +540,39 @@ class SubagentExecutor:
|
||||
return result
|
||||
|
||||
def _execute_in_isolated_loop(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
|
||||
"""Execute the subagent in a completely fresh event loop.
|
||||
"""Execute the subagent on the persistent isolated event loop.
|
||||
|
||||
This method is designed to run in a separate thread to ensure complete
|
||||
isolation from any parent event loop, preventing conflicts with asyncio
|
||||
primitives that may be bound to the parent loop (e.g., httpx clients).
|
||||
This method is used by the sync ``execute()`` path when the caller is
|
||||
already running inside an event loop. Because ``execute()`` is a sync
|
||||
API, this path blocks the caller while the actual coroutine runs on the
|
||||
long-lived isolated loop. Reusing that loop keeps shared async clients
|
||||
from being tied to a short-lived loop that gets closed per execution.
|
||||
"""
|
||||
future: Future[SubagentResult] | None = None
|
||||
try:
|
||||
previous_loop = asyncio.get_event_loop()
|
||||
except RuntimeError:
|
||||
previous_loop = None
|
||||
|
||||
# Create and set a new event loop for this thread
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
asyncio.set_event_loop(loop)
|
||||
return loop.run_until_complete(self._aexecute(task, result_holder))
|
||||
finally:
|
||||
try:
|
||||
pending = asyncio.all_tasks(loop)
|
||||
if pending:
|
||||
for task_obj in pending:
|
||||
task_obj.cancel()
|
||||
loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
|
||||
|
||||
loop.run_until_complete(loop.shutdown_asyncgens())
|
||||
loop.run_until_complete(loop.shutdown_default_executor())
|
||||
except Exception:
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._aexecute(task, result_holder),
|
||||
_get_isolated_subagent_loop(),
|
||||
)
|
||||
return future.result(timeout=self.config.timeout_seconds)
|
||||
except FuturesTimeoutError:
|
||||
if result_holder is not None:
|
||||
result_holder.cancel_event.set()
|
||||
if future is not None:
|
||||
future.cancel()
|
||||
raise
|
||||
except Exception:
|
||||
if future is None:
|
||||
logger.debug(
|
||||
f"[trace={self.trace_id}] Failed while cleaning up isolated event loop for subagent {self.config.name}",
|
||||
f"[trace={self.trace_id}] Failed to submit subagent {self.config.name} to the isolated event loop",
|
||||
exc_info=True,
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
loop.close()
|
||||
finally:
|
||||
asyncio.set_event_loop(previous_loop)
|
||||
else:
|
||||
logger.debug(
|
||||
f"[trace={self.trace_id}] Subagent {self.config.name} failed while executing on the isolated event loop",
|
||||
exc_info=True,
|
||||
)
|
||||
raise
|
||||
|
||||
def execute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
|
||||
"""Execute a task synchronously (wrapper around async execution).
|
||||
@@ -497,9 +581,9 @@ class SubagentExecutor:
|
||||
asynchronous tools (like MCP tools) to be used within the thread pool.
|
||||
|
||||
When called from within an already-running event loop (e.g., when the
|
||||
parent agent is async), this method isolates the subagent execution in
|
||||
a separate thread to avoid event loop conflicts with shared async
|
||||
primitives like httpx clients.
|
||||
parent agent is async), this method synchronously waits on the
|
||||
persistent isolated loop to avoid event loop conflicts with shared
|
||||
async primitives like httpx clients.
|
||||
|
||||
Args:
|
||||
task: The task description for the subagent.
|
||||
@@ -515,9 +599,8 @@ class SubagentExecutor:
|
||||
loop = None
|
||||
|
||||
if loop is not None and loop.is_running():
|
||||
logger.debug(f"[trace={self.trace_id}] Subagent {self.config.name} detected running event loop, using isolated thread")
|
||||
future = _isolated_loop_pool.submit(self._execute_in_isolated_loop, task, result_holder)
|
||||
return future.result()
|
||||
logger.debug(f"[trace={self.trace_id}] Subagent {self.config.name} detected running event loop, using isolated loop")
|
||||
return self._execute_in_isolated_loop(task, result_holder)
|
||||
|
||||
# Standard path: no running event loop, use asyncio.run
|
||||
return asyncio.run(self._aexecute(task, result_holder))
|
||||
@@ -571,9 +654,12 @@ class SubagentExecutor:
|
||||
result_holder = _background_tasks[task_id]
|
||||
|
||||
try:
|
||||
# Submit execution to execution pool with timeout
|
||||
# Pass result_holder so execute() can update it in real-time
|
||||
execution_future: Future = _execution_pool.submit(self.execute, task, result_holder)
|
||||
# Submit execution directly to the persistent isolated loop so the
|
||||
# background path does not create a temporary loop via execute().
|
||||
execution_future = asyncio.run_coroutine_threadsafe(
|
||||
self._aexecute(task, result_holder),
|
||||
_get_isolated_subagent_loop(),
|
||||
)
|
||||
try:
|
||||
# Wait for execution with timeout
|
||||
exec_result = execution_future.result(timeout=self.config.timeout_seconds)
|
||||
|
||||
Reference in New Issue
Block a user