fix: avoid temporary event loops in async subagent execution (#2414)

* fix: avoid temporary event loops in async subagent execution

* Rename isolated subagent loop globals

* Harden isolated subagent loop shutdown and logging

* Sort subagent executor imports

* Format subagent executor

* Remove isolated loop pool from subagent executor

* Format subagent executor cleanup

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
KiteEater
2026-04-30 15:29:17 +08:00
committed by GitHub
parent 88d47f677f
commit 7dea1666ce
2 changed files with 236 additions and 75 deletions
@@ -1,6 +1,7 @@
"""Subagent execution engine."""
import asyncio
import atexit
import logging
import threading
import uuid
@@ -24,6 +25,12 @@ from deerflow.subagents.config import SubagentConfig
logger = logging.getLogger(__name__)
# If this module body has already been executed in the same namespace
# (presumably via importlib.reload — confirm), a shutdown hook from that earlier
# execution is still present in globals(). On a first import the name is not
# defined yet, so the lookup yields None and nothing happens.
_previous_shutdown_isolated_subagent_loop = globals().get("_shutdown_isolated_subagent_loop")
if callable(_previous_shutdown_isolated_subagent_loop):
    # Drop the earlier atexit registration and run the earlier hook right away,
    # before this execution defines and registers its own hook further down.
    atexit.unregister(_previous_shutdown_isolated_subagent_loop)
    _previous_shutdown_isolated_subagent_loop()
class SubagentStatus(Enum):
"""Status of a subagent execution."""
@@ -73,12 +80,92 @@ _background_tasks_lock = threading.Lock()
# Thread pool for background task scheduling and orchestration
_scheduler_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-scheduler-")
# Thread pool for actual subagent execution (with timeout support)
# Larger pool to avoid blocking when scheduler submits execution tasks
# NOTE(review): max_workers is 3 here, the same as the scheduler pool, so the
# "larger pool" remark above does not match the configured size — confirm intent.
_execution_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-exec-")
# Persistent event loop for isolated subagent executions triggered from an
# already-running parent loop. Reusing one long-lived loop avoids creating a
# fresh loop per execution and then closing async resources bound to it.
# Created lazily by _get_isolated_subagent_loop(); None until then and again
# after _shutdown_isolated_subagent_loop() has run.
_isolated_subagent_loop: asyncio.AbstractEventLoop | None = None
# Thread that runs the persistent loop via _run_isolated_subagent_loop().
_isolated_subagent_loop_thread: threading.Thread | None = None
# Event set from inside the loop once it is processing callbacks and cleared
# again when run_forever() returns.
_isolated_subagent_loop_started: threading.Event | None = None
# Guards reads and writes of the three globals above.
_isolated_subagent_loop_lock = threading.Lock()
# Dedicated pool for sync execute() calls made from an already-running event loop.
_isolated_loop_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-isolated-")
def _run_isolated_subagent_loop(loop: asyncio.AbstractEventLoop, started_event: threading.Event) -> None:
    """Thread target that drives the persistent isolated subagent loop.

    Installs ``loop`` as the calling thread's current event loop and then
    blocks in ``run_forever()`` until the loop is stopped.

    Args:
        loop: The event loop to run on the calling thread.
        started_event: Set from inside the loop once it starts processing
            callbacks, and cleared again when ``run_forever()`` returns or
            raises.
    """
    asyncio.set_event_loop(loop)

    # Queue the readiness signal as a loop callback so that waiters are only
    # released once the loop is actually running.
    signal_started = started_event.set
    loop.call_soon(signal_started)

    try:
        loop.run_forever()
    finally:
        # The loop is no longer running; withdraw the readiness signal.
        started_event.clear()
def _shutdown_isolated_subagent_loop() -> None:
    """Tear down the persistent isolated subagent loop, if one exists.

    The module-level loop, thread, and started-event references are reset to
    ``None`` while holding ``_isolated_subagent_loop_lock``. A running loop is
    then asked to stop, its thread is joined for at most one second (unless
    this call is itself executing on that thread), and the loop is closed only
    once both the thread and the loop have actually stopped. Otherwise a
    warning is logged and the loop is left open.
    """
    global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started

    # Detach the shared references first so later lookups no longer see the
    # loop that is being torn down.
    with _isolated_subagent_loop_lock:
        doomed_loop, worker = _isolated_subagent_loop, _isolated_subagent_loop_thread
        _isolated_subagent_loop = _isolated_subagent_loop_thread = _isolated_subagent_loop_started = None

    if doomed_loop is None:
        return

    if doomed_loop.is_running():
        # The loop runs on another thread, so stop() has to be scheduled
        # through the thread-safe entry point.
        doomed_loop.call_soon_threadsafe(doomed_loop.stop)

    # Joining the current thread would raise, hence the identity check.
    if worker is not None and worker.is_alive() and worker is not threading.current_thread():
        worker.join(timeout=1)

    worker_finished = worker is None or not worker.is_alive()
    loop_halted = not doomed_loop.is_running()

    if doomed_loop.is_closed():
        return

    if worker_finished and loop_halted:
        doomed_loop.close()
        return

    # close() is skipped while the loop may still be running; report the
    # current state instead.
    logger.warning(
        "Skipping close of isolated subagent loop because shutdown did not complete within timeout (thread_alive=%s, loop_running=%s)",
        worker is not None and worker.is_alive(),
        doomed_loop.is_running(),
    )


# Run the shutdown hook when the interpreter exits.
atexit.register(_shutdown_isolated_subagent_loop)
def _get_isolated_subagent_loop() -> asyncio.AbstractEventLoop:
    """Return the persistent event loop used by isolated subagent executions.

    The cached loop is reused while it is open, running, and its thread is
    alive. Otherwise a new loop is started on a daemon thread named
    ``subagent-persistent-loop`` and cached in the module globals. The whole
    check-and-create sequence runs while holding
    ``_isolated_subagent_loop_lock``.

    Returns:
        The running persistent event loop.

    Raises:
        RuntimeError: If the new loop thread does not signal that the loop is
            running within 5 seconds.
    """
    global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started

    with _isolated_subagent_loop_lock:
        cached_loop = _isolated_subagent_loop
        cached_thread = _isolated_subagent_loop_thread
        thread_is_alive = cached_thread is not None and cached_thread.is_alive()
        loop_is_usable = cached_loop is not None and not cached_loop.is_closed() and cached_loop.is_running() and thread_is_alive
        if loop_is_usable:
            return cached_loop

        # A cached loop that has stopped and whose thread has exited can never
        # be reused. Close it before replacing it so it is not leaked. This is
        # best-effort: a failure here must not prevent starting a new loop.
        if cached_loop is not None and not thread_is_alive and not cached_loop.is_running() and not cached_loop.is_closed():
            try:
                cached_loop.close()
            except Exception:
                logger.debug("Failed to close stale isolated subagent loop", exc_info=True)

        loop = asyncio.new_event_loop()
        started_event = threading.Event()
        thread = threading.Thread(
            target=_run_isolated_subagent_loop,
            args=(loop, started_event),
            name="subagent-persistent-loop",
            daemon=True,
        )
        try:
            thread.start()
        except BaseException:
            # The thread never ran, so nothing else owns the new loop.
            loop.close()
            raise

        if not started_event.wait(timeout=5):
            loop.call_soon_threadsafe(loop.stop)
            thread.join(timeout=1)
            if thread.is_alive() or loop.is_running():
                # close() raises on a running loop, which would mask the
                # timeout error below; leave the loop open instead.
                logger.warning(
                    "Skipping close of isolated subagent loop because startup did not complete within timeout (thread_alive=%s, loop_running=%s)",
                    thread.is_alive(),
                    loop.is_running(),
                )
            else:
                loop.close()
            raise RuntimeError("Timed out starting isolated subagent event loop")

        _isolated_subagent_loop = loop
        _isolated_subagent_loop_thread = thread
        _isolated_subagent_loop_started = started_event
        return loop
def _filter_tools(
@@ -453,42 +540,39 @@ class SubagentExecutor:
return result
def _execute_in_isolated_loop(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
"""Execute the subagent in a completely fresh event loop.
"""Execute the subagent on the persistent isolated event loop.
This method is designed to run in a separate thread to ensure complete
isolation from any parent event loop, preventing conflicts with asyncio
primitives that may be bound to the parent loop (e.g., httpx clients).
This method is used by the sync ``execute()`` path when the caller is
already running inside an event loop. Because ``execute()`` is a sync
API, this path blocks the caller while the actual coroutine runs on the
long-lived isolated loop. Reusing that loop keeps shared async clients
from being tied to a short-lived loop that gets closed per execution.
"""
future: Future[SubagentResult] | None = None
try:
previous_loop = asyncio.get_event_loop()
except RuntimeError:
previous_loop = None
# Create and set a new event loop for this thread
loop = asyncio.new_event_loop()
try:
asyncio.set_event_loop(loop)
return loop.run_until_complete(self._aexecute(task, result_holder))
finally:
try:
pending = asyncio.all_tasks(loop)
if pending:
for task_obj in pending:
task_obj.cancel()
loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
loop.run_until_complete(loop.shutdown_asyncgens())
loop.run_until_complete(loop.shutdown_default_executor())
except Exception:
future = asyncio.run_coroutine_threadsafe(
self._aexecute(task, result_holder),
_get_isolated_subagent_loop(),
)
return future.result(timeout=self.config.timeout_seconds)
except FuturesTimeoutError:
if result_holder is not None:
result_holder.cancel_event.set()
if future is not None:
future.cancel()
raise
except Exception:
if future is None:
logger.debug(
f"[trace={self.trace_id}] Failed while cleaning up isolated event loop for subagent {self.config.name}",
f"[trace={self.trace_id}] Failed to submit subagent {self.config.name} to the isolated event loop",
exc_info=True,
)
finally:
try:
loop.close()
finally:
asyncio.set_event_loop(previous_loop)
else:
logger.debug(
f"[trace={self.trace_id}] Subagent {self.config.name} failed while executing on the isolated event loop",
exc_info=True,
)
raise
def execute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
"""Execute a task synchronously (wrapper around async execution).
@@ -497,9 +581,9 @@ class SubagentExecutor:
asynchronous tools (like MCP tools) to be used within the thread pool.
When called from within an already-running event loop (e.g., when the
parent agent is async), this method isolates the subagent execution in
a separate thread to avoid event loop conflicts with shared async
primitives like httpx clients.
parent agent is async), this method synchronously waits on the
persistent isolated loop to avoid event loop conflicts with shared
async primitives like httpx clients.
Args:
task: The task description for the subagent.
@@ -515,9 +599,8 @@ class SubagentExecutor:
loop = None
if loop is not None and loop.is_running():
logger.debug(f"[trace={self.trace_id}] Subagent {self.config.name} detected running event loop, using isolated thread")
future = _isolated_loop_pool.submit(self._execute_in_isolated_loop, task, result_holder)
return future.result()
logger.debug(f"[trace={self.trace_id}] Subagent {self.config.name} detected running event loop, using isolated loop")
return self._execute_in_isolated_loop(task, result_holder)
# Standard path: no running event loop, use asyncio.run
return asyncio.run(self._aexecute(task, result_holder))
@@ -571,9 +654,12 @@ class SubagentExecutor:
result_holder = _background_tasks[task_id]
try:
# Submit execution to execution pool with timeout
# Pass result_holder so execute() can update it in real-time
execution_future: Future = _execution_pool.submit(self.execute, task, result_holder)
# Submit execution directly to the persistent isolated loop so the
# background path does not create a temporary loop via execute().
execution_future = asyncio.run_coroutine_threadsafe(
self._aexecute(task, result_holder),
_get_isolated_subagent_loop(),
)
try:
# Wait for execution with timeout
exec_result = execution_future.result(timeout=self.config.timeout_seconds)