mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-10 17:35:57 +00:00
* fix(sandbox): close AioSandbox HTTP client during provider teardown (#2872) AioSandbox allocates a host-side agent_sandbox client (wrapping an httpx.Client) in __init__, but AioSandboxProvider.release/destroy/shutdown only popped provider state and tore down the backend container — the client/transport owned by each cached AioSandbox was never explicitly closed, accumulating unreclaimed sockets in long-running services. - Add AioSandbox.close(): best-effort, idempotent close of the wrapped httpx_client (falls back to top-level client.close()); errors are logged but never raised so backend cleanup is never blocked. - AioSandboxProvider.release()/destroy() now close the cached AioSandbox before dropping it; shutdown() inherits this via destroy(). * fix(sandbox): close the real httpx.Client owned by AioSandbox (#2872) The previous close() only walked one level (wrapper.httpx_client), which resolves to the Fern-generated HttpClient wrapper that has no close(). The real socket-owning httpx.Client lives one level deeper at _client_wrapper.httpx_client.httpx_client, so the close path never fired and host-side sockets still leaked. Resolve the real httpx.Client with graceful degradation; clear self._client under the lock for use-after-close and concurrent double-close safety; mark provider release()/destroy() try/except as defense-in-depth; rewrite TestClose against the real nested structure to lock down the original no-op bug.
This commit is contained in:
@@ -39,11 +39,63 @@ class AioSandbox(Sandbox):
|
||||
self._client = AioSandboxClient(base_url=base_url, timeout=600)
|
||||
self._home_dir = home_dir
|
||||
self._lock = threading.Lock()
|
||||
self._closed = False
|
||||
|
||||
@property
|
||||
def base_url(self) -> str:
|
||||
return self._base_url
|
||||
|
||||
def close(self) -> None:
    """Best-effort close of the host-side HTTP client owned by this sandbox.

    The agent_sandbox SDK is Fern-generated and exposes no ``close()`` /
    ``__exit__``, so we reach the socket-owning ``httpx.Client`` explicitly
    through its attribute chain::

        client._client_wrapper  -> SyncClientWrapper
            .httpx_client       -> Fern HttpClient (a wrapper, NOT httpx.Client)
                .httpx_client   -> httpx.Client  <- the real socket owner

    Closing it releases pooled sockets so long-running provider lifecycles
    do not accumulate unreclaimed host-side resources (#2872).

    Resolution is most-specific-first with graceful degradation: the first
    object in ``(httpx.Client, Fern HttpClient, top-level client)`` that
    exposes a *callable* ``close`` is used, so a future SDK that adds a
    top-level ``close()`` is picked up automatically without changing this
    code. An attribute named ``close`` that is not callable is skipped
    rather than selected, so it cannot mask a closable object further up
    the chain.

    Idempotent, thread-safe, and non-fatal: the closed flag and the client
    reference are updated under ``self._lock``, and failures raised by the
    underlying ``close()`` are logged and swallowed so provider/backend
    cleanup is never blocked.

    Returns:
        None. Never raises for errors coming from the underlying client.
    """
    with self._lock:
        if self._closed:
            return
        self._closed = True
        client = self._client
        # Drop the reference under the lock for use-after-close safety: any
        # later command on this instance fails loudly instead of reusing a
        # half-closed client.
        self._client = None

    if client is None:
        return

    # Walk from the real httpx.Client up to the top-level client. getattr()
    # with a None default keeps every hop safe when an intermediate layer is
    # missing (getattr(None, ...) simply yields the default).
    wrapper = getattr(client, "_client_wrapper", None)
    fern_http = getattr(wrapper, "httpx_client", None)
    real_httpx = getattr(fern_http, "httpx_client", None)

    # Pick the first candidate whose ``close`` is actually callable. This also
    # filters out None candidates, because getattr(None, "close", None) is None.
    target = next(
        (c for c in (real_httpx, fern_http, client) if callable(getattr(c, "close", None))),
        None,
    )
    if target is None:
        logger.debug("AioSandbox %s: no closable client found, nothing to release", self.id)
        return

    try:
        target.close()
    except Exception as e:
        # Deliberate best-effort: teardown must continue even if the client
        # refuses to close, so the error is reported but not re-raised.
        logger.warning("Error closing AioSandbox client for %s: %s", self.id, e)
|
||||
|
||||
@property
|
||||
def home_dir(self) -> str:
|
||||
"""Get the home directory inside the sandbox."""
|
||||
|
||||
@@ -790,14 +790,20 @@ class AioSandboxProvider(SandboxProvider):
|
||||
thread on its next turn without a cold-start. The container will only be
|
||||
stopped when the replicas limit forces eviction or during shutdown.
|
||||
|
||||
The host-side HTTP client owned by the cached ``AioSandbox`` instance is
|
||||
closed before the instance is dropped (#2872). The warm-pool entry only
|
||||
stores ``SandboxInfo``, so a fresh ``AioSandbox`` (and a fresh client)
|
||||
is constructed if the container is later reclaimed.
|
||||
|
||||
Args:
|
||||
sandbox_id: The ID of the sandbox to release.
|
||||
"""
|
||||
info = None
|
||||
sandbox = None
|
||||
thread_ids_to_remove: list[str] = []
|
||||
|
||||
with self._lock:
|
||||
self._sandboxes.pop(sandbox_id, None)
|
||||
sandbox = self._sandboxes.pop(sandbox_id, None)
|
||||
info = self._sandbox_infos.pop(sandbox_id, None)
|
||||
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||
for tid in thread_ids_to_remove:
|
||||
@@ -807,6 +813,15 @@ class AioSandboxProvider(SandboxProvider):
|
||||
if info and sandbox_id not in self._warm_pool:
|
||||
self._warm_pool[sandbox_id] = (info, time.time())
|
||||
|
||||
if sandbox is not None:
|
||||
# Defense-in-depth: close() already swallows its own errors; this
|
||||
# guard only protects against a future close() that misbehaves, so
|
||||
# host-side client cleanup can never block parking in the warm pool.
|
||||
try:
|
||||
sandbox.close()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error closing sandbox {sandbox_id} during release: {e}")
|
||||
|
||||
logger.info(f"Released sandbox {sandbox_id} to warm pool (container still running)")
|
||||
|
||||
def destroy(self, sandbox_id: str) -> None:
|
||||
@@ -815,14 +830,19 @@ class AioSandboxProvider(SandboxProvider):
|
||||
Unlike release(), this actually stops the container. Use this for
|
||||
explicit cleanup, capacity-driven eviction, or shutdown.
|
||||
|
||||
The host-side HTTP client owned by the cached ``AioSandbox`` instance is
|
||||
closed alongside backend/container destruction so no client/socket
|
||||
resources leak (#2872).
|
||||
|
||||
Args:
|
||||
sandbox_id: The ID of the sandbox to destroy.
|
||||
"""
|
||||
info = None
|
||||
sandbox = None
|
||||
thread_ids_to_remove: list[str] = []
|
||||
|
||||
with self._lock:
|
||||
self._sandboxes.pop(sandbox_id, None)
|
||||
sandbox = self._sandboxes.pop(sandbox_id, None)
|
||||
info = self._sandbox_infos.pop(sandbox_id, None)
|
||||
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||
for tid in thread_ids_to_remove:
|
||||
@@ -834,6 +854,15 @@ class AioSandboxProvider(SandboxProvider):
|
||||
else:
|
||||
self._warm_pool.pop(sandbox_id, None)
|
||||
|
||||
if sandbox is not None:
|
||||
# Defense-in-depth: close() already swallows its own errors; this
|
||||
# guard only protects against a future close() that misbehaves, so
|
||||
# host-side client cleanup can never block container destruction.
|
||||
try:
|
||||
sandbox.close()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error closing sandbox {sandbox_id} during destroy: {e}")
|
||||
|
||||
if info:
|
||||
self._backend.destroy(info)
|
||||
logger.info(f"Destroyed sandbox {sandbox_id}")
|
||||
|
||||
Reference in New Issue
Block a user