mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-22 07:56:48 +00:00
fix(sandbox): auto-restart crashed containers transparently (#2788)
When a sandbox container crashes (e.g. due to an internal error), the agent enters a connection-refused loop because AioSandboxProvider.get() returns a cached but dead sandbox object. Add a liveness check in get() that detects crashed containers via backend.is_alive() and evicts them from all caches, allowing ensure_sandbox_initialized() to transparently recreate a fresh container on the next acquire(). The behavior is controlled by a new config option (default: true). Set to false to skip health checks and preserve the old behavior of returning stale cached sandboxes. Closes #2788
This commit is contained in:
@@ -80,6 +80,7 @@ class AioSandboxProvider(SandboxProvider):
|
||||
port: 8080 # Base port for local containers
|
||||
container_prefix: deer-flow-sandbox
|
||||
idle_timeout: 600 # Idle timeout in seconds (0 to disable)
|
||||
auto_restart: true # Restart crashed containers automatically
|
||||
replicas: 3 # Max concurrent sandbox containers (LRU eviction when exceeded)
|
||||
mounts: # Volume mounts for local containers
|
||||
- host_path: /path/on/host
|
||||
@@ -164,12 +165,14 @@ class AioSandboxProvider(SandboxProvider):
|
||||
|
||||
idle_timeout = getattr(sandbox_config, "idle_timeout", None)
|
||||
replicas = getattr(sandbox_config, "replicas", None)
|
||||
auto_restart = getattr(sandbox_config, "auto_restart", True)
|
||||
|
||||
return {
|
||||
"image": sandbox_config.image or DEFAULT_IMAGE,
|
||||
"port": sandbox_config.port or DEFAULT_PORT,
|
||||
"container_prefix": sandbox_config.container_prefix or DEFAULT_CONTAINER_PREFIX,
|
||||
"idle_timeout": idle_timeout if idle_timeout is not None else DEFAULT_IDLE_TIMEOUT,
|
||||
"auto_restart": auto_restart,
|
||||
"replicas": replicas if replicas is not None else DEFAULT_REPLICAS,
|
||||
"mounts": sandbox_config.mounts or [],
|
||||
"environment": self._resolve_env_vars(sandbox_config.environment or {}),
|
||||
@@ -608,16 +611,35 @@ class AioSandboxProvider(SandboxProvider):
|
||||
def get(self, sandbox_id: str) -> Sandbox | None:
|
||||
"""Get a sandbox by ID. Updates last activity timestamp.
|
||||
|
||||
When ``auto_restart`` is enabled (the default), the container's liveness
|
||||
is verified on each lookup. If the underlying container has crashed, the
|
||||
sandbox is evicted from all caches so that the next ``acquire()`` call will
|
||||
transparently create a fresh container.
|
||||
|
||||
Args:
|
||||
sandbox_id: The ID of the sandbox.
|
||||
|
||||
Returns:
|
||||
The sandbox instance if found, None otherwise.
|
||||
The sandbox instance if found and alive, None otherwise.
|
||||
"""
|
||||
with self._lock:
|
||||
sandbox = self._sandboxes.get(sandbox_id)
|
||||
if sandbox is not None:
|
||||
self._last_activity[sandbox_id] = time.time()
|
||||
# Auto-restart: detect crashed containers and evict from cache
|
||||
# so the next acquire() transparently recreates them.
|
||||
if self._config.get("auto_restart", True):
|
||||
info = self._sandbox_infos.get(sandbox_id)
|
||||
if info and not self._backend.is_alive(info):
|
||||
logger.warning(f"Sandbox {sandbox_id} container is not alive, evicting from cache for auto-restart")
|
||||
self._sandboxes.pop(sandbox_id, None)
|
||||
self._sandbox_infos.pop(sandbox_id, None)
|
||||
self._last_activity.pop(sandbox_id, None)
|
||||
thread_ids = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||
for tid in thread_ids:
|
||||
del self._thread_sandboxes[tid]
|
||||
return None
|
||||
return sandbox
|
||||
return sandbox
|
||||
|
||||
def release(self, sandbox_id: str) -> None:
|
||||
|
||||
@@ -23,6 +23,9 @@ class SandboxConfig(BaseModel):
|
||||
replicas: Maximum number of concurrent sandbox containers (default: 3). When the limit is reached the least-recently-used sandbox is evicted to make room.
|
||||
container_prefix: Prefix for container names (default: deer-flow-sandbox)
|
||||
idle_timeout: Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.
|
||||
auto_restart: Automatically restart sandbox containers that have crashed (default: true). When a tool call
|
||||
detects the container is no longer alive, the sandbox is evicted from cache and transparently recreated
|
||||
on the next acquire. Set to false to disable.
|
||||
mounts: List of volume mounts to share directories with the container
|
||||
environment: Environment variables to inject into the container (values starting with $ are resolved from host env)
|
||||
"""
|
||||
@@ -55,6 +58,10 @@ class SandboxConfig(BaseModel):
|
||||
default=None,
|
||||
description="Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.",
|
||||
)
|
||||
auto_restart: bool = Field(
|
||||
default=True,
|
||||
description="Automatically restart sandbox containers that have crashed. When a tool call detects the container is no longer alive, the sandbox is evicted from cache and transparently recreated on the next acquire.",
|
||||
)
|
||||
mounts: list[VolumeMountConfig] = Field(
|
||||
default_factory=list,
|
||||
description="List of volume mounts to share directories between host and container",
|
||||
|
||||
Reference in New Issue
Block a user