Compare commits
2 Commits
v2.0-m1-rc1
...
fix-2788
| Author | SHA1 | Date | |
|---|---|---|---|
| dad3997459 | |||
| b67c2a4e56 |
@@ -80,6 +80,7 @@ class AioSandboxProvider(SandboxProvider):
|
||||
port: 8080 # Base port for local containers
|
||||
container_prefix: deer-flow-sandbox
|
||||
idle_timeout: 600 # Idle timeout in seconds (0 to disable)
|
||||
auto_restart: true # Restart crashed containers automatically
|
||||
replicas: 3 # Max concurrent sandbox containers (LRU eviction when exceeded)
|
||||
mounts: # Volume mounts for local containers
|
||||
- host_path: /path/on/host
|
||||
@@ -164,12 +165,14 @@ class AioSandboxProvider(SandboxProvider):
|
||||
|
||||
idle_timeout = getattr(sandbox_config, "idle_timeout", None)
|
||||
replicas = getattr(sandbox_config, "replicas", None)
|
||||
auto_restart = getattr(sandbox_config, "auto_restart", True)
|
||||
|
||||
return {
|
||||
"image": sandbox_config.image or DEFAULT_IMAGE,
|
||||
"port": sandbox_config.port or DEFAULT_PORT,
|
||||
"container_prefix": sandbox_config.container_prefix or DEFAULT_CONTAINER_PREFIX,
|
||||
"idle_timeout": idle_timeout if idle_timeout is not None else DEFAULT_IDLE_TIMEOUT,
|
||||
"auto_restart": auto_restart,
|
||||
"replicas": replicas if replicas is not None else DEFAULT_REPLICAS,
|
||||
"mounts": sandbox_config.mounts or [],
|
||||
"environment": self._resolve_env_vars(sandbox_config.environment or {}),
|
||||
@@ -608,18 +611,58 @@ class AioSandboxProvider(SandboxProvider):
|
||||
def get(self, sandbox_id: str) -> Sandbox | None:
|
||||
"""Get a sandbox by ID. Updates last activity timestamp.
|
||||
|
||||
When ``auto_restart`` is enabled (the default), the container's liveness
|
||||
is verified on each lookup. If the underlying container has crashed, the
|
||||
sandbox is evicted from all caches so that the next ``acquire()`` call will
|
||||
transparently create a fresh container.
|
||||
|
||||
Args:
|
||||
sandbox_id: The ID of the sandbox.
|
||||
|
||||
Returns:
|
||||
The sandbox instance if found, None otherwise.
|
||||
The sandbox instance if found and alive, None otherwise.
|
||||
"""
|
||||
with self._lock:
|
||||
sandbox = self._sandboxes.get(sandbox_id)
|
||||
if sandbox is not None:
|
||||
self._last_activity[sandbox_id] = time.time()
|
||||
if sandbox is None:
|
||||
return None
|
||||
self._last_activity[sandbox_id] = time.time()
|
||||
auto_restart = self._config.get("auto_restart", True)
|
||||
info = self._sandbox_infos.get(sandbox_id) if auto_restart else None
|
||||
|
||||
if not info:
|
||||
return sandbox
|
||||
|
||||
if self._backend.is_alive(info):
|
||||
return sandbox
|
||||
|
||||
info_to_destroy = None
|
||||
with self._lock:
|
||||
current_sandbox = self._sandboxes.get(sandbox_id)
|
||||
current_info = self._sandbox_infos.get(sandbox_id)
|
||||
if current_sandbox is None:
|
||||
return None
|
||||
if current_info is not info:
|
||||
self._last_activity[sandbox_id] = time.time()
|
||||
return current_sandbox
|
||||
|
||||
logger.warning(f"Sandbox {sandbox_id} container is not alive, evicting from cache for auto-restart")
|
||||
self._sandboxes.pop(sandbox_id, None)
|
||||
self._sandbox_infos.pop(sandbox_id, None)
|
||||
self._last_activity.pop(sandbox_id, None)
|
||||
self._warm_pool.pop(sandbox_id, None)
|
||||
thread_ids = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||
for tid in thread_ids:
|
||||
del self._thread_sandboxes[tid]
|
||||
info_to_destroy = info
|
||||
|
||||
if info_to_destroy:
|
||||
try:
|
||||
self._backend.destroy(info_to_destroy)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to cleanup dead sandbox {sandbox_id}: {e}")
|
||||
return None
|
||||
|
||||
def release(self, sandbox_id: str) -> None:
|
||||
"""Release a sandbox from active use into the warm pool.
|
||||
|
||||
|
||||
@@ -23,6 +23,9 @@ class SandboxConfig(BaseModel):
|
||||
replicas: Maximum number of concurrent sandbox containers (default: 3). When the limit is reached the least-recently-used sandbox is evicted to make room.
|
||||
container_prefix: Prefix for container names (default: deer-flow-sandbox)
|
||||
idle_timeout: Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.
|
||||
auto_restart: Automatically restart sandbox containers that have crashed (default: true). When a tool call
|
||||
detects the container is no longer alive, the sandbox is evicted from cache and transparently recreated
|
||||
on the next acquire. Set to false to disable.
|
||||
mounts: List of volume mounts to share directories with the container
|
||||
environment: Environment variables to inject into the container (values starting with $ are resolved from host env)
|
||||
"""
|
||||
@@ -55,6 +58,10 @@ class SandboxConfig(BaseModel):
|
||||
default=None,
|
||||
description="Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.",
|
||||
)
|
||||
auto_restart: bool = Field(
|
||||
default=True,
|
||||
description="Automatically restart sandbox containers that have crashed. When a tool call detects the container is no longer alive, the sandbox is evicted from cache and transparently recreated on the next acquire.",
|
||||
)
|
||||
mounts: list[VolumeMountConfig] = Field(
|
||||
default_factory=list,
|
||||
description="List of volume mounts to share directories between host and container",
|
||||
|
||||
@@ -0,0 +1,210 @@
|
||||
"""Tests for AioSandboxProvider auto-restart of crashed containers."""
|
||||
|
||||
import importlib
|
||||
import threading
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
||||
def _import_provider():
|
||||
return importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")
|
||||
|
||||
|
||||
def _make_provider(*, auto_restart=True, alive=True):
|
||||
"""Build a minimal AioSandboxProvider with a mock backend.
|
||||
|
||||
Args:
|
||||
auto_restart: Value for the auto_restart config key.
|
||||
alive: Whether the mock backend reports containers as alive.
|
||||
"""
|
||||
mod = _import_provider()
|
||||
with patch.object(mod.AioSandboxProvider, "_start_idle_checker"):
|
||||
provider = mod.AioSandboxProvider.__new__(mod.AioSandboxProvider)
|
||||
provider._config = {"auto_restart": auto_restart}
|
||||
provider._lock = threading.Lock()
|
||||
provider._sandboxes = {}
|
||||
provider._sandbox_infos = {}
|
||||
provider._thread_sandboxes = {}
|
||||
provider._thread_locks = {}
|
||||
provider._last_activity = {}
|
||||
provider._warm_pool = {}
|
||||
provider._shutdown_called = False
|
||||
provider._idle_checker_stop = threading.Event()
|
||||
|
||||
backend = MagicMock()
|
||||
backend.is_alive.return_value = alive
|
||||
provider._backend = backend
|
||||
|
||||
return provider, backend
|
||||
|
||||
|
||||
def _seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1"):
|
||||
"""Insert a sandbox into the provider's caches as if it were acquired."""
|
||||
sandbox = MagicMock()
|
||||
info = MagicMock()
|
||||
|
||||
provider._sandboxes[sandbox_id] = sandbox
|
||||
provider._sandbox_infos[sandbox_id] = info
|
||||
provider._last_activity[sandbox_id] = 0.0
|
||||
if thread_id:
|
||||
provider._thread_sandboxes[thread_id] = sandbox_id
|
||||
|
||||
return sandbox, info
|
||||
|
||||
|
||||
# ── get() returns sandbox when container is alive ──────────────────────────
|
||||
|
||||
|
||||
def test_get_returns_sandbox_when_container_alive():
|
||||
"""When auto_restart is on and the container is alive, get() returns the sandbox."""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=True)
|
||||
sandbox, _ = _seed_sandbox(provider)
|
||||
|
||||
result = provider.get("dead-beef")
|
||||
|
||||
assert result is sandbox
|
||||
backend.is_alive.assert_called_once()
|
||||
|
||||
|
||||
def test_get_returns_sandbox_when_auto_restart_disabled():
|
||||
"""When auto_restart is off, get() skips the health check entirely."""
|
||||
provider, backend = _make_provider(auto_restart=False)
|
||||
sandbox, _ = _seed_sandbox(provider)
|
||||
|
||||
result = provider.get("dead-beef")
|
||||
|
||||
assert result is sandbox
|
||||
backend.is_alive.assert_not_called()
|
||||
|
||||
|
||||
# ── get() evicts dead sandbox when auto_restart is on ──────────────────────
|
||||
|
||||
|
||||
def test_get_evicts_dead_sandbox_when_auto_restart_enabled():
|
||||
"""When the container is dead and auto_restart is on, get() returns None and cleans caches."""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||
_, info = _seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1")
|
||||
|
||||
result = provider.get("dead-beef")
|
||||
|
||||
assert result is None
|
||||
assert "dead-beef" not in provider._sandboxes
|
||||
assert "dead-beef" not in provider._sandbox_infos
|
||||
assert "dead-beef" not in provider._last_activity
|
||||
assert "thread-1" not in provider._thread_sandboxes
|
||||
backend.destroy.assert_called_once_with(info)
|
||||
|
||||
|
||||
def test_get_returns_dead_sandbox_when_auto_restart_disabled():
|
||||
"""When auto_restart is off, get() returns the cached sandbox even if the container is dead."""
|
||||
provider, backend = _make_provider(auto_restart=False, alive=False)
|
||||
sandbox, _ = _seed_sandbox(provider)
|
||||
|
||||
result = provider.get("dead-beef")
|
||||
|
||||
assert result is sandbox
|
||||
# Caches are untouched
|
||||
assert "dead-beef" in provider._sandboxes
|
||||
|
||||
|
||||
def test_get_eviction_cleans_multiple_thread_mappings():
|
||||
"""A sandbox mapped to multiple thread IDs has all mappings cleaned on eviction."""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||
_seed_sandbox(provider, sandbox_id="sid-1", thread_id="t-a")
|
||||
# Manually add a second thread mapping to the same sandbox
|
||||
provider._thread_sandboxes["t-b"] = "sid-1"
|
||||
|
||||
result = provider.get("sid-1")
|
||||
|
||||
assert result is None
|
||||
assert "t-a" not in provider._thread_sandboxes
|
||||
assert "t-b" not in provider._thread_sandboxes
|
||||
|
||||
|
||||
# ── get() does not check health for unknown sandbox IDs ────────────────────
|
||||
|
||||
|
||||
def test_get_returns_none_for_unknown_id():
|
||||
"""If the sandbox_id is not in cache, get() returns None without checking health."""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=True)
|
||||
|
||||
result = provider.get("nonexistent")
|
||||
|
||||
assert result is None
|
||||
backend.is_alive.assert_not_called()
|
||||
|
||||
|
||||
# ── get() handles missing sandbox_info gracefully ──────────────────────────
|
||||
|
||||
|
||||
def test_get_handles_missing_info_gracefully():
|
||||
"""If sandbox is cached but info is missing, get() skips the health check."""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||
sandbox = MagicMock()
|
||||
provider._sandboxes["sid-x"] = sandbox
|
||||
provider._sandbox_infos.pop("sid-x", None) # Ensure no info
|
||||
provider._last_activity["sid-x"] = 0.0
|
||||
|
||||
result = provider.get("sid-x")
|
||||
|
||||
# No info → cannot call is_alive → sandbox returned as-is
|
||||
assert result is sandbox
|
||||
backend.is_alive.assert_not_called()
|
||||
|
||||
|
||||
def test_get_liveness_check_runs_outside_provider_lock():
|
||||
"""get() should not hold the provider lock while checking backend liveness."""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||
_seed_sandbox(provider, sandbox_id="sid-locked", thread_id="thread-1")
|
||||
|
||||
def _assert_lock_not_held(_):
|
||||
assert not provider._lock.locked()
|
||||
return False
|
||||
|
||||
backend.is_alive.side_effect = _assert_lock_not_held
|
||||
|
||||
assert provider.get("sid-locked") is None
|
||||
|
||||
|
||||
def test_get_still_evicts_when_backend_destroy_fails():
|
||||
"""Cleanup errors should not keep stale sandbox state in memory."""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||
_seed_sandbox(provider, sandbox_id="sid-fail", thread_id="thread-1")
|
||||
backend.destroy.side_effect = RuntimeError("boom")
|
||||
|
||||
assert provider.get("sid-fail") is None
|
||||
assert "sid-fail" not in provider._sandboxes
|
||||
assert "sid-fail" not in provider._sandbox_infos
|
||||
assert "thread-1" not in provider._thread_sandboxes
|
||||
backend.destroy.assert_called_once()
|
||||
|
||||
|
||||
# ── Integration: eviction clears caches for recreation ─────────────────────
|
||||
|
||||
|
||||
def test_eviction_clears_all_caches_for_recreation():
|
||||
"""After eviction, all caches are clean so _acquire_internal can recreate.
|
||||
|
||||
This verifies the preconditions for transparent restart: when get() evicts
|
||||
a dead sandbox, the next _acquire_internal call will find no cached entry,
|
||||
no warm-pool entry, and fall through to _create_sandbox.
|
||||
"""
|
||||
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||
_seed_sandbox(provider, sandbox_id="sid-1", thread_id="thread-1")
|
||||
|
||||
# Before eviction: caches populated
|
||||
assert "sid-1" in provider._sandboxes
|
||||
assert "sid-1" in provider._sandbox_infos
|
||||
assert "thread-1" in provider._thread_sandboxes
|
||||
|
||||
# get() detects the dead container and evicts
|
||||
assert provider.get("sid-1") is None
|
||||
|
||||
# After eviction: all caches clean
|
||||
assert "sid-1" not in provider._sandboxes
|
||||
assert "sid-1" not in provider._sandbox_infos
|
||||
assert "thread-1" not in provider._thread_sandboxes
|
||||
assert "sid-1" not in provider._warm_pool
|
||||
|
||||
# _acquire_internal for the same thread would find nothing cached
|
||||
# and generate the deterministic ID, then discover fails (container
|
||||
# is gone), falling through to _create_sandbox — a fresh start.
|
||||
@@ -601,6 +601,11 @@ sandbox:
|
||||
# # Optional: Prefix for container names (default: deer-flow-sandbox)
|
||||
# # container_prefix: deer-flow-sandbox
|
||||
#
|
||||
# # Optional: Automatically restart crashed sandbox containers (default: true)
|
||||
# # When enabled, a dead container is detected on the next tool call and
|
||||
# # transparently replaced with a fresh one. Set to false to disable.
|
||||
# # auto_restart: true
|
||||
#
|
||||
# # Optional: Additional mount directories from host to container
|
||||
# # NOTE: Skills directory is automatically mounted from skills.path to skills.container_path
|
||||
# # mounts:
|
||||
|
||||
Reference in New Issue
Block a user