From b67c2a4e5622332d1df9e28e10a212d6070651e8 Mon Sep 17 00:00:00 2001 From: Willem Jiang Date: Sun, 10 May 2026 22:53:58 +0800 Subject: [PATCH] fix(sandbox): auto-restart crashed containers transparently (#2788) When a sandbox container crashes (e.g. due to an internal error), the agent enters a connection-refused loop because AioSandboxProvider.get() returns a cached but dead sandbox object. Add a liveness check in get() that detects crashed containers via backend.is_alive() and evicts them from all caches, allowing ensure_sandbox_initialized() to transparently recreate a fresh container on the next acquire(). The behavior is controlled by a new config option (default: true). Set to false to skip health checks and preserve the old behavior of returning stale cached sandboxes. Closes #2788 --- .../aio_sandbox/aio_sandbox_provider.py | 24 ++- .../harness/deerflow/config/sandbox_config.py | 7 + .../tests/test_aio_sandbox_auto_restart.py | 182 ++++++++++++++++++ config.example.yaml | 5 + 4 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 backend/tests/test_aio_sandbox_auto_restart.py diff --git a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py index 292a43758..8da10242b 100644 --- a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py +++ b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py @@ -80,6 +80,7 @@ class AioSandboxProvider(SandboxProvider): port: 8080 # Base port for local containers container_prefix: deer-flow-sandbox idle_timeout: 600 # Idle timeout in seconds (0 to disable) + auto_restart: true # Restart crashed containers automatically replicas: 3 # Max concurrent sandbox containers (LRU eviction when exceeded) mounts: # Volume mounts for local containers - host_path: /path/on/host @@ -164,12 +165,14 @@ class AioSandboxProvider(SandboxProvider): idle_timeout = getattr(sandbox_config, "idle_timeout", None) replicas = getattr(sandbox_config, "replicas", None) + auto_restart = getattr(sandbox_config, "auto_restart", True) return { "image": sandbox_config.image or DEFAULT_IMAGE, "port": sandbox_config.port or DEFAULT_PORT, "container_prefix": sandbox_config.container_prefix or DEFAULT_CONTAINER_PREFIX, "idle_timeout": idle_timeout if idle_timeout is not None else DEFAULT_IDLE_TIMEOUT, + "auto_restart": auto_restart, "replicas": replicas if replicas is not None else DEFAULT_REPLICAS, "mounts": sandbox_config.mounts or [], "environment": self._resolve_env_vars(sandbox_config.environment or {}), @@ -608,16 +611,35 @@ class AioSandboxProvider(SandboxProvider): def get(self, sandbox_id: str) -> Sandbox | None: """Get a sandbox by ID. Updates last activity timestamp. + When ``auto_restart`` is enabled (the default), the container's liveness + is verified on each lookup. If the underlying container has crashed, the + sandbox is evicted from all caches so that the next ``acquire()`` call will + transparently create a fresh container. + Args: sandbox_id: The ID of the sandbox. Returns: - The sandbox instance if found, None otherwise. + The sandbox instance if found and alive, None otherwise. """ with self._lock: sandbox = self._sandboxes.get(sandbox_id) if sandbox is not None: self._last_activity[sandbox_id] = time.time() + # Auto-restart: detect crashed containers and evict from cache + # so the next acquire() transparently recreates them. + if self._config.get("auto_restart", True): + info = self._sandbox_infos.get(sandbox_id) + if info and not self._backend.is_alive(info): + logger.warning(f"Sandbox {sandbox_id} container is not alive, evicting from cache for auto-restart") + self._sandboxes.pop(sandbox_id, None) + self._sandbox_infos.pop(sandbox_id, None) + self._last_activity.pop(sandbox_id, None) + thread_ids = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id] + for tid in thread_ids: + del self._thread_sandboxes[tid] + return None + return sandbox return sandbox def release(self, sandbox_id: str) -> None: diff --git a/backend/packages/harness/deerflow/config/sandbox_config.py b/backend/packages/harness/deerflow/config/sandbox_config.py index d9aac4ab4..429f2961a 100644 --- a/backend/packages/harness/deerflow/config/sandbox_config.py +++ b/backend/packages/harness/deerflow/config/sandbox_config.py @@ -23,6 +23,9 @@ class SandboxConfig(BaseModel): replicas: Maximum number of concurrent sandbox containers (default: 3). When the limit is reached the least-recently-used sandbox is evicted to make room. container_prefix: Prefix for container names (default: deer-flow-sandbox) idle_timeout: Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable. + auto_restart: Automatically restart sandbox containers that have crashed (default: true). When a tool call + detects the container is no longer alive, the sandbox is evicted from cache and transparently recreated + on the next acquire. Set to false to disable. mounts: List of volume mounts to share directories with the container environment: Environment variables to inject into the container (values starting with $ are resolved from host env) """ @@ -55,6 +58,10 @@ class SandboxConfig(BaseModel): default=None, description="Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.", ) + auto_restart: bool = Field( + default=True, + description="Automatically restart sandbox containers that have crashed. When a tool call detects the container is no longer alive, the sandbox is evicted from cache and transparently recreated on the next acquire.", + ) mounts: list[VolumeMountConfig] = Field( default_factory=list, description="List of volume mounts to share directories between host and container", diff --git a/backend/tests/test_aio_sandbox_auto_restart.py b/backend/tests/test_aio_sandbox_auto_restart.py new file mode 100644 index 000000000..e561f481c --- /dev/null +++ b/backend/tests/test_aio_sandbox_auto_restart.py @@ -0,0 +1,182 @@ +"""Tests for AioSandboxProvider auto-restart of crashed containers.""" + +import importlib +import threading +from unittest.mock import MagicMock, patch + + +def _import_provider(): + return importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider") + + +def _make_provider(*, auto_restart=True, alive=True): + """Build a minimal AioSandboxProvider with a mock backend. + + Args: + auto_restart: Value for the auto_restart config key. + alive: Whether the mock backend reports containers as alive. + """ + mod = _import_provider() + with patch.object(mod.AioSandboxProvider, "_start_idle_checker"): + provider = mod.AioSandboxProvider.__new__(mod.AioSandboxProvider) + provider._config = {"auto_restart": auto_restart} + provider._lock = threading.Lock() + provider._sandboxes = {} + provider._sandbox_infos = {} + provider._thread_sandboxes = {} + provider._thread_locks = {} + provider._last_activity = {} + provider._warm_pool = {} + provider._shutdown_called = False + provider._idle_checker_stop = threading.Event() + + backend = MagicMock() + backend.is_alive.return_value = alive + provider._backend = backend + + return provider, backend + + +def _seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1"): + """Insert a sandbox into the provider's caches as if it were acquired.""" + sandbox = MagicMock() + info = MagicMock() + + provider._sandboxes[sandbox_id] = sandbox + provider._sandbox_infos[sandbox_id] = info + provider._last_activity[sandbox_id] = 0.0 + if thread_id: + provider._thread_sandboxes[thread_id] = sandbox_id + + return sandbox, info + + +# ── get() returns sandbox when container is alive ────────────────────────── + + +def test_get_returns_sandbox_when_container_alive(): + """When auto_restart is on and the container is alive, get() returns the sandbox.""" + provider, backend = _make_provider(auto_restart=True, alive=True) + sandbox, _ = _seed_sandbox(provider) + + result = provider.get("dead-beef") + + assert result is sandbox + backend.is_alive.assert_called_once() + + +def test_get_returns_sandbox_when_auto_restart_disabled(): + """When auto_restart is off, get() skips the health check entirely.""" + provider, backend = _make_provider(auto_restart=False) + sandbox, _ = _seed_sandbox(provider) + + result = provider.get("dead-beef") + + assert result is sandbox + backend.is_alive.assert_not_called() + + +# ── get() evicts dead sandbox when auto_restart is on ────────────────────── + + +def test_get_evicts_dead_sandbox_when_auto_restart_enabled(): + """When the container is dead and auto_restart is on, get() returns None and cleans caches.""" + provider, backend = _make_provider(auto_restart=True, alive=False) + _seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1") + + result = provider.get("dead-beef") + + assert result is None + assert "dead-beef" not in provider._sandboxes + assert "dead-beef" not in provider._sandbox_infos + assert "dead-beef" not in provider._last_activity + assert "thread-1" not in provider._thread_sandboxes + + +def test_get_returns_dead_sandbox_when_auto_restart_disabled(): + """When auto_restart is off, get() returns the cached sandbox even if the container is dead.""" + provider, backend = _make_provider(auto_restart=False, alive=False) + sandbox, _ = _seed_sandbox(provider) + + result = provider.get("dead-beef") + + assert result is sandbox + # Caches are untouched + assert "dead-beef" in provider._sandboxes + + +def test_get_eviction_cleans_multiple_thread_mappings(): + """A sandbox mapped to multiple thread IDs has all mappings cleaned on eviction.""" + provider, backend = _make_provider(auto_restart=True, alive=False) + _seed_sandbox(provider, sandbox_id="sid-1", thread_id="t-a") + # Manually add a second thread mapping to the same sandbox + provider._thread_sandboxes["t-b"] = "sid-1" + + result = provider.get("sid-1") + + assert result is None + assert "t-a" not in provider._thread_sandboxes + assert "t-b" not in provider._thread_sandboxes + + +# ── get() does not check health for unknown sandbox IDs ──────────────────── + + +def test_get_returns_none_for_unknown_id(): + """If the sandbox_id is not in cache, get() returns None without checking health.""" + provider, backend = _make_provider(auto_restart=True, alive=True) + + result = provider.get("nonexistent") + + assert result is None + backend.is_alive.assert_not_called() + + +# ── get() handles missing sandbox_info gracefully ────────────────────────── + + +def test_get_handles_missing_info_gracefully(): + """If sandbox is cached but info is missing, get() skips the health check.""" + provider, backend = _make_provider(auto_restart=True, alive=False) + sandbox = MagicMock() + provider._sandboxes["sid-x"] = sandbox + provider._sandbox_infos.pop("sid-x", None) # Ensure no info + provider._last_activity["sid-x"] = 0.0 + + result = provider.get("sid-x") + + # No info → cannot call is_alive → sandbox returned as-is + assert result is sandbox + backend.is_alive.assert_not_called() + + +# ── Integration: eviction clears caches for recreation ───────────────────── + + +def test_eviction_clears_all_caches_for_recreation(): + """After eviction, all caches are clean so _acquire_internal can recreate. + + This verifies the preconditions for transparent restart: when get() evicts + a dead sandbox, the next _acquire_internal call will find no cached entry, + no warm-pool entry, and fall through to _create_sandbox. + """ + provider, backend = _make_provider(auto_restart=True, alive=False) + _seed_sandbox(provider, sandbox_id="sid-1", thread_id="thread-1") + + # Before eviction: caches populated + assert "sid-1" in provider._sandboxes + assert "sid-1" in provider._sandbox_infos + assert "thread-1" in provider._thread_sandboxes + + # get() detects the dead container and evicts + assert provider.get("sid-1") is None + + # After eviction: all caches clean + assert "sid-1" not in provider._sandboxes + assert "sid-1" not in provider._sandbox_infos + assert "thread-1" not in provider._thread_sandboxes + assert "sid-1" not in provider._warm_pool + + # _acquire_internal for the same thread would find nothing cached + # and generate the deterministic ID, then discover fails (container + # is gone), falling through to _create_sandbox — a fresh start. diff --git a/config.example.yaml b/config.example.yaml index 9a8d07bf4..c25178dc4 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -601,6 +601,11 @@ sandbox: # # Optional: Prefix for container names (default: deer-flow-sandbox) # # container_prefix: deer-flow-sandbox # +# # Optional: Automatically restart crashed sandbox containers (default: true) +# # When enabled, a dead container is detected on the next tool call and +# # transparently replaced with a fresh one. Set to false to disable. +# # auto_restart: true +# # # Optional: Additional mount directories from host to container # # NOTE: Skills directory is automatically mounted from skills.path to skills.container_path # # mounts: