diff --git a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py index 8da10242b..532b475c8 100644 --- a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py +++ b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py @@ -624,24 +624,45 @@ class AioSandboxProvider(SandboxProvider): """ with self._lock: sandbox = self._sandboxes.get(sandbox_id) - if sandbox is not None: - self._last_activity[sandbox_id] = time.time() - # Auto-restart: detect crashed containers and evict from cache - # so the next acquire() transparently recreates them. - if self._config.get("auto_restart", True): - info = self._sandbox_infos.get(sandbox_id) - if info and not self._backend.is_alive(info): - logger.warning(f"Sandbox {sandbox_id} container is not alive, evicting from cache for auto-restart") - self._sandboxes.pop(sandbox_id, None) - self._sandbox_infos.pop(sandbox_id, None) - self._last_activity.pop(sandbox_id, None) - thread_ids = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id] - for tid in thread_ids: - del self._thread_sandboxes[tid] - return None - return sandbox + if sandbox is None: + return None + self._last_activity[sandbox_id] = time.time() + auto_restart = self._config.get("auto_restart", True) + info = self._sandbox_infos.get(sandbox_id) if auto_restart else None + + if not info: return sandbox + if self._backend.is_alive(info): + return sandbox + + info_to_destroy = None + with self._lock: + current_sandbox = self._sandboxes.get(sandbox_id) + current_info = self._sandbox_infos.get(sandbox_id) + if current_sandbox is None: + return None + if current_info is not info: + self._last_activity[sandbox_id] = time.time() + return current_sandbox + + logger.warning(f"Sandbox {sandbox_id} container is not alive, evicting from cache for auto-restart") + self._sandboxes.pop(sandbox_id, None) + self._sandbox_infos.pop(sandbox_id, None) + self._last_activity.pop(sandbox_id, None) + self._warm_pool.pop(sandbox_id, None) + thread_ids = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id] + for tid in thread_ids: + del self._thread_sandboxes[tid] + info_to_destroy = info + + if info_to_destroy: + try: + self._backend.destroy(info_to_destroy) + except Exception as e: + logger.warning(f"Failed to cleanup dead sandbox {sandbox_id}: {e}") + return None + def release(self, sandbox_id: str) -> None: """Release a sandbox from active use into the warm pool. diff --git a/backend/tests/test_aio_sandbox_auto_restart.py b/backend/tests/test_aio_sandbox_auto_restart.py index e561f481c..6d6cf6df2 100644 --- a/backend/tests/test_aio_sandbox_auto_restart.py +++ b/backend/tests/test_aio_sandbox_auto_restart.py @@ -82,7 +82,7 @@ def test_get_returns_sandbox_when_auto_restart_disabled(): def test_get_evicts_dead_sandbox_when_auto_restart_enabled(): """When the container is dead and auto_restart is on, get() returns None and cleans caches.""" provider, backend = _make_provider(auto_restart=True, alive=False) - _seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1") + _, info = _seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1") result = provider.get("dead-beef") @@ -91,6 +91,7 @@ def test_get_evicts_dead_sandbox_when_auto_restart_enabled(): assert "dead-beef" not in provider._sandbox_infos assert "dead-beef" not in provider._last_activity assert "thread-1" not in provider._thread_sandboxes + backend.destroy.assert_called_once_with(info) def test_get_returns_dead_sandbox_when_auto_restart_disabled(): @@ -150,6 +151,33 @@ def test_get_handles_missing_info_gracefully(): backend.is_alive.assert_not_called() +def test_get_liveness_check_runs_outside_provider_lock(): + """get() should not hold the provider lock while checking backend liveness.""" + provider, backend = _make_provider(auto_restart=True, alive=False) + _seed_sandbox(provider, sandbox_id="sid-locked", thread_id="thread-1") + + def _assert_lock_not_held(_): + assert not provider._lock.locked() + return False + + backend.is_alive.side_effect = _assert_lock_not_held + + assert provider.get("sid-locked") is None + + +def test_get_still_evicts_when_backend_destroy_fails(): + """Cleanup errors should not keep stale sandbox state in memory.""" + provider, backend = _make_provider(auto_restart=True, alive=False) + _seed_sandbox(provider, sandbox_id="sid-fail", thread_id="thread-1") + backend.destroy.side_effect = RuntimeError("boom") + + assert provider.get("sid-fail") is None + assert "sid-fail" not in provider._sandboxes + assert "sid-fail" not in provider._sandbox_infos + assert "thread-1" not in provider._thread_sandboxes + backend.destroy.assert_called_once() + + # ── Integration: eviction clears caches for recreation ─────────────────────