mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-24 08:55:59 +00:00
fix(sandbox): cleanup dead containers and avoid lock-held liveness checks
Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/96707445-0f8b-4901-8ef3-d8e5667f8a05 Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
b67c2a4e56
commit
dad3997459
@@ -624,24 +624,45 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
"""
|
"""
|
||||||
with self._lock:
|
with self._lock:
|
||||||
sandbox = self._sandboxes.get(sandbox_id)
|
sandbox = self._sandboxes.get(sandbox_id)
|
||||||
if sandbox is not None:
|
if sandbox is None:
|
||||||
self._last_activity[sandbox_id] = time.time()
|
return None
|
||||||
# Auto-restart: detect crashed containers and evict from cache
|
self._last_activity[sandbox_id] = time.time()
|
||||||
# so the next acquire() transparently recreates them.
|
auto_restart = self._config.get("auto_restart", True)
|
||||||
if self._config.get("auto_restart", True):
|
info = self._sandbox_infos.get(sandbox_id) if auto_restart else None
|
||||||
info = self._sandbox_infos.get(sandbox_id)
|
|
||||||
if info and not self._backend.is_alive(info):
|
if not info:
|
||||||
logger.warning(f"Sandbox {sandbox_id} container is not alive, evicting from cache for auto-restart")
|
|
||||||
self._sandboxes.pop(sandbox_id, None)
|
|
||||||
self._sandbox_infos.pop(sandbox_id, None)
|
|
||||||
self._last_activity.pop(sandbox_id, None)
|
|
||||||
thread_ids = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
|
||||||
for tid in thread_ids:
|
|
||||||
del self._thread_sandboxes[tid]
|
|
||||||
return None
|
|
||||||
return sandbox
|
|
||||||
return sandbox
|
return sandbox
|
||||||
|
|
||||||
|
if self._backend.is_alive(info):
|
||||||
|
return sandbox
|
||||||
|
|
||||||
|
info_to_destroy = None
|
||||||
|
with self._lock:
|
||||||
|
current_sandbox = self._sandboxes.get(sandbox_id)
|
||||||
|
current_info = self._sandbox_infos.get(sandbox_id)
|
||||||
|
if current_sandbox is None:
|
||||||
|
return None
|
||||||
|
if current_info is not info:
|
||||||
|
self._last_activity[sandbox_id] = time.time()
|
||||||
|
return current_sandbox
|
||||||
|
|
||||||
|
logger.warning(f"Sandbox {sandbox_id} container is not alive, evicting from cache for auto-restart")
|
||||||
|
self._sandboxes.pop(sandbox_id, None)
|
||||||
|
self._sandbox_infos.pop(sandbox_id, None)
|
||||||
|
self._last_activity.pop(sandbox_id, None)
|
||||||
|
self._warm_pool.pop(sandbox_id, None)
|
||||||
|
thread_ids = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||||
|
for tid in thread_ids:
|
||||||
|
del self._thread_sandboxes[tid]
|
||||||
|
info_to_destroy = info
|
||||||
|
|
||||||
|
if info_to_destroy:
|
||||||
|
try:
|
||||||
|
self._backend.destroy(info_to_destroy)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to cleanup dead sandbox {sandbox_id}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
def release(self, sandbox_id: str) -> None:
|
def release(self, sandbox_id: str) -> None:
|
||||||
"""Release a sandbox from active use into the warm pool.
|
"""Release a sandbox from active use into the warm pool.
|
||||||
|
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ def test_get_returns_sandbox_when_auto_restart_disabled():
|
|||||||
def test_get_evicts_dead_sandbox_when_auto_restart_enabled():
|
def test_get_evicts_dead_sandbox_when_auto_restart_enabled():
|
||||||
"""When the container is dead and auto_restart is on, get() returns None and cleans caches."""
|
"""When the container is dead and auto_restart is on, get() returns None and cleans caches."""
|
||||||
provider, backend = _make_provider(auto_restart=True, alive=False)
|
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||||
_seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1")
|
_, info = _seed_sandbox(provider, sandbox_id="dead-beef", thread_id="thread-1")
|
||||||
|
|
||||||
result = provider.get("dead-beef")
|
result = provider.get("dead-beef")
|
||||||
|
|
||||||
@@ -91,6 +91,7 @@ def test_get_evicts_dead_sandbox_when_auto_restart_enabled():
|
|||||||
assert "dead-beef" not in provider._sandbox_infos
|
assert "dead-beef" not in provider._sandbox_infos
|
||||||
assert "dead-beef" not in provider._last_activity
|
assert "dead-beef" not in provider._last_activity
|
||||||
assert "thread-1" not in provider._thread_sandboxes
|
assert "thread-1" not in provider._thread_sandboxes
|
||||||
|
backend.destroy.assert_called_once_with(info)
|
||||||
|
|
||||||
|
|
||||||
def test_get_returns_dead_sandbox_when_auto_restart_disabled():
|
def test_get_returns_dead_sandbox_when_auto_restart_disabled():
|
||||||
@@ -150,6 +151,33 @@ def test_get_handles_missing_info_gracefully():
|
|||||||
backend.is_alive.assert_not_called()
|
backend.is_alive.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_liveness_check_runs_outside_provider_lock():
|
||||||
|
"""get() should not hold the provider lock while checking backend liveness."""
|
||||||
|
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||||
|
_seed_sandbox(provider, sandbox_id="sid-locked", thread_id="thread-1")
|
||||||
|
|
||||||
|
def _assert_lock_not_held(_):
|
||||||
|
assert not provider._lock.locked()
|
||||||
|
return False
|
||||||
|
|
||||||
|
backend.is_alive.side_effect = _assert_lock_not_held
|
||||||
|
|
||||||
|
assert provider.get("sid-locked") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_still_evicts_when_backend_destroy_fails():
|
||||||
|
"""Cleanup errors should not keep stale sandbox state in memory."""
|
||||||
|
provider, backend = _make_provider(auto_restart=True, alive=False)
|
||||||
|
_seed_sandbox(provider, sandbox_id="sid-fail", thread_id="thread-1")
|
||||||
|
backend.destroy.side_effect = RuntimeError("boom")
|
||||||
|
|
||||||
|
assert provider.get("sid-fail") is None
|
||||||
|
assert "sid-fail" not in provider._sandboxes
|
||||||
|
assert "sid-fail" not in provider._sandbox_infos
|
||||||
|
assert "thread-1" not in provider._thread_sandboxes
|
||||||
|
backend.destroy.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
# ── Integration: eviction clears caches for recreation ─────────────────────
|
# ── Integration: eviction clears caches for recreation ─────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user