mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-11 18:05:58 +00:00
[codex] Fix stale AIO sandbox cache reuse (#3494)
* Fix stale AIO sandbox cache reuse * Address AIO sandbox review feedback * Distinguish sandbox health check failures * Keep local discovery recoverable when the runtime check fails LocalContainerBackend.discover() shares _is_container_running, which now raises on transient daemon errors instead of returning False. Discovery has no exception handling in _discover_or_create_with_lock(_async), so a brief Docker hiccup turned a recoverable "could not verify, create instead" into a hard acquire failure. Catch the check failure inside discover() and return None so an unverifiable container is simply not adopted, restoring the pre-change fall-through while keeping raise-on-unknown semantics protecting the destroy path. Reported by fancy-agent on PR #3494. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com> * Narrow the not-found match in container inspect error handling A bare "not found" substring also matches transient failures like "command not found" or "context not found", which would misclassify a check error as "container definitely gone" and bypass the raise-on-unknown contract. Keep Docker's specific "No such object"/"No such container" phrases, and only trust a generic "not found" (Apple Container) when the message names the inspected container or refers to a container/object. Reported by WillemJiang on PR #3494. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com> --------- Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -470,14 +470,32 @@ class AioSandboxProvider(SandboxProvider):
|
||||
|
||||
existing_id = self._thread_sandboxes[thread_id]
|
||||
if existing_id in self._sandboxes:
|
||||
suffix = " (post-lock check)" if post_lock else ""
|
||||
logger.info(f"Reusing in-process sandbox {existing_id} for thread {thread_id}{suffix}")
|
||||
self._last_activity[existing_id] = time.time()
|
||||
return existing_id
|
||||
info = self._sandbox_infos.get(existing_id)
|
||||
else:
|
||||
del self._thread_sandboxes[thread_id]
|
||||
return None
|
||||
|
||||
del self._thread_sandboxes[thread_id]
|
||||
alive = self._check_tracked_sandbox_alive(existing_id, info) if info is not None else True
|
||||
if alive is False:
|
||||
self._drop_unhealthy_sandbox(
|
||||
existing_id,
|
||||
"in-process cache failed health check",
|
||||
expected_info=info,
|
||||
)
|
||||
return None
|
||||
|
||||
with self._lock:
|
||||
if self._thread_sandboxes.get(thread_id) != existing_id:
|
||||
return None
|
||||
if existing_id not in self._sandboxes:
|
||||
self._thread_sandboxes.pop(thread_id, None)
|
||||
return None
|
||||
|
||||
suffix = " (post-lock check)" if post_lock else ""
|
||||
logger.info(f"Reusing in-process sandbox {existing_id} for thread {thread_id}{suffix}")
|
||||
self._last_activity[existing_id] = time.time()
|
||||
return existing_id
|
||||
|
||||
def _reclaim_warm_pool_sandbox(self, thread_id: str | None, sandbox_id: str, *, post_lock: bool = False) -> str | None:
|
||||
"""Promote a warm-pool sandbox back to active tracking if available."""
|
||||
if thread_id is None:
|
||||
@@ -487,7 +505,22 @@ class AioSandboxProvider(SandboxProvider):
|
||||
if sandbox_id not in self._warm_pool:
|
||||
return None
|
||||
|
||||
info, _ = self._warm_pool.pop(sandbox_id)
|
||||
info, _ = self._warm_pool[sandbox_id]
|
||||
|
||||
alive = self._check_tracked_sandbox_alive(sandbox_id, info)
|
||||
if alive is False:
|
||||
self._drop_unhealthy_sandbox(
|
||||
sandbox_id,
|
||||
"warm-pool cache failed health check",
|
||||
expected_info=info,
|
||||
)
|
||||
return None
|
||||
|
||||
with self._lock:
|
||||
warm_item = self._warm_pool.pop(sandbox_id, None)
|
||||
if warm_item is None:
|
||||
return None
|
||||
info, _ = warm_item
|
||||
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
|
||||
self._sandboxes[sandbox_id] = sandbox
|
||||
self._sandbox_infos[sandbox_id] = info
|
||||
@@ -527,6 +560,70 @@ class AioSandboxProvider(SandboxProvider):
|
||||
logger.info(f"Created sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
|
||||
return sandbox_id
|
||||
|
||||
def _check_tracked_sandbox_alive(self, sandbox_id: str, info: SandboxInfo) -> bool | None:
|
||||
"""Return whether a tracked sandbox appears alive, or None if unknown."""
|
||||
try:
|
||||
return self._backend.is_alive(info)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to check sandbox {sandbox_id} health: {e}")
|
||||
return None
|
||||
|
||||
def _remove_tracked_sandbox(
|
||||
self,
|
||||
sandbox_id: str,
|
||||
*,
|
||||
expected_info: SandboxInfo | None = None,
|
||||
) -> tuple[Sandbox | None, SandboxInfo | None, bool]:
|
||||
"""Remove a sandbox from in-process tracking maps.
|
||||
|
||||
When expected_info is provided, removal only happens if the currently
|
||||
tracked active or warm-pool entry is the exact info object that was
|
||||
checked. This prevents a stale health-check result from deleting a
|
||||
freshly recreated sandbox with the same deterministic id.
|
||||
"""
|
||||
thread_ids_to_remove: list[str] = []
|
||||
|
||||
with self._lock:
|
||||
active_info = self._sandbox_infos.get(sandbox_id)
|
||||
warm_item = self._warm_pool.get(sandbox_id)
|
||||
warm_info = warm_item[0] if warm_item is not None else None
|
||||
if expected_info is not None and active_info is not expected_info and warm_info is not expected_info:
|
||||
return None, None, False
|
||||
|
||||
sandbox = self._sandboxes.pop(sandbox_id, None)
|
||||
info = self._sandbox_infos.pop(sandbox_id, None)
|
||||
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||
for tid in thread_ids_to_remove:
|
||||
del self._thread_sandboxes[tid]
|
||||
self._last_activity.pop(sandbox_id, None)
|
||||
if info is None and sandbox_id in self._warm_pool:
|
||||
info, _ = self._warm_pool.pop(sandbox_id)
|
||||
else:
|
||||
self._warm_pool.pop(sandbox_id, None)
|
||||
|
||||
return sandbox, info, True
|
||||
|
||||
def _drop_unhealthy_sandbox(self, sandbox_id: str, reason: str, *, expected_info: SandboxInfo | None = None) -> None:
|
||||
"""Remove and destroy a sandbox after a definitive failed health check."""
|
||||
sandbox, info, removed = self._remove_tracked_sandbox(sandbox_id, expected_info=expected_info)
|
||||
if not removed:
|
||||
logger.info(f"Skipped dropping sandbox {sandbox_id}: tracked info changed after health check")
|
||||
return
|
||||
|
||||
if sandbox is not None:
|
||||
try:
|
||||
sandbox.close()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error closing unhealthy sandbox {sandbox_id}: {e}")
|
||||
|
||||
if info is not None:
|
||||
try:
|
||||
self._backend.destroy(info)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error destroying unhealthy sandbox {sandbox_id}: {e}")
|
||||
|
||||
logger.warning(f"Dropped unhealthy sandbox {sandbox_id}: {reason}")
|
||||
|
||||
def _replica_count(self) -> tuple[int, int]:
|
||||
"""Return configured replicas and currently tracked sandbox count."""
|
||||
replicas = self._config.get("replicas", DEFAULT_REPLICAS)
|
||||
@@ -617,7 +714,7 @@ class AioSandboxProvider(SandboxProvider):
|
||||
|
||||
async def _acquire_internal_async(self, thread_id: str | None) -> str:
|
||||
"""Async counterpart to ``_acquire_internal``."""
|
||||
cached_id = self._reuse_in_process_sandbox(thread_id)
|
||||
cached_id = await asyncio.to_thread(self._reuse_in_process_sandbox, thread_id)
|
||||
if cached_id is not None:
|
||||
return cached_id
|
||||
|
||||
@@ -625,7 +722,7 @@ class AioSandboxProvider(SandboxProvider):
|
||||
sandbox_id = self._sandbox_id_for_thread(thread_id)
|
||||
|
||||
# ── Layer 1.5: Warm pool (container still running, no cold-start) ──
|
||||
reclaimed_id = self._reclaim_warm_pool_sandbox(thread_id, sandbox_id)
|
||||
reclaimed_id = await asyncio.to_thread(self._reclaim_warm_pool_sandbox, thread_id, sandbox_id)
|
||||
if reclaimed_id is not None:
|
||||
return reclaimed_id
|
||||
|
||||
@@ -681,7 +778,7 @@ class AioSandboxProvider(SandboxProvider):
|
||||
locked = True
|
||||
# Re-check in-process caches under the file lock in case another
|
||||
# thread in this process won the race while we were waiting.
|
||||
cached_id = self._recheck_cached_sandbox(thread_id, sandbox_id)
|
||||
cached_id = await asyncio.to_thread(self._recheck_cached_sandbox, thread_id, sandbox_id)
|
||||
if cached_id is not None:
|
||||
return cached_id
|
||||
|
||||
@@ -837,22 +934,7 @@ class AioSandboxProvider(SandboxProvider):
|
||||
Args:
|
||||
sandbox_id: The ID of the sandbox to destroy.
|
||||
"""
|
||||
info = None
|
||||
sandbox = None
|
||||
thread_ids_to_remove: list[str] = []
|
||||
|
||||
with self._lock:
|
||||
sandbox = self._sandboxes.pop(sandbox_id, None)
|
||||
info = self._sandbox_infos.pop(sandbox_id, None)
|
||||
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||
for tid in thread_ids_to_remove:
|
||||
del self._thread_sandboxes[tid]
|
||||
self._last_activity.pop(sandbox_id, None)
|
||||
# Also pull from warm pool if it was parked there
|
||||
if info is None and sandbox_id in self._warm_pool:
|
||||
info, _ = self._warm_pool.pop(sandbox_id)
|
||||
else:
|
||||
self._warm_pool.pop(sandbox_id, None)
|
||||
sandbox, info, _ = self._remove_tracked_sandbox(sandbox_id)
|
||||
|
||||
if sandbox is not None:
|
||||
# Defense-in-depth: close() already swallows its own errors; this
|
||||
|
||||
@@ -169,6 +169,24 @@ def _resolve_docker_bind_host(sandbox_host: str | None = None, bind_host: str |
|
||||
return "0.0.0.0"
|
||||
|
||||
|
||||
def _is_no_such_container_error(stderr: str, container_name: str) -> bool:
|
||||
"""Return True only when stderr definitively says the container does not exist.
|
||||
|
||||
Docker reports "No such object" / "No such container". Apple Container
|
||||
reports a generic "not found", so that phrase is only trusted when the
|
||||
message also names the inspected container (or refers to a
|
||||
container/object); transient failures whose text happens to contain
|
||||
"not found" (e.g. "command not found", "context not found") must stay on
|
||||
the raise path instead of being misread as a dead container.
|
||||
"""
|
||||
message = stderr.lower()
|
||||
if "no such object" in message or "no such container" in message:
|
||||
return True
|
||||
if "not found" not in message:
|
||||
return False
|
||||
return container_name.lower() in message or "container" in message or "object" in message
|
||||
|
||||
|
||||
class LocalContainerBackend(SandboxBackend):
|
||||
"""Backend that manages sandbox containers locally using Docker or Apple Container.
|
||||
|
||||
@@ -335,11 +353,21 @@ class LocalContainerBackend(SandboxBackend):
|
||||
sandbox_id: The deterministic sandbox ID (determines container name).
|
||||
|
||||
Returns:
|
||||
SandboxInfo if container found and healthy, None otherwise.
|
||||
SandboxInfo if container found and healthy, None otherwise. A
|
||||
failed runtime check (e.g. transient daemon error) also returns
|
||||
None — discovery must not adopt a container it cannot verify, and
|
||||
falling through to create keeps acquire recoverable instead of
|
||||
hard-failing on a hiccup.
|
||||
"""
|
||||
container_name = f"{self._container_prefix}-{sandbox_id}"
|
||||
|
||||
if not self._is_container_running(container_name):
|
||||
try:
|
||||
running = self._is_container_running(container_name)
|
||||
except RuntimeError as e:
|
||||
logger.warning(f"Could not verify container {container_name} during discovery; not adopting it: {e}")
|
||||
return None
|
||||
|
||||
if not running:
|
||||
return None
|
||||
|
||||
port = self._get_container_port(container_name)
|
||||
@@ -582,6 +610,13 @@ class LocalContainerBackend(SandboxBackend):
|
||||
|
||||
This enables cross-process container discovery — any process can detect
|
||||
containers started by another process via the deterministic container name.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the container runtime cannot answer the inspect
|
||||
query. A failed check is intentionally distinct from a
|
||||
definitive "container does not exist" result so callers do not
|
||||
destroy healthy containers during transient Docker/Container
|
||||
daemon failures.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
@@ -590,9 +625,14 @@ class LocalContainerBackend(SandboxBackend):
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
return result.returncode == 0 and result.stdout.strip().lower() == "true"
|
||||
except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
raise RuntimeError(f"Timed out checking container {container_name}") from exc
|
||||
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip().lower() == "true"
|
||||
if _is_no_such_container_error(result.stderr, container_name):
|
||||
return False
|
||||
raise RuntimeError(f"Failed to inspect container {container_name}: {result.stderr.strip()}")
|
||||
|
||||
def _get_container_port(self, container_name: str) -> int | None:
|
||||
"""Get the host port of a running container.
|
||||
|
||||
@@ -176,12 +176,16 @@ class RemoteSandboxBackend(SandboxBackend):
|
||||
f"{self._provisioner_url}/api/sandboxes/{sandbox_id}",
|
||||
timeout=10,
|
||||
)
|
||||
if resp.ok:
|
||||
data = resp.json()
|
||||
return data.get("status") == "Running"
|
||||
return False
|
||||
except requests.RequestException:
|
||||
except requests.RequestException as exc:
|
||||
raise RuntimeError(f"Provisioner health check failed for {sandbox_id}: {exc}") from exc
|
||||
|
||||
if resp.status_code == 404:
|
||||
return False
|
||||
if not resp.ok:
|
||||
raise RuntimeError(f"Provisioner health check failed for {sandbox_id}: HTTP {resp.status_code} {resp.text}")
|
||||
|
||||
data = resp.json()
|
||||
return data.get("status") == "Running"
|
||||
|
||||
def _provisioner_discover(self, sandbox_id: str) -> SandboxInfo | None:
|
||||
"""GET /api/sandboxes/{sandbox_id} → discover existing sandbox."""
|
||||
|
||||
Reference in New Issue
Block a user