mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-22 07:56:48 +00:00
fix(sandbox): add startup reconciliation to prevent orphaned container leaks (#1976)
* fix(sandbox): add startup reconciliation to prevent orphaned container leaks

  Sandbox containers were never cleaned up when the managing process restarted, because all lifecycle tracking lived in in-memory dictionaries. This adds startup reconciliation that enumerates running containers via `docker ps` and either destroys orphans (age > idle_timeout) or adopts them into the warm pool.

  Closes #1972

* fix(sandbox): address Copilot review — adopt-all strategy, improved error handling

  - Reconciliation now adopts all containers into warm pool unconditionally, letting the idle checker decide cleanup. Avoids destroying containers that another concurrent process may still be using.
  - list_running() logs stderr on docker ps failure and catches FileNotFoundError/OSError.
  - Signal handler test restores SIGTERM/SIGINT in addition to SIGHUP.
  - E2E test docstring corrected to match actual coverage scope.

* fix(sandbox): address maintainer review — batch inspect, lock tightening, import hygiene

  - _reconcile_orphans(): merge check-and-insert into a single lock acquisition per container to eliminate the TOCTOU window.
  - list_running(): batch the per-container docker inspect into a single call. Total subprocess calls drop from 2N+1 to 2 (one ps + one batch inspect). Parse port and created_at from the inspect JSON payload.
  - Extract _parse_docker_timestamp() and _extract_host_port() as module-level pure helpers and test them directly.
  - Move datetime/json imports to module top level.
  - _make_provider_for_reconciliation(): document the __new__ bypass and the lockstep coupling to AioSandboxProvider.__init__.
  - Add assertion that list_running() makes exactly ONE inspect call.
This commit is contained in:
@@ -112,6 +112,9 @@ class AioSandboxProvider(SandboxProvider):
|
||||
atexit.register(self.shutdown)
|
||||
self._register_signal_handlers()
|
||||
|
||||
# Reconcile orphaned containers from previous process lifecycles
|
||||
self._reconcile_orphans()
|
||||
|
||||
# Start idle checker if enabled
|
||||
if self._config.get("idle_timeout", DEFAULT_IDLE_TIMEOUT) > 0:
|
||||
self._start_idle_checker()
|
||||
@@ -175,6 +178,51 @@ class AioSandboxProvider(SandboxProvider):
|
||||
resolved[key] = str(value)
|
||||
return resolved
|
||||
|
||||
# ── Startup reconciliation ────────────────────────────────────────────
|
||||
|
||||
def _reconcile_orphans(self) -> None:
|
||||
"""Reconcile orphaned containers left by previous process lifecycles.
|
||||
|
||||
On startup, enumerate all running containers matching our prefix
|
||||
and adopt them all into the warm pool. The idle checker will reclaim
|
||||
containers that nobody re-acquires within ``idle_timeout``.
|
||||
|
||||
All containers are adopted unconditionally because we cannot
|
||||
distinguish "orphaned" from "actively used by another process"
|
||||
based on age alone — ``idle_timeout`` represents inactivity, not
|
||||
uptime. Adopting into the warm pool and letting the idle checker
|
||||
decide avoids destroying containers that a concurrent process may
|
||||
still be using.
|
||||
|
||||
This closes the fundamental gap where in-memory state loss (process
|
||||
restart, crash, SIGKILL) leaves Docker containers running forever.
|
||||
"""
|
||||
try:
|
||||
running = self._backend.list_running()
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to enumerate running containers during startup reconciliation: {e}")
|
||||
return
|
||||
|
||||
if not running:
|
||||
return
|
||||
|
||||
current_time = time.time()
|
||||
adopted = 0
|
||||
|
||||
for info in running:
|
||||
age = current_time - info.created_at if info.created_at > 0 else float("inf")
|
||||
# Single lock acquisition per container: atomic check-and-insert.
|
||||
# Avoids a TOCTOU window between the "already tracked?" check and
|
||||
# the warm-pool insert.
|
||||
with self._lock:
|
||||
if info.sandbox_id in self._sandboxes or info.sandbox_id in self._warm_pool:
|
||||
continue
|
||||
self._warm_pool[info.sandbox_id] = (info, current_time)
|
||||
adopted += 1
|
||||
logger.info(f"Adopted container {info.sandbox_id} into warm pool (age: {age:.0f}s)")
|
||||
|
||||
logger.info(f"Startup reconciliation complete: {adopted} adopted into warm pool, {len(running)} total found")
|
||||
|
||||
# ── Deterministic ID ─────────────────────────────────────────────────
|
||||
|
||||
@staticmethod
|
||||
@@ -316,13 +364,23 @@ class AioSandboxProvider(SandboxProvider):
|
||||
# ── Signal handling ──────────────────────────────────────────────────
|
||||
|
||||
def _register_signal_handlers(self) -> None:
|
||||
"""Register signal handlers for graceful shutdown."""
|
||||
"""Register signal handlers for graceful shutdown.
|
||||
|
||||
Handles SIGTERM, SIGINT, and SIGHUP (terminal close) to ensure
|
||||
sandbox containers are cleaned up even when the user closes the terminal.
|
||||
"""
|
||||
self._original_sigterm = signal.getsignal(signal.SIGTERM)
|
||||
self._original_sigint = signal.getsignal(signal.SIGINT)
|
||||
self._original_sighup = signal.getsignal(signal.SIGHUP) if hasattr(signal, "SIGHUP") else None
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
self.shutdown()
|
||||
original = self._original_sigterm if signum == signal.SIGTERM else self._original_sigint
|
||||
if signum == signal.SIGTERM:
|
||||
original = self._original_sigterm
|
||||
elif hasattr(signal, "SIGHUP") and signum == signal.SIGHUP:
|
||||
original = self._original_sighup
|
||||
else:
|
||||
original = self._original_sigint
|
||||
if callable(original):
|
||||
original(signum, frame)
|
||||
elif original == signal.SIG_DFL:
|
||||
@@ -332,6 +390,8 @@ class AioSandboxProvider(SandboxProvider):
|
||||
try:
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
if hasattr(signal, "SIGHUP"):
|
||||
signal.signal(signal.SIGHUP, signal_handler)
|
||||
except ValueError:
|
||||
logger.debug("Could not register signal handlers (not main thread)")
|
||||
|
||||
|
||||
@@ -96,3 +96,19 @@ class SandboxBackend(ABC):
|
||||
SandboxInfo if found and healthy, None otherwise.
|
||||
"""
|
||||
...
|
||||
|
||||
def list_running(self) -> list[SandboxInfo]:
    """Return every sandbox currently running under this backend.

    Startup reconciliation calls this after a process restart to discover
    sandboxes created by earlier processes so they can be tracked again.

    This base implementation reports nothing. That is the right answer
    for backends that do not own local containers (for example a remote
    backend whose provisioner handles its own cleanup); backends that do
    own containers override it.

    Returns:
        A new list of SandboxInfo, one per running sandbox. Always empty
        for the default implementation.
    """
    no_sandboxes: list[SandboxInfo] = []
    return no_sandboxes
|
||||
|
||||
@@ -6,9 +6,11 @@ Handles container lifecycle, port allocation, and cross-process container discov
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
|
||||
from deerflow.utils.network import get_free_port, release_port
|
||||
|
||||
@@ -18,6 +20,52 @@ from .sandbox_info import SandboxInfo
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _parse_docker_timestamp(raw: str) -> float:
|
||||
"""Parse Docker's ISO 8601 timestamp into a Unix epoch float.
|
||||
|
||||
Docker returns timestamps with nanosecond precision and a trailing ``Z``
|
||||
(e.g. ``2026-04-08T01:22:50.123456789Z``). Python's ``fromisoformat``
|
||||
accepts at most microseconds and (pre-3.11) does not accept ``Z``, so the
|
||||
string is normalized before parsing. Returns ``0.0`` on empty input or
|
||||
parse failure so callers can use ``0.0`` as a sentinel for "unknown age".
|
||||
"""
|
||||
if not raw:
|
||||
return 0.0
|
||||
try:
|
||||
s = raw.strip()
|
||||
if "." in s:
|
||||
dot_pos = s.index(".")
|
||||
tz_start = dot_pos + 1
|
||||
while tz_start < len(s) and s[tz_start].isdigit():
|
||||
tz_start += 1
|
||||
frac = s[dot_pos + 1 : tz_start][:6] # truncate to microseconds
|
||||
tz_suffix = s[tz_start:]
|
||||
s = s[: dot_pos + 1] + frac + tz_suffix
|
||||
if s.endswith("Z"):
|
||||
s = s[:-1] + "+00:00"
|
||||
return datetime.fromisoformat(s).timestamp()
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.debug(f"Could not parse docker timestamp {raw!r}: {e}")
|
||||
return 0.0
|
||||
|
||||
|
||||
def _extract_host_port(inspect_entry: dict, container_port: int) -> int | None:
|
||||
"""Extract the host port mapped to ``container_port/tcp`` from a docker inspect entry.
|
||||
|
||||
Returns None if the container has no port mapping for that port.
|
||||
"""
|
||||
try:
|
||||
ports = (inspect_entry.get("NetworkSettings") or {}).get("Ports") or {}
|
||||
bindings = ports.get(f"{container_port}/tcp") or []
|
||||
if bindings:
|
||||
host_port = bindings[0].get("HostPort")
|
||||
if host_port:
|
||||
return int(host_port)
|
||||
except (ValueError, TypeError, AttributeError):
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _format_container_mount(runtime: str, host_path: str, container_path: str, read_only: bool) -> list[str]:
|
||||
"""Format a bind-mount argument for the selected runtime.
|
||||
|
||||
@@ -172,8 +220,12 @@ class LocalContainerBackend(SandboxBackend):
|
||||
|
||||
def destroy(self, info: SandboxInfo) -> None:
|
||||
"""Stop the container and release its port."""
|
||||
if info.container_id:
|
||||
self._stop_container(info.container_id)
|
||||
# Prefer container_id, fall back to container_name (both accepted by docker stop).
|
||||
# This ensures containers discovered via list_running() (which only has the name)
|
||||
# can also be stopped.
|
||||
stop_target = info.container_id or info.container_name
|
||||
if stop_target:
|
||||
self._stop_container(stop_target)
|
||||
# Extract port from sandbox_url for release
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
@@ -222,6 +274,129 @@ class LocalContainerBackend(SandboxBackend):
|
||||
container_name=container_name,
|
||||
)
|
||||
|
||||
def list_running(self) -> list[SandboxInfo]:
    """List every running container whose name starts with the configured prefix.

    Two subprocess calls are made in total: one ``ps`` to collect
    container names, then one batched ``inspect`` (via
    ``_batch_inspect``) to obtain creation time and host port for all of
    them at once.

    The runtime's ``--filter name=`` matches substrings, so names are
    re-checked with ``startswith`` to keep only true prefix matches.

    A container with no port mapping is still returned, with an empty
    ``sandbox_url``, so startup reconciliation can adopt it regardless.
    Containers for which inspect yields no data are skipped.

    Returns:
        One SandboxInfo per discovered container; an empty list when
        nothing is running or the runtime cannot be queried.
    """
    # Step 1: enumerate container names via docker ps
    ps_command = [
        self._runtime,
        "ps",
        "--filter",
        f"name={self._container_prefix}-",
        "--format",
        "{{.Names}}",
    ]
    try:
        result = subprocess.run(ps_command, capture_output=True, text=True, timeout=10)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
        logger.warning(f"Failed to list running containers: {e}")
        return []

    if result.returncode != 0:
        stderr = (result.stderr or "").strip()
        logger.warning(
            "Failed to list running containers with %s ps (returncode=%s, stderr=%s)",
            self._runtime,
            result.returncode,
            stderr or "<empty>",
        )
        return []

    listing = result.stdout.strip()
    if not listing:
        return []

    # Keep only names carrying our exact prefix (the filter is substring-based).
    required_prefix = self._container_prefix + "-"
    container_names = []
    for line in listing.splitlines():
        candidate = line.strip()
        if candidate.startswith(required_prefix):
            container_names.append(candidate)
    if not container_names:
        return []

    # Step 2: one batched inspect call covering all containers
    inspections = self._batch_inspect(container_names)

    sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
    id_offset = len(self._container_prefix) + 1  # skip "<prefix>-"
    infos: list[SandboxInfo] = []
    for container_name in container_names:
        data = inspections.get(container_name)
        if data is None:
            # Vanished between ps and inspect, or inspect failed.
            continue
        created_at, host_port = data
        if host_port:
            sandbox_url = f"http://{sandbox_host}:{host_port}"
        else:
            sandbox_url = ""
        infos.append(
            SandboxInfo(
                sandbox_id=container_name[id_offset:],
                sandbox_url=sandbox_url,
                container_name=container_name,
                created_at=created_at,
            )
        )

    logger.info(f"Found {len(infos)} running sandbox container(s)")
    return infos
|
||||
|
||||
def _batch_inspect(self, container_names: list[str]) -> dict[str, tuple[float, int | None]]:
|
||||
"""Batch-inspect containers in a single subprocess call.
|
||||
|
||||
Returns a mapping of ``container_name -> (created_at, host_port)``.
|
||||
Missing containers or parse failures are silently dropped from the result.
|
||||
"""
|
||||
if not container_names:
|
||||
return {}
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[self._runtime, "inspect", *container_names],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=15,
|
||||
)
|
||||
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
|
||||
logger.warning(f"Failed to batch-inspect containers: {e}")
|
||||
return {}
|
||||
|
||||
if result.returncode != 0:
|
||||
stderr = (result.stderr or "").strip()
|
||||
logger.warning(
|
||||
"Failed to batch-inspect containers with %s inspect (returncode=%s, stderr=%s)",
|
||||
self._runtime,
|
||||
result.returncode,
|
||||
stderr or "<empty>",
|
||||
)
|
||||
return {}
|
||||
|
||||
try:
|
||||
payload = json.loads(result.stdout or "[]")
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse docker inspect output as JSON: {e}")
|
||||
return {}
|
||||
|
||||
out: dict[str, tuple[float, int | None]] = {}
|
||||
for entry in payload:
|
||||
# ``Name`` is prefixed with ``/`` in the docker inspect response
|
||||
name = (entry.get("Name") or "").lstrip("/")
|
||||
if not name:
|
||||
continue
|
||||
created_at = _parse_docker_timestamp(entry.get("Created", ""))
|
||||
host_port = _extract_host_port(entry, 8080)
|
||||
out[name] = (created_at, host_port)
|
||||
return out
|
||||
|
||||
# ── Container operations ─────────────────────────────────────────────
|
||||
|
||||
def _start_container(
|
||||
|
||||
Reference in New Issue
Block a user