mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-11 09:55:59 +00:00
[codex] Fix stale AIO sandbox cache reuse (#3494)
* Fix stale AIO sandbox cache reuse * Address AIO sandbox review feedback * Distinguish sandbox health check failures * Keep local discovery recoverable when the runtime check fails LocalContainerBackend.discover() shares _is_container_running, which now raises on transient daemon errors instead of returning False. Discovery has no exception handling in _discover_or_create_with_lock(_async), so a brief Docker hiccup turned a recoverable "could not verify, create instead" into a hard acquire failure. Catch the check failure inside discover() and return None so an unverifiable container is simply not adopted, restoring the pre-change fall-through while keeping raise-on-unknown semantics protecting the destroy path. Reported by fancy-agent on PR #3494. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com> * Narrow the not-found match in container inspect error handling A bare "not found" substring also matches transient failures like "command not found" or "context not found", which would misclassify a check error as "container definitely gone" and bypass the raise-on-unknown contract. Keep Docker's specific "No such object"/"No such container" phrases, and only trust a generic "not found" (Apple Container) when the message names the inspected container or refers to a container/object. Reported by WillemJiang on PR #3494. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com> --------- Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from deerflow.community.aio_sandbox.local_backend import (
|
||||
LocalContainerBackend,
|
||||
_format_container_command_for_log,
|
||||
@@ -234,3 +237,99 @@ def test_start_container_keeps_apple_container_port_format(monkeypatch):
|
||||
captured_cmd = _capture_start_container_command(monkeypatch, backend, runtime="container")
|
||||
|
||||
assert captured_cmd[captured_cmd.index("-p") + 1] == "18080:8080"
|
||||
|
||||
|
||||
def _backend_for_inspect_tests() -> LocalContainerBackend:
|
||||
backend = LocalContainerBackend(
|
||||
image="sandbox:latest",
|
||||
base_port=8080,
|
||||
container_prefix="sandbox",
|
||||
config_mounts=[],
|
||||
environment={},
|
||||
)
|
||||
backend._runtime = "docker"
|
||||
return backend
|
||||
|
||||
|
||||
def test_is_container_running_false_when_container_missing(monkeypatch):
|
||||
backend = _backend_for_inspect_tests()
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return SimpleNamespace(stdout="", stderr="Error: No such object: sandbox-missing", returncode=1)
|
||||
|
||||
monkeypatch.setattr("subprocess.run", fake_run)
|
||||
|
||||
assert backend._is_container_running("sandbox-missing") is False
|
||||
|
||||
|
||||
def test_is_container_running_raises_on_runtime_error(monkeypatch):
|
||||
backend = _backend_for_inspect_tests()
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return SimpleNamespace(stdout="", stderr="Cannot connect to the Docker daemon", returncode=1)
|
||||
|
||||
monkeypatch.setattr("subprocess.run", fake_run)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to inspect container sandbox-busy"):
|
||||
backend._is_container_running("sandbox-busy")
|
||||
|
||||
|
||||
def test_is_container_running_raises_on_timeout(monkeypatch):
|
||||
backend = _backend_for_inspect_tests()
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
raise subprocess.TimeoutExpired(cmd=cmd, timeout=kwargs["timeout"])
|
||||
|
||||
monkeypatch.setattr("subprocess.run", fake_run)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Timed out checking container sandbox-timeout"):
|
||||
backend._is_container_running("sandbox-timeout")
|
||||
|
||||
|
||||
def test_discover_returns_none_when_runtime_check_fails(monkeypatch):
|
||||
"""A transient daemon error during discovery must fall through to create, not fail acquire."""
|
||||
backend = _backend_for_inspect_tests()
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return SimpleNamespace(stdout="", stderr="Cannot connect to the Docker daemon", returncode=1)
|
||||
|
||||
monkeypatch.setattr("subprocess.run", fake_run)
|
||||
|
||||
assert backend.discover("sandbox-blip") is None
|
||||
|
||||
|
||||
def test_discover_returns_none_when_runtime_check_times_out(monkeypatch):
|
||||
"""An inspect timeout during discovery must not propagate out of discover()."""
|
||||
backend = _backend_for_inspect_tests()
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
raise subprocess.TimeoutExpired(cmd=cmd, timeout=kwargs["timeout"])
|
||||
|
||||
monkeypatch.setattr("subprocess.run", fake_run)
|
||||
|
||||
assert backend.discover("sandbox-timeout") is None
|
||||
|
||||
|
||||
def test_is_container_running_false_on_apple_container_not_found(monkeypatch):
|
||||
"""Apple Container's generic "not found" is trusted when it names the container."""
|
||||
backend = _backend_for_inspect_tests()
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return SimpleNamespace(stdout="", stderr='Error: not found: "sandbox-apple"', returncode=1)
|
||||
|
||||
monkeypatch.setattr("subprocess.run", fake_run)
|
||||
|
||||
assert backend._is_container_running("sandbox-apple") is False
|
||||
|
||||
|
||||
def test_is_container_running_raises_on_unrelated_not_found_error(monkeypatch):
|
||||
"""Transient errors whose text contains "not found" must not be misread as a dead container."""
|
||||
backend = _backend_for_inspect_tests()
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return SimpleNamespace(stdout="", stderr="Error: credential helper not found in $PATH", returncode=1)
|
||||
|
||||
monkeypatch.setattr("subprocess.run", fake_run)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to inspect container sandbox-busy"):
|
||||
backend._is_container_running("sandbox-busy")
|
||||
|
||||
@@ -317,6 +317,28 @@ async def test_acquire_async_cancelled_waiter_does_not_block_successor(tmp_path,
|
||||
pytest.fail("provider thread lock was not released after successor acquire_async")
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_acquire_internal_async_offloads_cached_reuse_health_check(tmp_path, monkeypatch):
|
||||
"""Async cached reuse must keep backend health checks off the event loop."""
|
||||
aio_mod = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")
|
||||
provider, _sandbox, _ = _make_provider_with_active_sandbox(tmp_path, "sandbox-cached-async")
|
||||
provider._thread_sandboxes = {"thread-cached-async": "sandbox-cached-async"}
|
||||
provider._backend.is_alive = MagicMock(return_value=True)
|
||||
|
||||
to_thread_calls: list[tuple[object, tuple[object, ...]]] = []
|
||||
|
||||
async def fake_to_thread(func, /, *args, **kwargs):
|
||||
to_thread_calls.append((func, args))
|
||||
return func(*args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(aio_mod.asyncio, "to_thread", fake_to_thread)
|
||||
|
||||
sandbox_id = await provider._acquire_internal_async("thread-cached-async")
|
||||
|
||||
assert sandbox_id == "sandbox-cached-async"
|
||||
assert to_thread_calls == [(provider._reuse_in_process_sandbox, ("thread-cached-async",))]
|
||||
|
||||
|
||||
def test_remote_backend_create_forwards_effective_user_id(monkeypatch):
|
||||
"""Provisioner mode must receive user_id so PVC subPath matches user isolation."""
|
||||
remote_mod = importlib.import_module("deerflow.community.aio_sandbox.remote_backend")
|
||||
@@ -424,6 +446,136 @@ def test_release_swallows_close_errors(tmp_path, caplog):
|
||||
assert "sandbox-rel-err" in provider._warm_pool
|
||||
|
||||
|
||||
def test_get_uses_in_memory_registry_only(tmp_path):
|
||||
"""get() must stay event-loop safe by avoiding backend health checks."""
|
||||
provider, sandbox, _ = _make_provider_with_active_sandbox(tmp_path, "sandbox-dead")
|
||||
provider._backend.is_alive = MagicMock(side_effect=AssertionError("get must not call backend health checks"))
|
||||
|
||||
assert provider.get("sandbox-dead") is sandbox
|
||||
|
||||
|
||||
def test_acquire_drops_dead_cached_sandbox(tmp_path, monkeypatch):
|
||||
"""acquire() must replace a stale active cache entry after its container dies."""
|
||||
aio_mod = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")
|
||||
provider, sandbox, _ = _make_provider_with_active_sandbox(tmp_path, "sandbox-dead")
|
||||
provider._thread_locks = {}
|
||||
provider._thread_sandboxes = {"thread-dead": "sandbox-dead"}
|
||||
provider._config = {"replicas": 3}
|
||||
provider._backend.is_alive = MagicMock(return_value=False)
|
||||
provider._backend.discover = MagicMock(return_value=None)
|
||||
provider._backend.create = MagicMock(
|
||||
return_value=aio_mod.SandboxInfo(
|
||||
sandbox_id="sandbox-dead",
|
||||
sandbox_url="http://fresh-sandbox",
|
||||
container_name="deer-flow-sandbox-sandbox-dead",
|
||||
)
|
||||
)
|
||||
|
||||
monkeypatch.setattr(aio_mod.AioSandboxProvider, "_sandbox_id_for_thread", lambda _self, _thread_id: "sandbox-dead")
|
||||
monkeypatch.setattr(aio_mod.AioSandboxProvider, "_get_extra_mounts", lambda _self, _thread_id: [])
|
||||
monkeypatch.setattr(aio_mod, "get_paths", lambda: Paths(base_dir=tmp_path))
|
||||
monkeypatch.setattr(aio_mod, "get_effective_user_id", lambda: None)
|
||||
monkeypatch.setattr(aio_mod, "wait_for_sandbox_ready", lambda _url, timeout=60: True)
|
||||
|
||||
sandbox_id = provider.acquire("thread-dead")
|
||||
|
||||
assert sandbox_id == "sandbox-dead"
|
||||
sandbox.close.assert_called_once_with()
|
||||
provider._backend.destroy.assert_called_once()
|
||||
provider._backend.create.assert_called_once()
|
||||
assert provider._thread_sandboxes["thread-dead"] == "sandbox-dead"
|
||||
assert provider._sandboxes["sandbox-dead"].base_url == "http://fresh-sandbox"
|
||||
|
||||
|
||||
def test_acquire_keeps_cached_sandbox_when_health_check_errors(tmp_path):
|
||||
"""Transient backend health-check errors must not destroy a tracked sandbox."""
|
||||
provider, sandbox, _ = _make_provider_with_active_sandbox(tmp_path, "sandbox-transient")
|
||||
provider._thread_locks = {}
|
||||
provider._thread_sandboxes = {"thread-transient": "sandbox-transient"}
|
||||
provider._backend.is_alive = MagicMock(side_effect=OSError("docker daemon busy"))
|
||||
|
||||
sandbox_id = provider.acquire("thread-transient")
|
||||
|
||||
assert sandbox_id == "sandbox-transient"
|
||||
sandbox.close.assert_not_called()
|
||||
provider._backend.destroy.assert_not_called()
|
||||
assert provider._sandboxes["sandbox-transient"] is sandbox
|
||||
|
||||
|
||||
def test_drop_unhealthy_sandbox_skips_recreated_entry(tmp_path):
|
||||
"""A stale health-check result must not delete a newly registered sandbox."""
|
||||
aio_mod = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")
|
||||
provider = _make_provider(tmp_path)
|
||||
provider._lock = aio_mod.threading.Lock()
|
||||
provider._warm_pool = {}
|
||||
provider._last_activity = {"sandbox-toctou": 1.0}
|
||||
provider._thread_sandboxes = {"thread-toctou": "sandbox-toctou"}
|
||||
old_info = aio_mod.SandboxInfo(sandbox_id="sandbox-toctou", sandbox_url="http://old-sandbox")
|
||||
new_info = aio_mod.SandboxInfo(sandbox_id="sandbox-toctou", sandbox_url="http://new-sandbox")
|
||||
new_sandbox = MagicMock()
|
||||
provider._sandbox_infos = {"sandbox-toctou": new_info}
|
||||
provider._sandboxes = {"sandbox-toctou": new_sandbox}
|
||||
provider._backend = SimpleNamespace(destroy=MagicMock())
|
||||
|
||||
provider._drop_unhealthy_sandbox("sandbox-toctou", "stale health check", expected_info=old_info)
|
||||
|
||||
new_sandbox.close.assert_not_called()
|
||||
provider._backend.destroy.assert_not_called()
|
||||
assert provider._sandbox_infos["sandbox-toctou"] is new_info
|
||||
assert provider._sandboxes["sandbox-toctou"] is new_sandbox
|
||||
assert provider._thread_sandboxes == {"thread-toctou": "sandbox-toctou"}
|
||||
|
||||
|
||||
def test_acquire_skips_dead_warm_pool_sandbox(tmp_path, monkeypatch):
|
||||
"""acquire() must create a fresh sandbox when the warm-pool entry died."""
|
||||
aio_mod = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")
|
||||
provider = _make_provider(tmp_path)
|
||||
provider._lock = aio_mod.threading.Lock()
|
||||
provider._thread_locks = {}
|
||||
provider._sandboxes = {}
|
||||
provider._sandbox_infos = {}
|
||||
provider._thread_sandboxes = {}
|
||||
provider._last_activity = {}
|
||||
provider._warm_pool = {
|
||||
"sandbox-warm-dead": (
|
||||
aio_mod.SandboxInfo(
|
||||
sandbox_id="sandbox-warm-dead",
|
||||
sandbox_url="http://stale-sandbox",
|
||||
container_name="deer-flow-sandbox-sandbox-warm-dead",
|
||||
),
|
||||
0.0,
|
||||
)
|
||||
}
|
||||
provider._config = {"replicas": 3}
|
||||
provider._backend = SimpleNamespace(
|
||||
is_alive=MagicMock(return_value=False),
|
||||
destroy=MagicMock(),
|
||||
discover=MagicMock(return_value=None),
|
||||
create=MagicMock(
|
||||
return_value=aio_mod.SandboxInfo(
|
||||
sandbox_id="sandbox-warm-dead",
|
||||
sandbox_url="http://fresh-sandbox",
|
||||
container_name="deer-flow-sandbox-sandbox-warm-dead",
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(aio_mod.AioSandboxProvider, "_sandbox_id_for_thread", lambda _self, _thread_id: "sandbox-warm-dead")
|
||||
monkeypatch.setattr(aio_mod.AioSandboxProvider, "_get_extra_mounts", lambda _self, _thread_id: [])
|
||||
monkeypatch.setattr(aio_mod, "get_paths", lambda: Paths(base_dir=tmp_path))
|
||||
monkeypatch.setattr(aio_mod, "get_effective_user_id", lambda: None)
|
||||
monkeypatch.setattr(aio_mod, "wait_for_sandbox_ready", lambda _url, timeout=60: True)
|
||||
|
||||
sandbox_id = provider.acquire("thread-warm-dead")
|
||||
|
||||
assert sandbox_id == "sandbox-warm-dead"
|
||||
provider._backend.destroy.assert_called_once()
|
||||
provider._backend.create.assert_called_once()
|
||||
assert provider._warm_pool == {}
|
||||
assert provider._thread_sandboxes["thread-warm-dead"] == "sandbox-warm-dead"
|
||||
assert provider._sandboxes["sandbox-warm-dead"].base_url == "http://fresh-sandbox"
|
||||
|
||||
|
||||
def test_destroy_swallows_close_errors_and_still_destroys_backend(tmp_path, caplog):
|
||||
"""A failure in sandbox.close() must not skip backend container destruction."""
|
||||
provider, sandbox, _ = _make_provider_with_active_sandbox(tmp_path, "sandbox-dest-err")
|
||||
|
||||
@@ -257,14 +257,38 @@ def test_provisioner_is_alive_true_only_when_status_running(monkeypatch):
|
||||
assert backend._provisioner_is_alive("abc123") is False
|
||||
|
||||
|
||||
def test_provisioner_is_alive_returns_false_on_request_exception(monkeypatch):
|
||||
def test_provisioner_is_alive_returns_false_on_404(monkeypatch):
|
||||
backend = RemoteSandboxBackend("http://provisioner:8002")
|
||||
|
||||
def mock_get(url: str, timeout: int):
|
||||
return _StubResponse(status_code=404)
|
||||
|
||||
monkeypatch.setattr(requests, "get", mock_get)
|
||||
assert backend._provisioner_is_alive("abc123") is False
|
||||
|
||||
|
||||
def test_provisioner_is_alive_raises_on_request_exception(monkeypatch):
|
||||
backend = RemoteSandboxBackend("http://provisioner:8002")
|
||||
|
||||
def mock_get(url: str, timeout: int):
|
||||
raise requests.RequestException("boom")
|
||||
|
||||
monkeypatch.setattr(requests, "get", mock_get)
|
||||
assert backend._provisioner_is_alive("abc123") is False
|
||||
with pytest.raises(RuntimeError, match="Provisioner health check failed for abc123"):
|
||||
backend._provisioner_is_alive("abc123")
|
||||
|
||||
|
||||
def test_provisioner_is_alive_raises_on_server_error(monkeypatch):
|
||||
backend = RemoteSandboxBackend("http://provisioner:8002")
|
||||
|
||||
def mock_get(url: str, timeout: int):
|
||||
response = _StubResponse(status_code=503)
|
||||
response.text = "unavailable"
|
||||
return response
|
||||
|
||||
monkeypatch.setattr(requests, "get", mock_get)
|
||||
with pytest.raises(RuntimeError, match="HTTP 503 unavailable"):
|
||||
backend._provisioner_is_alive("abc123")
|
||||
|
||||
|
||||
def test_discover_delegates_to_provisioner_discover(monkeypatch):
|
||||
|
||||
Reference in New Issue
Block a user