[codex] Fix stale AIO sandbox cache reuse (#3494)

* Fix stale AIO sandbox cache reuse

* Address AIO sandbox review feedback

* Distinguish sandbox health check failures

* Keep local discovery recoverable when the runtime check fails

LocalContainerBackend.discover() shares _is_container_running, which now
raises on transient daemon errors instead of returning False. Discovery has
no exception handling in _discover_or_create_with_lock(_async), so a brief
Docker hiccup turned a recoverable "could not verify, create instead" into a
hard acquire failure. Catch the check failure inside discover() and return
None so an unverifiable container is simply not adopted, restoring the
pre-change fall-through while keeping raise-on-unknown semantics protecting
the destroy path.

Reported by fancy-agent on PR #3494.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

* Narrow the not-found match in container inspect error handling

A bare "not found" substring also matches transient failures like "command
not found" or "context not found", which would misclassify a check error as
"container definitely gone" and bypass the raise-on-unknown contract. Keep
Docker's specific "No such object"/"No such container" phrases, and only
trust a generic "not found" (Apple Container) when the message names the
inspected container or refers to a container/object.

Reported by WillemJiang on PR #3494.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
DanielWalnut
2026-06-11 17:53:37 +08:00
committed by GitHub
parent 919d8bc279
commit f401e7baa6
8 changed files with 439 additions and 38 deletions
@@ -1,7 +1,10 @@
import logging
import os
import subprocess
from types import SimpleNamespace
import pytest
from deerflow.community.aio_sandbox.local_backend import (
LocalContainerBackend,
_format_container_command_for_log,
@@ -234,3 +237,99 @@ def test_start_container_keeps_apple_container_port_format(monkeypatch):
captured_cmd = _capture_start_container_command(monkeypatch, backend, runtime="container")
assert captured_cmd[captured_cmd.index("-p") + 1] == "18080:8080"
def _backend_for_inspect_tests() -> LocalContainerBackend:
    """Build a ``LocalContainerBackend`` for the container-inspect tests.

    The backend is constructed with fixed test values and its ``_runtime``
    attribute is then set to ``"docker"`` before it is returned.
    """
    constructor_kwargs = {
        "image": "sandbox:latest",
        "base_port": 8080,
        "container_prefix": "sandbox",
        "config_mounts": [],
        "environment": {},
    }
    inspect_backend = LocalContainerBackend(**constructor_kwargs)
    inspect_backend._runtime = "docker"
    return inspect_backend
def test_is_container_running_false_when_container_missing(monkeypatch):
    """A "No such object" inspect failure is reported as "not running" (False)."""
    inspect_backend = _backend_for_inspect_tests()

    def inspect_reports_missing(cmd, **kwargs):
        # Simulated inspect result: non-zero exit with Docker's missing-object message.
        return SimpleNamespace(
            stdout="",
            stderr="Error: No such object: sandbox-missing",
            returncode=1,
        )

    monkeypatch.setattr(subprocess, "run", inspect_reports_missing)

    running = inspect_backend._is_container_running("sandbox-missing")
    assert running is False
def test_is_container_running_raises_on_runtime_error(monkeypatch):
    """A daemon-connection error from inspect must surface as RuntimeError."""
    inspect_backend = _backend_for_inspect_tests()

    def daemon_unreachable(cmd, **kwargs):
        # Non-zero exit whose stderr is a connection failure, not a missing container.
        return SimpleNamespace(
            stdout="",
            stderr="Cannot connect to the Docker daemon",
            returncode=1,
        )

    monkeypatch.setattr(subprocess, "run", daemon_unreachable)

    expected_message = "Failed to inspect container sandbox-busy"
    with pytest.raises(RuntimeError, match=expected_message):
        inspect_backend._is_container_running("sandbox-busy")
def test_is_container_running_raises_on_timeout(monkeypatch):
    """A ``subprocess.TimeoutExpired`` during inspect must surface as RuntimeError."""
    inspect_backend = _backend_for_inspect_tests()

    def inspect_times_out(cmd, **kwargs):
        # Echo back the timeout the caller passed so the exception is well-formed.
        requested_timeout = kwargs["timeout"]
        raise subprocess.TimeoutExpired(cmd=cmd, timeout=requested_timeout)

    monkeypatch.setattr(subprocess, "run", inspect_times_out)

    expected_message = "Timed out checking container sandbox-timeout"
    with pytest.raises(RuntimeError, match=expected_message):
        inspect_backend._is_container_running("sandbox-timeout")
def test_discover_returns_none_when_runtime_check_fails(monkeypatch):
    """discover() must return None, not raise, when inspect hits a daemon error."""
    inspect_backend = _backend_for_inspect_tests()

    def daemon_unreachable(cmd, **kwargs):
        # Same failure shape as a brief Docker daemon outage.
        return SimpleNamespace(
            stdout="",
            stderr="Cannot connect to the Docker daemon",
            returncode=1,
        )

    monkeypatch.setattr(subprocess, "run", daemon_unreachable)

    discovered = inspect_backend.discover("sandbox-blip")
    assert discovered is None
def test_discover_returns_none_when_runtime_check_times_out(monkeypatch):
    """discover() must return None, not raise, when the inspect call times out."""
    inspect_backend = _backend_for_inspect_tests()

    def inspect_times_out(cmd, **kwargs):
        requested_timeout = kwargs["timeout"]
        raise subprocess.TimeoutExpired(cmd=cmd, timeout=requested_timeout)

    monkeypatch.setattr(subprocess, "run", inspect_times_out)

    discovered = inspect_backend.discover("sandbox-timeout")
    assert discovered is None
def test_is_container_running_false_on_apple_container_not_found(monkeypatch):
    """A generic "not found" error that names the inspected container yields False."""
    inspect_backend = _backend_for_inspect_tests()

    def not_found_naming_container(cmd, **kwargs):
        # stderr quotes the exact container name that is being inspected.
        return SimpleNamespace(
            stdout="",
            stderr='Error: not found: "sandbox-apple"',
            returncode=1,
        )

    monkeypatch.setattr(subprocess, "run", not_found_naming_container)

    running = inspect_backend._is_container_running("sandbox-apple")
    assert running is False
def test_is_container_running_raises_on_unrelated_not_found_error(monkeypatch):
    """A "not found" message unrelated to the container must raise, not return False."""
    inspect_backend = _backend_for_inspect_tests()

    def credential_helper_missing(cmd, **kwargs):
        # Contains the words "not found" but says nothing about the container itself.
        return SimpleNamespace(
            stdout="",
            stderr="Error: credential helper not found in $PATH",
            returncode=1,
        )

    monkeypatch.setattr(subprocess, "run", credential_helper_missing)

    expected_message = "Failed to inspect container sandbox-busy"
    with pytest.raises(RuntimeError, match=expected_message):
        inspect_backend._is_container_running("sandbox-busy")
+152
View File
@@ -317,6 +317,28 @@ async def test_acquire_async_cancelled_waiter_does_not_block_successor(tmp_path,
pytest.fail("provider thread lock was not released after successor acquire_async")
@pytest.mark.anyio
async def test_acquire_internal_async_offloads_cached_reuse_health_check(tmp_path, monkeypatch):
    """The async acquire path must route cached-sandbox reuse through ``asyncio.to_thread``."""
    provider_module = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")
    provider, _active_sandbox, _unused = _make_provider_with_active_sandbox(tmp_path, "sandbox-cached-async")
    provider._thread_sandboxes = {"thread-cached-async": "sandbox-cached-async"}
    provider._backend.is_alive = MagicMock(return_value=True)

    offloaded: list[tuple[object, tuple[object, ...]]] = []

    async def recording_to_thread(func, /, *args, **kwargs):
        # Record what was offloaded, then run it inline so the test stays deterministic.
        offloaded.append((func, args))
        return func(*args, **kwargs)

    monkeypatch.setattr(provider_module.asyncio, "to_thread", recording_to_thread)

    acquired_id = await provider._acquire_internal_async("thread-cached-async")

    assert acquired_id == "sandbox-cached-async"
    expected_calls = [(provider._reuse_in_process_sandbox, ("thread-cached-async",))]
    assert offloaded == expected_calls
def test_remote_backend_create_forwards_effective_user_id(monkeypatch):
"""Provisioner mode must receive user_id so PVC subPath matches user isolation."""
remote_mod = importlib.import_module("deerflow.community.aio_sandbox.remote_backend")
@@ -424,6 +446,136 @@ def test_release_swallows_close_errors(tmp_path, caplog):
assert "sandbox-rel-err" in provider._warm_pool
def test_get_uses_in_memory_registry_only(tmp_path):
    """get() must return the tracked sandbox without calling the backend health check."""
    provider, tracked_sandbox, _unused = _make_provider_with_active_sandbox(tmp_path, "sandbox-dead")

    # Any call to is_alive raises, so the test fails if get() touches the backend.
    forbidden_check = MagicMock(side_effect=AssertionError("get must not call backend health checks"))
    provider._backend.is_alive = forbidden_check

    fetched = provider.get("sandbox-dead")
    assert fetched is tracked_sandbox
def test_acquire_drops_dead_cached_sandbox(tmp_path, monkeypatch):
    """acquire() must close and destroy a cached sandbox reported dead, then create a fresh one."""
    provider_module = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")
    provider, stale_sandbox, _unused = _make_provider_with_active_sandbox(tmp_path, "sandbox-dead")
    provider._thread_locks = {}
    provider._thread_sandboxes = {"thread-dead": "sandbox-dead"}
    provider._config = {"replicas": 3}

    # Backend: cached container is dead, nothing to discover, create() yields a replacement.
    replacement_info = provider_module.SandboxInfo(
        sandbox_id="sandbox-dead",
        sandbox_url="http://fresh-sandbox",
        container_name="deer-flow-sandbox-sandbox-dead",
    )
    provider._backend.is_alive = MagicMock(return_value=False)
    provider._backend.discover = MagicMock(return_value=None)
    provider._backend.create = MagicMock(return_value=replacement_info)

    provider_cls = provider_module.AioSandboxProvider
    monkeypatch.setattr(provider_cls, "_sandbox_id_for_thread", lambda _self, _thread_id: "sandbox-dead")
    monkeypatch.setattr(provider_cls, "_get_extra_mounts", lambda _self, _thread_id: [])
    monkeypatch.setattr(provider_module, "get_paths", lambda: Paths(base_dir=tmp_path))
    monkeypatch.setattr(provider_module, "get_effective_user_id", lambda: None)
    monkeypatch.setattr(provider_module, "wait_for_sandbox_ready", lambda _url, timeout=60: True)

    acquired_id = provider.acquire("thread-dead")

    assert acquired_id == "sandbox-dead"
    stale_sandbox.close.assert_called_once_with()
    provider._backend.destroy.assert_called_once()
    provider._backend.create.assert_called_once()
    assert provider._thread_sandboxes["thread-dead"] == "sandbox-dead"
    assert provider._sandboxes["sandbox-dead"].base_url == "http://fresh-sandbox"
def test_acquire_keeps_cached_sandbox_when_health_check_errors(tmp_path):
    """If the health check raises, acquire() must reuse the tracked sandbox untouched."""
    provider, tracked_sandbox, _unused = _make_provider_with_active_sandbox(tmp_path, "sandbox-transient")
    provider._thread_locks = {}
    provider._thread_sandboxes = {"thread-transient": "sandbox-transient"}

    # The check raises instead of answering True/False.
    failing_check = MagicMock(side_effect=OSError("docker daemon busy"))
    provider._backend.is_alive = failing_check

    acquired_id = provider.acquire("thread-transient")

    assert acquired_id == "sandbox-transient"
    tracked_sandbox.close.assert_not_called()
    provider._backend.destroy.assert_not_called()
    assert provider._sandboxes["sandbox-transient"] is tracked_sandbox
def test_drop_unhealthy_sandbox_skips_recreated_entry(tmp_path):
    """_drop_unhealthy_sandbox() must do nothing when the registered info differs from ``expected_info``."""
    provider_module = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")

    # Same sandbox id, two distinct info objects: the checked one and the one registered since.
    stale_info = provider_module.SandboxInfo(sandbox_id="sandbox-toctou", sandbox_url="http://old-sandbox")
    current_info = provider_module.SandboxInfo(sandbox_id="sandbox-toctou", sandbox_url="http://new-sandbox")
    current_sandbox = MagicMock()

    provider = _make_provider(tmp_path)
    provider._lock = provider_module.threading.Lock()
    provider._warm_pool = {}
    provider._last_activity = {"sandbox-toctou": 1.0}
    provider._thread_sandboxes = {"thread-toctou": "sandbox-toctou"}
    provider._sandbox_infos = {"sandbox-toctou": current_info}
    provider._sandboxes = {"sandbox-toctou": current_sandbox}
    provider._backend = SimpleNamespace(destroy=MagicMock())

    provider._drop_unhealthy_sandbox("sandbox-toctou", "stale health check", expected_info=stale_info)

    current_sandbox.close.assert_not_called()
    provider._backend.destroy.assert_not_called()
    assert provider._sandbox_infos["sandbox-toctou"] is current_info
    assert provider._sandboxes["sandbox-toctou"] is current_sandbox
    assert provider._thread_sandboxes == {"thread-toctou": "sandbox-toctou"}
def test_acquire_skips_dead_warm_pool_sandbox(tmp_path, monkeypatch):
    """acquire() must destroy a warm-pool entry reported dead and create a fresh sandbox instead."""
    provider_module = importlib.import_module("deerflow.community.aio_sandbox.aio_sandbox_provider")

    # Warm-pool entry and its replacement share an id and container name; only the URL differs.
    pooled_info = provider_module.SandboxInfo(
        sandbox_id="sandbox-warm-dead",
        sandbox_url="http://stale-sandbox",
        container_name="deer-flow-sandbox-sandbox-warm-dead",
    )
    replacement_info = provider_module.SandboxInfo(
        sandbox_id="sandbox-warm-dead",
        sandbox_url="http://fresh-sandbox",
        container_name="deer-flow-sandbox-sandbox-warm-dead",
    )

    provider = _make_provider(tmp_path)
    provider._lock = provider_module.threading.Lock()
    provider._thread_locks = {}
    provider._sandboxes = {}
    provider._sandbox_infos = {}
    provider._thread_sandboxes = {}
    provider._last_activity = {}
    provider._warm_pool = {"sandbox-warm-dead": (pooled_info, 0.0)}
    provider._config = {"replicas": 3}
    provider._backend = SimpleNamespace(
        is_alive=MagicMock(return_value=False),
        destroy=MagicMock(),
        discover=MagicMock(return_value=None),
        create=MagicMock(return_value=replacement_info),
    )

    provider_cls = provider_module.AioSandboxProvider
    monkeypatch.setattr(provider_cls, "_sandbox_id_for_thread", lambda _self, _thread_id: "sandbox-warm-dead")
    monkeypatch.setattr(provider_cls, "_get_extra_mounts", lambda _self, _thread_id: [])
    monkeypatch.setattr(provider_module, "get_paths", lambda: Paths(base_dir=tmp_path))
    monkeypatch.setattr(provider_module, "get_effective_user_id", lambda: None)
    monkeypatch.setattr(provider_module, "wait_for_sandbox_ready", lambda _url, timeout=60: True)

    acquired_id = provider.acquire("thread-warm-dead")

    assert acquired_id == "sandbox-warm-dead"
    provider._backend.destroy.assert_called_once()
    provider._backend.create.assert_called_once()
    assert provider._warm_pool == {}
    assert provider._thread_sandboxes["thread-warm-dead"] == "sandbox-warm-dead"
    assert provider._sandboxes["sandbox-warm-dead"].base_url == "http://fresh-sandbox"
def test_destroy_swallows_close_errors_and_still_destroys_backend(tmp_path, caplog):
"""A failure in sandbox.close() must not skip backend container destruction."""
provider, sandbox, _ = _make_provider_with_active_sandbox(tmp_path, "sandbox-dest-err")
+26 -2
View File
@@ -257,14 +257,38 @@ def test_provisioner_is_alive_true_only_when_status_running(monkeypatch):
assert backend._provisioner_is_alive("abc123") is False
def test_provisioner_is_alive_returns_false_on_404(monkeypatch):
    """An HTTP 404 from the provisioner is reported as "not alive" (False)."""
    backend = RemoteSandboxBackend("http://provisioner:8002")

    def mock_get(url: str, timeout: int):
        # Stand-in for requests.get that always answers 404.
        return _StubResponse(status_code=404)

    monkeypatch.setattr(requests, "get", mock_get)

    assert backend._provisioner_is_alive("abc123") is False
def test_provisioner_is_alive_raises_on_request_exception(monkeypatch):
    """A ``requests.RequestException`` during the health check must surface as RuntimeError."""
    backend = RemoteSandboxBackend("http://provisioner:8002")

    def mock_get(url: str, timeout: int):
        # Stand-in for requests.get that fails at the transport level.
        raise requests.RequestException("boom")

    monkeypatch.setattr(requests, "get", mock_get)

    # The only expectation is the raise; no bare call may precede this block,
    # or its exception would escape before pytest.raises can capture it.
    with pytest.raises(RuntimeError, match="Provisioner health check failed for abc123"):
        backend._provisioner_is_alive("abc123")
def test_provisioner_is_alive_raises_on_server_error(monkeypatch):
    """An HTTP 503 from the provisioner must raise RuntimeError carrying the status and body text."""
    backend = RemoteSandboxBackend("http://provisioner:8002")

    def mock_get(url: str, timeout: int):
        # Stand-in for requests.get that answers 503 with a short body.
        unavailable = _StubResponse(status_code=503)
        unavailable.text = "unavailable"
        return unavailable

    monkeypatch.setattr(requests, "get", mock_get)

    expected_message = "HTTP 503 unavailable"
    with pytest.raises(RuntimeError, match=expected_message):
        backend._provisioner_is_alive("abc123")
def test_discover_delegates_to_provisioner_discover(monkeypatch):