mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-17 13:05:58 +00:00
fix(sandbox): create shell session before retrying on a fresh id (#3577)
* fix(sandbox): create shell session before retrying on a fresh id

  The AIO sandbox recovery path generated a UUID and passed it straight to exec_command(id=...). The sandbox image only auto-creates a session when exec_command is called with *no* id; an exec carrying an unknown id returns HTTP 404 "Session not found". So every ErrorObservation recovery itself 404'd, turning a transient session lapse into an unrecoverable tool error that looped the run up to the LangGraph recursion limit.

  Explicitly create_session(id=fresh_id) before targeting that id on retry. create_session is idempotent (returns the existing session if the id already exists), so this is safe under the serializing lock.

  Updated the regression test to assert the retry targets exactly the created session id rather than a fabricated, uncreated one.

* fix(sandbox): release the one-shot recovery session after retry

  The fresh session created on the ErrorObservation recovery path is used for exactly one command -- the next execute_command runs with no id and returns to the default session. Under persistent session corruption every command would create another session that is never reused or released, accumulating sessions on the container.

  Release it best-effort with cleanup_session() in a finally, swallowing any cleanup error so it never masks a successful retry.

  Addresses review feedback on #3577.

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
@@ -132,10 +132,22 @@ class AioSandbox(Sandbox):
|
|||||||
output = result.data.output if result.data else ""
|
output = result.data.output if result.data else ""
|
||||||
|
|
||||||
if output and _ERROR_OBSERVATION_SIGNATURE in output:
|
if output and _ERROR_OBSERVATION_SIGNATURE in output:
|
||||||
logger.warning("ErrorObservation detected in sandbox output, retrying with a fresh session")
|
logger.warning("ErrorObservation detected in sandbox output, retrying on a fresh session")
|
||||||
|
# exec_command only auto-creates a session when called with
|
||||||
|
# no id, so the recovery session must be created explicitly
|
||||||
|
# before we target it on retry.
|
||||||
fresh_id = str(uuid.uuid4())
|
fresh_id = str(uuid.uuid4())
|
||||||
result = self._client.shell.exec_command(command=command, id=fresh_id, no_change_timeout=self._DEFAULT_NO_CHANGE_TIMEOUT)
|
self._client.shell.create_session(id=fresh_id)
|
||||||
output = result.data.output if result.data else ""
|
try:
|
||||||
|
result = self._client.shell.exec_command(command=command, id=fresh_id, no_change_timeout=self._DEFAULT_NO_CHANGE_TIMEOUT)
|
||||||
|
output = result.data.output if result.data else ""
|
||||||
|
finally:
|
||||||
|
# Release the one-shot recovery session, best-effort, so
|
||||||
|
# repeated corruption can't accumulate sessions.
|
||||||
|
try:
|
||||||
|
self._client.shell.cleanup_session(fresh_id)
|
||||||
|
except Exception as cleanup_error:
|
||||||
|
logger.warning(f"Failed to release recovery session {fresh_id}: {cleanup_error}")
|
||||||
|
|
||||||
return output if output else "(no output)"
|
return output if output else "(no output)"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -79,23 +79,71 @@ class TestErrorObservationRetry:
|
|||||||
assert result == "success"
|
assert result == "success"
|
||||||
assert call_count == 2
|
assert call_count == 2
|
||||||
|
|
||||||
def test_retry_passes_fresh_session_id(self, sandbox):
|
def test_retry_creates_fresh_session_before_targeting_it(self, sandbox):
|
||||||
"""The retry call should include a new session id kwarg."""
|
"""Recovery must explicitly create a session, then exec against that id.
|
||||||
calls = []
|
|
||||||
|
The sandbox image only auto-creates a session when exec_command is
|
||||||
|
called with *no* id; an exec carrying an unknown id returns HTTP 404
|
||||||
|
"Session not found". So the retry must obtain a real, distinct session
|
||||||
|
via create_session() first and target that id, rather than fabricating
|
||||||
|
an id and handing it straight to exec_command (the regression that
|
||||||
|
404'd every recovery and looped runs to the recursion limit).
|
||||||
|
"""
|
||||||
|
exec_calls = []
|
||||||
|
created_ids = []
|
||||||
|
cleaned_ids = []
|
||||||
|
|
||||||
def mock_exec(command, **kwargs):
|
def mock_exec(command, **kwargs):
|
||||||
calls.append(kwargs)
|
exec_calls.append(kwargs)
|
||||||
if len(calls) == 1:
|
if len(exec_calls) == 1:
|
||||||
return SimpleNamespace(data=SimpleNamespace(output="'ErrorObservation' object has no attribute 'exit_code'"))
|
return SimpleNamespace(data=SimpleNamespace(output="'ErrorObservation' object has no attribute 'exit_code'"))
|
||||||
return SimpleNamespace(data=SimpleNamespace(output="ok"))
|
return SimpleNamespace(data=SimpleNamespace(output="ok"))
|
||||||
|
|
||||||
sandbox._client.shell.exec_command = mock_exec
|
def mock_create_session(id, **kwargs):
|
||||||
|
created_ids.append(id)
|
||||||
|
return SimpleNamespace(data=SimpleNamespace(session_id=id))
|
||||||
|
|
||||||
sandbox.execute_command("test")
|
def mock_cleanup_session(session_id, **kwargs):
|
||||||
assert len(calls) == 2
|
cleaned_ids.append(session_id)
|
||||||
assert "id" not in calls[0]
|
|
||||||
assert "id" in calls[1]
|
sandbox._client.shell.exec_command = mock_exec
|
||||||
assert len(calls[1]["id"]) == 36 # UUID format
|
sandbox._client.shell.create_session = mock_create_session
|
||||||
|
sandbox._client.shell.cleanup_session = mock_cleanup_session
|
||||||
|
|
||||||
|
result = sandbox.execute_command("test")
|
||||||
|
|
||||||
|
assert result == "ok"
|
||||||
|
assert len(exec_calls) == 2
|
||||||
|
# First attempt runs on the default session (no id).
|
||||||
|
assert "id" not in exec_calls[0]
|
||||||
|
# A fresh session was explicitly created...
|
||||||
|
assert len(created_ids) == 1
|
||||||
|
assert len(created_ids[0]) == 36 # UUID format
|
||||||
|
# ...and the retry targets exactly that created session, never an
|
||||||
|
# uncreated/fabricated id (which would 404).
|
||||||
|
assert exec_calls[1].get("id") == created_ids[0]
|
||||||
|
# ...and that one-shot recovery session is released afterwards so a
|
||||||
|
# sandbox that keeps hitting corruption doesn't accumulate sessions.
|
||||||
|
assert cleaned_ids == [created_ids[0]]
|
||||||
|
|
||||||
|
def test_cleanup_failure_does_not_mask_successful_retry(self, sandbox):
|
||||||
|
"""A failure releasing the recovery session must not lose the retry output."""
|
||||||
|
|
||||||
|
def mock_exec(command, **kwargs):
|
||||||
|
if "id" not in kwargs:
|
||||||
|
return SimpleNamespace(data=SimpleNamespace(output="'ErrorObservation' object has no attribute 'exit_code'"))
|
||||||
|
return SimpleNamespace(data=SimpleNamespace(output="recovered"))
|
||||||
|
|
||||||
|
def mock_cleanup_session(session_id, **kwargs):
|
||||||
|
raise RuntimeError("cleanup boom")
|
||||||
|
|
||||||
|
sandbox._client.shell.exec_command = mock_exec
|
||||||
|
sandbox._client.shell.create_session = lambda id, **kwargs: SimpleNamespace(data=SimpleNamespace(session_id=id))
|
||||||
|
sandbox._client.shell.cleanup_session = mock_cleanup_session
|
||||||
|
|
||||||
|
# The retry succeeded; the swallowed cleanup error must not turn this
|
||||||
|
# into an "Error: ..." result.
|
||||||
|
assert sandbox.execute_command("test") == "recovered"
|
||||||
|
|
||||||
def test_no_retry_on_clean_output(self, sandbox):
|
def test_no_retry_on_clean_output(self, sandbox):
|
||||||
"""Normal output should not trigger a retry."""
|
"""Normal output should not trigger a retry."""
|
||||||
|
|||||||
Reference in New Issue
Block a user