diff --git a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py
index 08780b3a1..e30421fed 100644
--- a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py
+++ b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox.py
@@ -132,10 +132,22 @@ class AioSandbox(Sandbox):
                 output = result.data.output if result.data else ""
 
                 if output and _ERROR_OBSERVATION_SIGNATURE in output:
-                    logger.warning("ErrorObservation detected in sandbox output, retrying with a fresh session")
+                    logger.warning("ErrorObservation detected in sandbox output, retrying on a fresh session")
+                    # exec_command only auto-creates a session when called with
+                    # no id, so the recovery session must be created explicitly
+                    # before we target it on retry.
                     fresh_id = str(uuid.uuid4())
-                    result = self._client.shell.exec_command(command=command, id=fresh_id, no_change_timeout=self._DEFAULT_NO_CHANGE_TIMEOUT)
-                    output = result.data.output if result.data else ""
+                    self._client.shell.create_session(id=fresh_id)
+                    try:
+                        result = self._client.shell.exec_command(command=command, id=fresh_id, no_change_timeout=self._DEFAULT_NO_CHANGE_TIMEOUT)
+                        output = result.data.output if result.data else ""
+                    finally:
+                        # Release the one-shot recovery session, best-effort, so
+                        # repeated corruption can't accumulate sessions.
+                        try:
+                            self._client.shell.cleanup_session(fresh_id)
+                        except Exception as cleanup_error:
+                            logger.warning(f"Failed to release recovery session {fresh_id}: {cleanup_error}")
 
                 return output if output else "(no output)"
             except Exception as e:
diff --git a/backend/tests/test_aio_sandbox.py b/backend/tests/test_aio_sandbox.py
index a37cff8a1..37b0ade06 100644
--- a/backend/tests/test_aio_sandbox.py
+++ b/backend/tests/test_aio_sandbox.py
@@ -79,23 +79,71 @@ class TestErrorObservationRetry:
         assert result == "success"
         assert call_count == 2
 
-    def test_retry_passes_fresh_session_id(self, sandbox):
-        """The retry call should include a new session id kwarg."""
-        calls = []
+    def test_retry_creates_fresh_session_before_targeting_it(self, sandbox):
+        """Recovery must explicitly create a session, then exec against that id.
+
+        The sandbox image only auto-creates a session when exec_command is
+        called with *no* id; an exec carrying an unknown id returns HTTP 404
+        "Session not found". So the retry must obtain a real, distinct session
+        via create_session() first and target that id, rather than fabricating
+        an id and handing it straight to exec_command (the regression that
+        404'd every recovery and looped runs to the recursion limit).
+        """
+        exec_calls = []
+        created_ids = []
+        cleaned_ids = []
 
         def mock_exec(command, **kwargs):
-            calls.append(kwargs)
-            if len(calls) == 1:
+            exec_calls.append(kwargs)
+            if len(exec_calls) == 1:
                 return SimpleNamespace(data=SimpleNamespace(output="'ErrorObservation' object has no attribute 'exit_code'"))
             return SimpleNamespace(data=SimpleNamespace(output="ok"))
 
-        sandbox._client.shell.exec_command = mock_exec
+        def mock_create_session(id, **kwargs):
+            created_ids.append(id)
+            return SimpleNamespace(data=SimpleNamespace(session_id=id))
 
-        sandbox.execute_command("test")
-        assert len(calls) == 2
-        assert "id" not in calls[0]
-        assert "id" in calls[1]
-        assert len(calls[1]["id"]) == 36  # UUID format
+        def mock_cleanup_session(session_id, **kwargs):
+            cleaned_ids.append(session_id)
+
+        sandbox._client.shell.exec_command = mock_exec
+        sandbox._client.shell.create_session = mock_create_session
+        sandbox._client.shell.cleanup_session = mock_cleanup_session
+
+        result = sandbox.execute_command("test")
+
+        assert result == "ok"
+        assert len(exec_calls) == 2
+        # First attempt runs on the default session (no id).
+        assert "id" not in exec_calls[0]
+        # A fresh session was explicitly created...
+        assert len(created_ids) == 1
+        assert len(created_ids[0]) == 36  # UUID format
+        # ...and the retry targets exactly that created session, never an
+        # uncreated/fabricated id (which would 404).
+        assert exec_calls[1].get("id") == created_ids[0]
+        # ...and that one-shot recovery session is released afterwards so a
+        # sandbox that keeps hitting corruption doesn't accumulate sessions.
+        assert cleaned_ids == [created_ids[0]]
+
+    def test_cleanup_failure_does_not_mask_successful_retry(self, sandbox):
+        """A failure releasing the recovery session must not lose the retry output."""
+
+        def mock_exec(command, **kwargs):
+            if "id" not in kwargs:
+                return SimpleNamespace(data=SimpleNamespace(output="'ErrorObservation' object has no attribute 'exit_code'"))
+            return SimpleNamespace(data=SimpleNamespace(output="recovered"))
+
+        def mock_cleanup_session(session_id, **kwargs):
+            raise RuntimeError("cleanup boom")
+
+        sandbox._client.shell.exec_command = mock_exec
+        sandbox._client.shell.create_session = lambda id, **kwargs: SimpleNamespace(data=SimpleNamespace(session_id=id))
+        sandbox._client.shell.cleanup_session = mock_cleanup_session
+
+        # The retry succeeded; the swallowed cleanup error must not turn this
+        # into an "Error: ..." result.
+        assert sandbox.execute_command("test") == "recovered"
 
     def test_no_retry_on_clean_output(self, sandbox):
         """Normal output should not trigger a retry."""