fix(runs): restore historical runs from persistent store after gateway restart (#2989)

* fix(runs): restore historical runs from persistent store after gateway restart

  RunManager.list_by_thread() and get() only queried the in-memory _runs
  dict, returning empty results after a restart even when PostgreSQL had
  the records. Add store fallback to both read paths and a new async
  aget() for the API endpoint, keeping sync get() for internal callers
  that need live task/abort_event state.

    Fixes #2984

* Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>

* fix(runs): scope run store fallback reads by user id

Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/e73daada-1215-4bc1-ab7d-7117826c5013

Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com>

* test(runs): clarify ordering expectation and mock store filters

Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/e73daada-1215-4bc1-ab7d-7117826c5013

Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com>

* test(runs): make user filter fallback assertions explicit

Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/e73daada-1215-4bc1-ab7d-7117826c5013

Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com>

* test(runs): verify user-isolated fallback behavior with memory store

Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/e73daada-1215-4bc1-ab7d-7117826c5013

Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com>

* update the code with feedback from issue-2984

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
Willem Jiang
2026-05-17 20:03:21 +08:00
committed by GitHub
parent e74e126ed3
commit 39f901d3a5
5 changed files with 226 additions and 11 deletions
@@ -111,15 +111,60 @@ class RunManager:
return record
def get(self, run_id: str) -> RunRecord | None:
"""Return a run record by ID, or ``None``."""
"""Return an in-memory run record by ID, or ``None``."""
return self._runs.get(run_id)
async def list_by_thread(self, thread_id: str) -> list[RunRecord]:
"""Return all runs for a given thread, newest first."""
async def aget(self, run_id: str, *, user_id: str | None = None) -> RunRecord | None:
"""Return a run record by ID, checking the persistent store as fallback."""
record = self._runs.get(run_id)
if record is not None:
return record
if self._store is not None:
try:
d = await self._store.get(run_id, user_id=user_id)
if d is not None:
return self._store_dict_to_record(d)
except Exception:
logger.warning("Failed to query store for run %s", run_id, exc_info=True)
return None
def _store_dict_to_record(self, d: dict) -> RunRecord:
"""Convert a store dict back to a RunRecord for read-only use."""
return RunRecord(
run_id=d["run_id"],
thread_id=d["thread_id"],
assistant_id=d.get("assistant_id"),
status=RunStatus(d.get("status", RunStatus.error.value)),
on_disconnect=DisconnectMode.cancel,
multitask_strategy=d.get("multitask_strategy", "reject"),
metadata=d.get("metadata", {}),
kwargs=d.get("kwargs", {}),
created_at=d.get("created_at", ""),
updated_at=d.get("updated_at", ""),
model_name=d.get("model_name"),
error=d.get("error"),
)
async def list_by_thread(self, thread_id: str, *, user_id: str | None = None) -> list[RunRecord]:
"""Return all runs for a given thread, oldest first."""
async with self._lock:
# Dict insertion order matches creation order, so reversing it gives
# us deterministic newest-first results even when timestamps tie.
return [r for r in self._runs.values() if r.thread_id == thread_id]
in_memory = [r for r in self._runs.values() if r.thread_id == thread_id]
in_memory_ids = {r.run_id for r in in_memory}
store_records: list[RunRecord] = []
if self._store is not None:
try:
store_dicts = await self._store.list_by_thread(thread_id, user_id=user_id)
for d in store_dicts:
if d["run_id"] not in in_memory_ids:
store_records.append(self._store_dict_to_record(d))
except Exception:
logger.warning("Failed to query store for thread %s runs", thread_id, exc_info=True)
return sorted(
in_memory + store_records,
key=lambda record: record.created_at or "",
)
async def set_status(self, run_id: str, status: RunStatus, *, error: str | None = None) -> None:
"""Transition a run to a new status."""
@@ -34,7 +34,12 @@ class RunStore(abc.ABC):
pass
@abc.abstractmethod
async def get(self, run_id: str) -> dict[str, Any] | None:
async def get(
self,
run_id: str,
*,
user_id: str | None = None,
) -> dict[str, Any] | None:
pass
@abc.abstractmethod
@@ -46,8 +46,13 @@ class MemoryRunStore(RunStore):
"updated_at": now,
}
async def get(self, run_id):
return self._runs.get(run_id)
async def get(self, run_id, *, user_id=None):
run = self._runs.get(run_id)
if run is None:
return None
if user_id is not None and run.get("user_id") != user_id:
return None
return run
async def list_by_thread(self, thread_id, *, user_id=None, limit=100):
results = [r for r in self._runs.values() if r["thread_id"] == thread_id and (user_id is None or r.get("user_id") == user_id)]