Merge branch 'main' into rayhpeng/persistence-scaffold

# Conflicts:
#	.env.example
#	backend/packages/harness/deerflow/agents/middlewares/title_middleware.py
2026-05-21 23:46:50 +00:00 · 2026-04-04 21:28:07 +08:00
parent b55a9c8d28 2a150f5d4a
commit 4737fc3aa9
180 changed files with 10945 additions and 787 deletions
@@ -123,6 +123,11 @@ async def run_agent(
        # Inject runtime context so middlewares can access thread_id
        # (langgraph-cli does this automatically; we must do it manually)
        runtime = Runtime(context={"thread_id": thread_id}, store=store)
+        # If the caller already set a ``context`` key (LangGraph >= 0.6.0
+        # prefers it over ``configurable`` for thread-level data), make
+        # sure ``thread_id`` is available there too.
+        if "context" in config and isinstance(config["context"], dict):
+            config["context"].setdefault("thread_id", thread_id)
        config.setdefault("configurable", {})["__pregel_runtime"] = runtime

        # Inject RunJournal as a LangChain callback handler.
@@ -25,6 +25,7 @@ class MemoryStreamBridge(StreamBridge):
        self._maxsize = queue_maxsize
        self._queues: dict[str, asyncio.Queue[StreamEvent]] = {}
        self._counters: dict[str, int] = {}
+        self._dropped_counts: dict[str, int] = {}

    # -- helpers ---------------------------------------------------------------

@@ -32,6 +33,7 @@ class MemoryStreamBridge(StreamBridge):
        if run_id not in self._queues:
            self._queues[run_id] = asyncio.Queue(maxsize=self._maxsize)
            self._counters[run_id] = 0
+            self._dropped_counts[run_id] = 0
        return self._queues[run_id]

    def _next_id(self, run_id: str) -> str:
@@ -48,14 +50,41 @@ class MemoryStreamBridge(StreamBridge):
        try:
            await asyncio.wait_for(queue.put(entry), timeout=_PUBLISH_TIMEOUT)
        except TimeoutError:
-            logger.warning("Stream bridge queue full for run %s — dropping event %s", run_id, event)
+            self._dropped_counts[run_id] = self._dropped_counts.get(run_id, 0) + 1
+            logger.warning(
+                "Stream bridge queue full for run %s — dropping event %s (total dropped: %d)",
+                run_id,
+                event,
+                self._dropped_counts[run_id],
+            )

    async def publish_end(self, run_id: str) -> None:
        queue = self._get_or_create_queue(run_id)
-        try:
-            await asyncio.wait_for(queue.put(END_SENTINEL), timeout=_PUBLISH_TIMEOUT)
-        except TimeoutError:
-            logger.warning("Stream bridge queue full for run %s — dropping END sentinel", run_id)
+
+        # END sentinel is critical — it is the only signal that allows
+        # subscribers to terminate.  If the queue is full we evict the
+        # oldest queued entries (without inspecting them) until a slot is
+        # free rather than dropping END, which would cause the SSE
+        # connection to hang forever and leak the queue/counter resources
+        # for this run_id.  Evicted entries are only logged below; they are
+        # not added to ``_dropped_counts``.
+        if queue.full():
+            evicted = 0
+            while queue.full():
+                try:
+                    queue.get_nowait()
+                    evicted += 1
+                except asyncio.QueueEmpty:
+                    break  # pragma: no cover – defensive
+            if evicted:
+                logger.warning(
+                    "Stream bridge queue full for run %s — evicted %d event(s) to guarantee END sentinel delivery",
+                    run_id,
+                    evicted,
+                )
+
+        # After eviction the queue has at least one free slot, so a
+        # non-blocking put_nowait() would also succeed here.  We still use
+        # the awaitable put() (which waits until space is available) as a
+        # defensive measure.
+        await queue.put(END_SENTINEL)

    async def subscribe(
        self,
@@ -84,7 +113,18 @@ class MemoryStreamBridge(StreamBridge):
            await asyncio.sleep(delay)
        self._queues.pop(run_id, None)
        self._counters.pop(run_id, None)
+        self._dropped_counts.pop(run_id, None)

    async def close(self) -> None:
        self._queues.clear()
        self._counters.clear()
+        self._dropped_counts.clear()
+
+    def dropped_count(self, run_id: str) -> int:
+        """Return the number of events dropped for *run_id*."""
+        return self._dropped_counts.get(run_id, 0)
+
+    @property
+    def dropped_total(self) -> int:
+        """Return the total number of events dropped across all runs."""
+        return sum(self._dropped_counts.values())