fix(summarization): tag summary LLM calls nostream to stop phantom stream messages (#2503) (#3378)

* fix(summarization): tag summary LLM calls nostream to stop phantom stream messages (#2503) The SummarizationMiddleware runs its summary LLM call inside a before_model hook. Without a nostream tag the summary tokens were captured by LangGraph's messages-tuple stream callback and broadcast to the frontend as a phantom AI message. Generate a dedicated summary model copy tagged with "nostream" (merged on top of any existing tags such as "middleware:summarize" so RunJournal attribution is preserved) and override _create_summary / _acreate_summary to invoke it directly. This avoids temporarily swapping the shared self.model, which would otherwise leak the RunnableBinding across concurrent runs and break parent logic that inspects the raw model (profile / _get_ls_params). Add regression tests covering nostream tagging, concurrent-run isolation, raw model preservation, and existing-tag merge. * fix(summarization): address nostream review feedback
2026-06-10 17:35:57 +00:00 · 2026-06-07 17:55:04 +08:00
parent 88e36d9686
commit d133b1119a
2 changed files with 185 additions and 1 deletions
@@ -9,6 +9,7 @@ from langchain.agents import create_agent
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import AIMessage, HumanMessage, RemoveMessage, ToolMessage
 from langchain_core.outputs import ChatGeneration, ChatResult
+from langgraph.constants import TAG_NOSTREAM

 from deerflow.agents.memory.summarization_hook import memory_flush_hook
 from deerflow.agents.middlewares.dynamic_context_middleware import _DYNAMIC_CONTEXT_REMINDER_KEY, DynamicContextMiddleware
@@ -160,6 +161,120 @@ def test_summarization_middleware_emits_frontend_update_key_in_agent_stream() ->
    assert emitted[1].content == ("Here is a summary of the conversation to date:\n\ncompressed summary")


+def test_summary_model_is_tagged_nostream_to_avoid_stream_pollution() -> None:
+    tags_during_summary: list[list[str]] = []
+
+    class _RecordingChatModel(_StaticChatModel):
+        def _generate(self, messages, stop=None, run_manager=None, **kwargs):
+            tags_during_summary.append(list(run_manager.tags) if run_manager else [])
+            return super()._generate(messages, stop=stop, run_manager=run_manager, **kwargs)
+
+    model = _RecordingChatModel(text="compressed summary")
+    middleware = DeerFlowSummarizationMiddleware(
+        model=model,
+        trigger=("messages", 4),
+        keep=("messages", 2),
+        token_counter=len,
+    )
+
+    # The dedicated summary model must carry TAG_NOSTREAM so LangGraph's
+    # messages-tuple stream handler skips its tokens, while the raw model used by
+    # the parent for profile / token inspection stays untagged.
+    assert TAG_NOSTREAM in (middleware._summary_model.config.get("tags") or [])
+    assert TAG_NOSTREAM not in (getattr(middleware.model, "config", {}).get("tags") or [])
+
+    result = middleware.before_model({"messages": _messages()}, _runtime())
+
+    # The summary LLM call must actually run with the nostream tag (this is what the
+    # stream handler inspects), and the shared self.model must remain the raw,
+    # untagged model so parent logic (profile / _get_ls_params) keeps working.
+    assert tags_during_summary == [[TAG_NOSTREAM]]
+    assert middleware.model is model
+    assert result["messages"][1].content.startswith("Here is a summary")
+
+
+def test_summarization_does_not_mutate_shared_model_across_concurrent_runs() -> None:
+    """Concurrent runs must not observe a swapped-out self.model during summarization.
+
+    The agent/middleware instance is cached and reused, so summarization must never
+    temporarily replace the shared self.model: doing so would leak the nostream
+    RunnableBinding to other coroutines mid-flight and break parent logic that
+    inspects the raw model (profile / _get_ls_params).
+    """
+    import asyncio
+
+    observed_models: list[object] = []
+    started = asyncio.Event()
+    release = asyncio.Event()
+
+    class _BlockingChatModel(_StaticChatModel):
+        async def _agenerate(self, messages, stop=None, run_manager=None, **kwargs):
+            # Hold the summary call open so a concurrent run can inspect self.model.
+            started.set()
+            await release.wait()
+            return self._generate(messages, stop=stop, run_manager=run_manager, **kwargs)
+
+    model = _BlockingChatModel(text="compressed summary")
+    middleware = DeerFlowSummarizationMiddleware(
+        model=model,
+        trigger=("messages", 4),
+        keep=("messages", 2),
+        token_counter=len,
+    )
+
+    async def _run() -> None:
+        summarizing = asyncio.create_task(middleware.abefore_model({"messages": _messages()}, _runtime()))
+        # Wait until the summary task reaches the blocked LLM call.
+        await started.wait()
+        # A concurrent run reads the shared model while summarization is in flight.
+        observed_models.append(middleware.model)
+        release.set()
+        await summarizing
+
+    asyncio.run(_run())
+
+    assert observed_models == [model]
+
+
+def test_raw_model_is_preserved_for_parent_profile_inspection() -> None:
+    """self.model must stay the original model so attribute access does not drift."""
+    model = _StaticChatModel(text="compressed summary")
+    middleware = DeerFlowSummarizationMiddleware(
+        model=model,
+        trigger=("messages", 4),
+        keep=("messages", 2),
+        token_counter=len,
+    )
+
+    middleware.before_model({"messages": _messages()}, _runtime())
+
+    # The shared field is never reassigned to the RunnableBinding.
+    assert middleware.model is model
+    assert middleware._summary_model is not model
+
+
+def test_summary_model_preserves_existing_tags_when_adding_nostream() -> None:
+    """Adding TAG_NOSTREAM must not clobber tags already bound on the model.
+
+    lead_agent/agent.py binds "middleware:summarize" for RunJournal attribution. Because
+    RunnableBinding.with_config shallow-merges config, the summary model must explicitly
+    preserve existing tags instead of overwriting them with just [TAG_NOSTREAM].
+    """
+    tagged_model = _StaticChatModel(text="compressed summary").with_config(tags=["middleware:summarize"])
+    middleware = DeerFlowSummarizationMiddleware(
+        model=tagged_model,
+        trigger=("messages", 4),
+        keep=("messages", 2),
+        token_counter=len,
+    )
+
+    summary_tags = middleware._summary_model.config.get("tags") or []
+    assert "middleware:summarize" in summary_tags
+    assert TAG_NOSTREAM in summary_tags
+    # No duplicate TAG_NOSTREAM even if invoked when one was already present.
+    assert summary_tags.count(TAG_NOSTREAM) == 1
+
+
 def test_dynamic_context_reminder_is_preserved_across_summarization() -> None:
    captured: list[SummarizationEvent] = []
    middleware = _middleware(before_summarization=[captured.append])