fix(summarization): tag summary LLM calls nostream to stop phantom stream messages (#2503) (#3378)

* fix(summarization): tag summary LLM calls nostream to stop phantom stream messages (#2503)

The SummarizationMiddleware runs its summary LLM call inside a before_model
hook. Without a nostream tag the summary tokens were captured by LangGraph's
messages-tuple stream callback and broadcast to the frontend as a phantom AI
message.

Generate a dedicated summary model copy tagged with "nostream" (merged on top
of any existing tags such as "middleware:summarize" so RunJournal attribution
is preserved) and override _create_summary / _acreate_summary to invoke it
directly. This avoids temporarily swapping the shared self.model, which would
otherwise leak the RunnableBinding across concurrent runs and break parent
logic that inspects the raw model (profile / _get_ls_params).

Add regression tests covering nostream tagging, concurrent-run isolation, raw
model preservation, and existing-tag merge.

* fix(summarization): address nostream review feedback
This commit is contained in:
Ryker_Feng
2026-06-07 17:55:04 +08:00
committed by GitHub
parent 88e36d9686
commit d133b1119a
2 changed files with 185 additions and 1 deletions
@@ -9,6 +9,7 @@ from langchain.agents import create_agent
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage, HumanMessage, RemoveMessage, ToolMessage
from langchain_core.outputs import ChatGeneration, ChatResult
from langgraph.constants import TAG_NOSTREAM
from deerflow.agents.memory.summarization_hook import memory_flush_hook
from deerflow.agents.middlewares.dynamic_context_middleware import _DYNAMIC_CONTEXT_REMINDER_KEY, DynamicContextMiddleware
@@ -160,6 +161,120 @@ def test_summarization_middleware_emits_frontend_update_key_in_agent_stream() ->
assert emitted[1].content == ("Here is a summary of the conversation to date:\n\ncompressed summary")
def test_summary_model_is_tagged_nostream_to_avoid_stream_pollution() -> None:
tags_during_summary: list[list[str]] = []
class _RecordingChatModel(_StaticChatModel):
def _generate(self, messages, stop=None, run_manager=None, **kwargs):
tags_during_summary.append(list(run_manager.tags) if run_manager else [])
return super()._generate(messages, stop=stop, run_manager=run_manager, **kwargs)
model = _RecordingChatModel(text="compressed summary")
middleware = DeerFlowSummarizationMiddleware(
model=model,
trigger=("messages", 4),
keep=("messages", 2),
token_counter=len,
)
# The dedicated summary model must carry TAG_NOSTREAM so LangGraph's
# messages-tuple stream handler skips its tokens, while the raw model used by
# the parent for profile / token inspection stays untagged.
assert TAG_NOSTREAM in (middleware._summary_model.config.get("tags") or [])
assert TAG_NOSTREAM not in (getattr(middleware.model, "config", {}).get("tags") or [])
result = middleware.before_model({"messages": _messages()}, _runtime())
# The summary LLM call must actually run with the nostream tag (this is what the
# stream handler inspects), and the shared self.model must remain the raw,
# untagged model so parent logic (profile / _get_ls_params) keeps working.
assert tags_during_summary == [[TAG_NOSTREAM]]
assert middleware.model is model
assert result["messages"][1].content.startswith("Here is a summary")
def test_summarization_does_not_mutate_shared_model_across_concurrent_runs() -> None:
"""Concurrent runs must not observe a swapped-out self.model during summarization.
The agent/middleware instance is cached and reused, so summarization must never
temporarily replace the shared self.model: doing so would leak the nostream
RunnableBinding to other coroutines mid-flight and break parent logic that
inspects the raw model (profile / _get_ls_params).
"""
import asyncio
observed_models: list[object] = []
started = asyncio.Event()
release = asyncio.Event()
class _BlockingChatModel(_StaticChatModel):
async def _agenerate(self, messages, stop=None, run_manager=None, **kwargs):
# Hold the summary call open so a concurrent run can inspect self.model.
started.set()
await release.wait()
return self._generate(messages, stop=stop, run_manager=run_manager, **kwargs)
model = _BlockingChatModel(text="compressed summary")
middleware = DeerFlowSummarizationMiddleware(
model=model,
trigger=("messages", 4),
keep=("messages", 2),
token_counter=len,
)
async def _run() -> None:
summarizing = asyncio.create_task(middleware.abefore_model({"messages": _messages()}, _runtime()))
# Wait until the summary task reaches the blocked LLM call.
await started.wait()
# A concurrent run reads the shared model while summarization is in flight.
observed_models.append(middleware.model)
release.set()
await summarizing
asyncio.run(_run())
assert observed_models == [model]
def test_raw_model_is_preserved_for_parent_profile_inspection() -> None:
"""self.model must stay the original model so attribute access does not drift."""
model = _StaticChatModel(text="compressed summary")
middleware = DeerFlowSummarizationMiddleware(
model=model,
trigger=("messages", 4),
keep=("messages", 2),
token_counter=len,
)
middleware.before_model({"messages": _messages()}, _runtime())
# The shared field is never reassigned to the RunnableBinding.
assert middleware.model is model
assert middleware._summary_model is not model
def test_summary_model_preserves_existing_tags_when_adding_nostream() -> None:
"""Adding TAG_NOSTREAM must not clobber tags already bound on the model.
lead_agent/agent.py binds "middleware:summarize" for RunJournal attribution. Because
RunnableBinding.with_config shallow-merges config, the summary model must explicitly
preserve existing tags instead of overwriting them with just [TAG_NOSTREAM].
"""
tagged_model = _StaticChatModel(text="compressed summary").with_config(tags=["middleware:summarize"])
middleware = DeerFlowSummarizationMiddleware(
model=tagged_model,
trigger=("messages", 4),
keep=("messages", 2),
token_counter=len,
)
summary_tags = middleware._summary_model.config.get("tags") or []
assert "middleware:summarize" in summary_tags
assert TAG_NOSTREAM in summary_tags
# No duplicate TAG_NOSTREAM even if invoked when one was already present.
assert summary_tags.count(TAG_NOSTREAM) == 1
def test_dynamic_context_reminder_is_preserved_across_summarization() -> None:
captured: list[SummarizationEvent] = []
middleware = _middleware(before_summarization=[captured.append])