fix: hide summarization LLM output from frontend during context compression

Fixes #2804

  The useStream hook tracks messages-tuple mode which captures ALL LLM
  events, including the internal summarization LLM call. This caused the
  English summary text to briefly appear as an AI message in the UI before
  the REMOVE_ALL_MESSAGES state update replaced it.

  Fix by overriding _create_summary/_acreate_summary to pass callbacks=[]
  when invoking the summary model, preventing LangGraph from forwarding
  the internal LLM events to the frontend stream. Also add
  hide_from_ui=True to the summary HumanMessage's additional_kwargs as a
  belt-and-suspenders safety net alongside the existing name=summary
  check.
This commit is contained in:
Willem Jiang
2026-05-11 22:19:47 +08:00
parent 2b5bece744
commit c995c3a394
2 changed files with 125 additions and 1 deletions
@@ -10,6 +10,7 @@ from typing import Any, Protocol, override, runtime_checkable
from langchain.agents import AgentState
from langchain.agents.middleware import SummarizationMiddleware
from langchain_core.messages import AIMessage, AnyMessage, HumanMessage, RemoveMessage, ToolMessage
from langchain_core.messages.utils import get_buffer_string
from langgraph.config import get_config
from langgraph.graph.message import REMOVE_ALL_MESSAGES
from langgraph.runtime import Runtime
@@ -175,12 +176,76 @@ class DeerFlowSummarizationMiddleware(SummarizationMiddleware):
]
}
@override
def _create_summary(self, messages_to_summarize: list[AnyMessage]) -> str:
"""Generate summary without emitting streaming events to the client.
Suppresses callbacks to prevent the internal summarization LLM call from
producing visible AI message chunks in the frontend's ``messages-tuple``
stream (issue #2804).
"""
if not messages_to_summarize:
return "No previous conversation history."
trimmed = self._trim_messages_for_summary(messages_to_summarize)
if not trimmed:
return "Previous conversation was too long to summarize."
formatted = get_buffer_string(trimmed)
try:
response = self.model.invoke(
self.summary_prompt.format(messages=formatted).rstrip(),
config={
"metadata": {"lc_source": "summarization"},
"callbacks": [],
},
)
return response.text.strip()
except Exception as e:
return f"Error generating summary: {e!s}"
@override
async def _acreate_summary(self, messages_to_summarize: list[AnyMessage]) -> str:
"""Generate summary without emitting streaming events to the client.
Suppresses callbacks to prevent the internal summarization LLM call from
producing visible AI message chunks in the frontend's ``messages-tuple``
stream (issue #2804).
"""
if not messages_to_summarize:
return "No previous conversation history."
trimmed = self._trim_messages_for_summary(messages_to_summarize)
if not trimmed:
return "Previous conversation was too long to summarize."
formatted = get_buffer_string(trimmed)
try:
response = await self.model.ainvoke(
self.summary_prompt.format(messages=formatted).rstrip(),
config={
"metadata": {"lc_source": "summarization"},
"callbacks": [],
},
)
return response.text.strip()
except Exception as e:
return f"Error generating summary: {e!s}"
@override
def _build_new_messages(self, summary: str) -> list[HumanMessage]:
    """Wrap the summary in a single human message that the frontend should hide.

    Overrides the base implementation so that the message carries the special
    name ``"summary"`` and ``hide_from_ui=True`` in ``additional_kwargs``.
    The message is meant to be skipped when rendering the conversation in the
    frontend while still being available to the model as context.

    Args:
        summary: Summary text to embed in the message content.

    Returns:
        A one-element list containing the summary ``HumanMessage``.
    """
    # Two markers are set on purpose: ``name`` is the existing frontend check,
    # and ``hide_from_ui`` is an additional safeguard alongside it.
    return [
        HumanMessage(
            content=f"Here is a summary of the conversation to date:\n\n{summary}",
            name="summary",
            additional_kwargs={"hide_from_ui": True},
        )
    ]
def _preserve_dynamic_context_reminders(
self,