mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-13 19:06:01 +00:00
094296440f
ViewImageMiddleware persists full base64 image payloads in hide_from_ui human messages inside checkpoints. All REST endpoints that returned serialize_channel_values(channel_values) sent these multi-megabyte payloads to the frontend, freezing the UI on threads with images. Add strip_data_url_image_blocks() to remove data:-scheme image_url content blocks from hide_from_ui messages, and serialize_channel_values_for_api() as a convenience wrapper used by all six affected call sites across threads, runs, and thread_runs routers. SSE streaming is unaffected (still uses serialize_channel_values). Fixes #3496
129 lines
4.8 KiB
Python
129 lines
4.8 KiB
Python
"""Canonical serialization for LangChain / LangGraph objects.
|
|
|
|
Provides a single source of truth for converting LangChain message
|
|
objects, Pydantic models, and LangGraph state dicts into plain
|
|
JSON-serialisable Python structures.
|
|
|
|
Consumers: ``deerflow.runtime.runs.worker`` (SSE publishing) and
|
|
``app.gateway.routers.threads`` (REST responses).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
|
|
def serialize_lc_object(obj: Any) -> Any:
    """Recursively convert *obj* into a JSON-serialisable structure.

    Conversion rules, tried in order:

    * ``None`` and ``str`` / ``int`` / ``float`` / ``bool`` are returned as-is.
    * ``dict`` values are serialized recursively (keys are kept unchanged).
    * ``list`` and ``tuple`` become a ``list`` of serialized items.
    * Objects exposing ``model_dump()`` (Pydantic v2) or ``dict()``
      (Pydantic v1 / older objects) are dumped, and the dump itself is
      serialized recursively so that nested values the dump leaves untouched
      (arbitrary objects, tuples, ...) are normalised as well.
    * Anything else falls back to ``str(obj)``, then ``repr(obj)``.

    A dump method that raises is skipped on purpose: serialization is
    best-effort and the next strategy is tried instead of failing.
    """
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {k: serialize_lc_object(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [serialize_lc_object(item) for item in obj]

    # Pydantic v2 first, then Pydantic v1 / older objects.
    for dump_name in ("model_dump", "dict"):
        dump = getattr(obj, dump_name, None)
        if dump is None:
            continue
        try:
            dumped = dump()
        except Exception:
            # Deliberate best-effort: fall through to the next strategy.
            continue
        # The dump may still hold values that are not JSON-serialisable,
        # so run it through the same normalisation as any other input.
        return serialize_lc_object(dumped)

    # Last resort
    try:
        return str(obj)
    except Exception:
        return repr(obj)
|
|
|
|
|
|
def serialize_channel_values(channel_values: dict[str, Any]) -> dict[str, Any]:
    """Return a serialized copy of *channel_values* without internal keys.

    Keys starting with ``__pregel_`` and the ``__interrupt__`` key are
    LangGraph internals; they are dropped to match what the LangGraph
    Platform API returns. Every remaining value is passed through
    :func:`serialize_lc_object`.
    """
    return {
        name: serialize_lc_object(payload)
        for name, payload in channel_values.items()
        if not (name.startswith("__pregel_") or name == "__interrupt__")
    }
|
|
|
|
|
|
def _is_data_url_image_block(block: Any) -> bool:
    """Tell whether *block* is an ``image_url`` block carrying a ``data:`` URL."""
    if not (isinstance(block, dict) and block.get("type") == "image_url"):
        return False
    image_url = block.get("image_url")
    if not isinstance(image_url, dict):
        return False
    return str(image_url.get("url", "")).startswith("data:")


def strip_data_url_image_blocks(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop inline (``data:``-scheme) images from *hide_from_ui* messages.

    Checkpoint-persisted messages are returned to the frontend by the history
    and run-wait endpoints. ``ViewImageMiddleware`` keeps full base64 image
    payloads in ``hide_from_ui`` human messages; that data is model context
    only and would just bloat the response body.

    A message is rewritten only when all of the following hold:

    * it is a ``dict`` whose ``additional_kwargs`` is a ``dict`` with
      ``hide_from_ui`` set to exactly ``True``;
    * its ``content`` is a ``list`` of blocks.

    From such a message, blocks of type ``image_url`` whose ``image_url`` is
    a ``dict`` with a ``url`` starting with ``data:`` are removed. Every
    other block (text, ``https://`` images, ...) is kept. Messages that do
    not qualify are returned as the very same object, so the number and order
    of messages never change. Input messages are not mutated; a rewritten
    message is a shallow copy with a new ``content`` list.
    """

    def _sanitize(message: Any) -> Any:
        if not isinstance(message, dict):
            return message

        # Only touch messages explicitly flagged as hidden from the UI.
        extra = message.get("additional_kwargs")
        if not isinstance(extra, dict) or extra.get("hide_from_ui") is not True:
            return message

        blocks = message.get("content")
        if not isinstance(blocks, list):
            return message

        kept = [block for block in blocks if not _is_data_url_image_block(block)]
        return {**message, "content": kept}

    return [_sanitize(message) for message in messages]
|
|
|
|
|
|
def serialize_channel_values_for_api(channel_values: dict[str, Any]) -> dict[str, Any]:
    """Serialize channel values for a REST response, minus inline image data.

    Runs :func:`serialize_channel_values` and, when the serialized result has
    a ``messages`` list, filters it through
    :func:`strip_data_url_image_blocks`. Intended for REST endpoints that
    return channel values to the frontend, so that ``data:``-scheme base64
    image payloads are never sent over the wire.
    """
    serialized = serialize_channel_values(channel_values)
    messages = serialized.get("messages")
    if isinstance(messages, list):
        serialized["messages"] = strip_data_url_image_blocks(messages)
    return serialized
|
|
|
|
|
|
def serialize_messages_tuple(obj: Any) -> Any:
    """Serialize a messages-mode item, normally a ``(chunk, metadata)`` tuple.

    A 2-tuple is returned as the list ``[serialized_chunk, metadata]``, where
    metadata is passed through unchanged if it is a ``dict`` and replaced by
    ``{}`` otherwise. Any other input is handed to
    :func:`serialize_lc_object` as-is.
    """
    is_pair = isinstance(obj, tuple) and len(obj) == 2
    if not is_pair:
        return serialize_lc_object(obj)

    chunk, metadata = obj
    serialized_chunk = serialize_lc_object(chunk)
    if not isinstance(metadata, dict):
        metadata = {}
    return [serialized_chunk, metadata]
|
|
|
|
|
|
def serialize(obj: Any, *, mode: str = "") -> Any:
    """Serialize *obj* according to the stream *mode* it came from.

    * ``messages`` — *obj* is a ``(message_chunk, metadata_dict)`` tuple and
      is handled by :func:`serialize_messages_tuple`.
    * ``values`` — *obj* is the full state dict and is handled by
      :func:`serialize_channel_values`, which strips ``__pregel_*`` keys and
      ``__interrupt__``; a non-dict *obj* takes the generic path.
    * any other mode — generic recursive :func:`serialize_lc_object`.
    """
    if mode == "messages":
        return serialize_messages_tuple(obj)
    if mode == "values" and isinstance(obj, dict):
        return serialize_channel_values(obj)
    return serialize_lc_object(obj)
|