Files
deer-flow/backend/packages/harness/deerflow/runtime/serialization.py
T
hataa 094296440f fix(history): strip base64 image data from REST endpoint responses (#3535)
ViewImageMiddleware persists full base64 image payloads in hide_from_ui
human messages inside checkpoints. All REST endpoints that returned
serialize_channel_values(channel_values) sent these multi-megabyte
payloads to the frontend, freezing the UI on threads with images.

Add strip_data_url_image_blocks() to remove data:-scheme image_url
content blocks from hide_from_ui messages, and
serialize_channel_values_for_api() as a convenience wrapper used by all
six affected call sites across threads, runs, and thread_runs routers.
SSE streaming is unaffected (still uses serialize_channel_values).

Fixes #3496
2026-06-13 08:58:19 +08:00

129 lines
4.8 KiB
Python

"""Canonical serialization for LangChain / LangGraph objects.
Provides a single source of truth for converting LangChain message
objects, Pydantic models, and LangGraph state dicts into plain
JSON-serialisable Python structures.
Consumers: ``deerflow.runtime.runs.worker`` (SSE publishing) and
``app.gateway.routers.threads`` (REST responses).
"""
from __future__ import annotations
from typing import Any
def serialize_lc_object(obj: Any) -> Any:
    """Convert *obj* into plain, JSON-friendly Python data.

    ``None`` and scalar values (``str``, ``int``, ``float``, ``bool``) are
    returned as-is. Dicts are rebuilt with every value converted; lists and
    tuples both come back as lists with every item converted. Any other
    object is dumped through ``model_dump()`` (Pydantic v2) or ``dict()``
    (Pydantic v1 / older objects) when it exposes one, and is otherwise
    reduced to its ``str()`` form, or ``repr()`` when ``str()`` itself fails.
    """
    # Scalars (and None) need no conversion at all.
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    # Containers: convert every element.
    if isinstance(obj, dict):
        return {name: serialize_lc_object(item) for name, item in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [serialize_lc_object(element) for element in obj]

    # Pydantic v2 first, then Pydantic v1 / older objects. A dump method that
    # raises is skipped on purpose so the next strategy gets a chance.
    for dump_method in ("model_dump", "dict"):
        if not hasattr(obj, dump_method):
            continue
        try:
            return getattr(obj, dump_method)()
        except Exception:
            continue

    # Last resort: a textual representation.
    try:
        text = str(obj)
    except Exception:
        text = repr(obj)
    return text
def serialize_channel_values(channel_values: dict[str, Any]) -> dict[str, Any]:
    """Serialize every public channel of a LangGraph state dict.

    Keys that start with ``__pregel_`` and the ``__interrupt__`` key are
    internal to LangGraph and are dropped so the result matches what the
    LangGraph Platform API returns. Every remaining value is converted with
    :func:`serialize_lc_object`; key order is preserved.
    """
    return {
        name: serialize_lc_object(payload)
        for name, payload in channel_values.items()
        if not (name.startswith("__pregel_") or name == "__interrupt__")
    }
def _is_data_url_image_block(block: Any) -> bool:
    """Return ``True`` when *block* is an ``image_url`` block with a ``data:`` URL."""
    if not isinstance(block, dict) or block.get("type") != "image_url":
        return False
    image_url = block.get("image_url")
    if isinstance(image_url, dict):
        # Canonical form: {"image_url": {"url": "..."}}.
        image_url = image_url.get("url", "")
    elif not isinstance(image_url, str):
        # Neither the dict form nor the string shorthand: leave the block alone.
        return False
    # URI schemes are case-insensitive. Only the five-character prefix is
    # lower-cased so a multi-megabyte payload is never copied.
    return str(image_url)[:5].lower() == "data:"


def strip_data_url_image_blocks(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Remove ``data:``-scheme ``image_url`` blocks from *hide_from_ui* messages.

    The history and run-wait endpoints return checkpoint-persisted messages to
    the frontend. ``ViewImageMiddleware`` stores full base64 image payloads in
    ``hide_from_ui`` human messages — these are internal model context and must
    not be sent over the wire (huge response bodies, no UI value).

    Only content blocks of type ``image_url`` whose URL uses the ``data:``
    scheme (matched case-insensitively) are stripped. Both the dict form
    ``{"image_url": {"url": ...}}`` and the string shorthand
    ``{"image_url": "..."}`` are recognised. Text blocks, ``https://`` image
    URLs, and non-hidden messages are left untouched so that message ordering
    and count are preserved.

    Args:
        messages: Serialized messages. Entries that are not dicts, are not
            flagged ``additional_kwargs["hide_from_ui"] is True``, or whose
            ``content`` is not a list are passed through as the same object.

    Returns:
        A new list with one entry per input entry. Filtered messages are
        shallow copies with a new ``content`` list; the input messages are
        never mutated.
    """
    result: list[dict[str, Any]] = []
    for msg in messages:
        if not isinstance(msg, dict):
            result.append(msg)
            continue

        # Only touch messages explicitly flagged as hidden from the UI.
        additional_kwargs = msg.get("additional_kwargs")
        if not (isinstance(additional_kwargs, dict) and additional_kwargs.get("hide_from_ui") is True):
            result.append(msg)
            continue

        content = msg.get("content")
        if not isinstance(content, list):
            result.append(msg)
            continue

        filtered = [block for block in content if not _is_data_url_image_block(block)]
        result.append({**msg, "content": filtered})
    return result
def serialize_channel_values_for_api(channel_values: dict[str, Any]) -> dict[str, Any]:
    """Serialize channel values for a REST response, without inline images.

    Runs :func:`serialize_channel_values` and then, when the serialized
    ``messages`` channel is a list, passes it through
    :func:`strip_data_url_image_blocks`. REST endpoints that return channel
    values to the frontend should use this wrapper so that ``data:``-scheme
    base64 image payloads are never sent over the wire.
    """
    serialized = serialize_channel_values(channel_values)
    messages = serialized.get("messages")
    if not isinstance(messages, list):
        # No message list to clean up: return the serialized state unchanged.
        return serialized
    serialized["messages"] = strip_data_url_image_blocks(messages)
    return serialized
def serialize_messages_tuple(obj: Any) -> Any:
    """Serialize a messages-mode payload of the form ``(chunk, metadata)``.

    A 2-tuple becomes ``[serialized_chunk, metadata]``, where *metadata* is
    passed through untouched when it is a dict and replaced by ``{}``
    otherwise. Anything that is not a 2-tuple is handed to
    :func:`serialize_lc_object` unchanged.
    """
    if not (isinstance(obj, tuple) and len(obj) == 2):
        return serialize_lc_object(obj)

    chunk, metadata = obj
    if not isinstance(metadata, dict):
        metadata = {}
    return [serialize_lc_object(chunk), metadata]
def serialize(obj: Any, *, mode: str = "") -> Any:
    """Serialize *obj* according to the stream *mode* it came from.

    * ``messages`` — *obj* is a ``(message_chunk, metadata_dict)`` tuple and
      is handled by :func:`serialize_messages_tuple`.
    * ``values`` — *obj* is the full state dict and is handled by
      :func:`serialize_channel_values`, which strips internal keys; a
      non-dict *obj* falls back to :func:`serialize_lc_object`.
    * anything else — generic conversion via :func:`serialize_lc_object`.
    """
    if mode == "messages":
        return serialize_messages_tuple(obj)

    is_state_dict = mode == "values" and isinstance(obj, dict)
    if is_state_dict:
        return serialize_channel_values(obj)

    return serialize_lc_object(obj)