From 9c03a71a07e4365be6799c70270bd3cc6a90ef8e Mon Sep 17 00:00:00 2001 From: Xinmin Zeng <135568692+fancyboi999@users.noreply.github.com> Date: Thu, 21 May 2026 21:06:19 +0800 Subject: [PATCH] fix(gateway): preserve message additional_kwargs in normalize_input (#3132) (#3136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(gateway): preserve message additional_kwargs in normalize_input (#3132) The gateway's hand-rolled dict→message coercion only forwarded `content` and collapsed every role to `HumanMessage`, silently dropping the frontend's `additional_kwargs.files` payload (along with `id`, `name`, and ai/system/tool roles). Effect on issue #3132: - `UploadsMiddleware` saw no `files` on the last human message, so the just-uploaded file got bucketed under "previous messages" while the current turn was reported as `(empty)`. - The persisted human message had no `files`, so the attachment chip on the message disappeared the moment the optimistic UI cleared. Delegate the conversion to `langchain_core.messages.utils.convert_to_messages` so `additional_kwargs`, `id`, `name`, and non-human roles round-trip unchanged. * fix(gateway): convert malformed-message ValueError into HTTP 400 normalize_input now sits at the request boundary, so a malformed input.messages[N] dict (missing role/type/content, unsupported role, etc.) should surface as 400 with the offending index — not bubble out of FastAPI as 500. Per Copilot review on #3136. --- backend/app/gateway/services.py | 39 ++++++++---- backend/tests/test_gateway_services.py | 88 ++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 12 deletions(-) diff --git a/backend/app/gateway/services.py b/backend/app/gateway/services.py index 4713d303e..95e26144a 100644 --- a/backend/app/gateway/services.py +++ b/backend/app/gateway/services.py @@ -15,7 +15,8 @@ from collections.abc import Mapping from typing import Any from fastapi import HTTPException, Request -from langchain_core.messages import HumanMessage +from langchain_core.messages import BaseMessage +from langchain_core.messages.utils import convert_to_messages from app.gateway.deps import get_run_context, get_run_manager, get_stream_bridge from app.gateway.utils import sanitize_log_param @@ -76,21 +77,35 @@ def normalize_stream_modes(raw: list[str] | str | None) -> list[str]: def normalize_input(raw_input: dict[str, Any] | None) -> dict[str, Any]: - """Convert LangGraph Platform input format to LangChain state dict.""" + """Convert LangGraph Platform input format to LangChain state dict. + + Delegates dict→message coercion to ``langchain_core.messages.utils.convert_to_messages`` + so that ``additional_kwargs`` (e.g. uploaded-file metadata — gh #3132), ``id``, + ``name``, and non-human roles (ai/system/tool) survive unchanged. An earlier + hand-rolled version only forwarded ``content`` and collapsed every role to + ``HumanMessage``, which silently stripped frontend-supplied attachments. + + Malformed message dicts (missing ``role``/``type``/``content``, unsupported + role, etc.) raise ``HTTPException(400)`` with the offending index, instead + of bubbling up as a 500. The gateway is a system boundary, so per-entry + validation errors are the right shape for clients to retry against. + """ if raw_input is None: return {} messages = raw_input.get("messages") if messages and isinstance(messages, list): - converted = [] - for msg in messages: - if isinstance(msg, dict): - role = msg.get("role", msg.get("type", "user")) - content = msg.get("content", "") - if role in ("user", "human"): - converted.append(HumanMessage(content=content)) - else: - # TODO: handle other message types (system, ai, tool) - converted.append(HumanMessage(content=content)) + converted: list[Any] = [] + for index, msg in enumerate(messages): + if isinstance(msg, BaseMessage): + converted.append(msg) + elif isinstance(msg, dict): + try: + converted.extend(convert_to_messages([msg])) + except (ValueError, TypeError, NotImplementedError) as exc: + raise HTTPException( + status_code=400, + detail=f"Invalid message at input.messages[{index}]: {exc}", + ) from exc else: converted.append(msg) return {**raw_input, "messages": converted} diff --git a/backend/tests/test_gateway_services.py b/backend/tests/test_gateway_services.py index aa9e20e78..2ccd372bf 100644 --- a/backend/tests/test_gateway_services.py +++ b/backend/tests/test_gateway_services.py @@ -81,6 +81,94 @@ def test_normalize_input_passthrough(): assert result == {"custom_key": "value"} +def test_normalize_input_preserves_additional_kwargs_and_id(): + """Regression: gh #3132 — frontend ships uploaded-file metadata in + additional_kwargs.files (and a client-side message id). The gateway must + not strip them before the graph runs, otherwise UploadsMiddleware reports + "(empty)" for new uploads and the frontend message loses its file chip. + """ + from langchain_core.messages import HumanMessage + + from app.gateway.services import normalize_input + + files = [{"filename": "a.csv", "size": 100, "path": "/mnt/user-data/uploads/a.csv", "status": "uploaded"}] + result = normalize_input( + { + "messages": [ + { + "type": "human", + "id": "client-msg-1", + "name": "user-input", + "content": [{"type": "text", "text": "clean it"}], + "additional_kwargs": {"files": files, "custom": "keep-me"}, + } + ] + } + ) + assert len(result["messages"]) == 1 + msg = result["messages"][0] + assert isinstance(msg, HumanMessage) + assert msg.id == "client-msg-1" + assert msg.name == "user-input" + assert msg.content == [{"type": "text", "text": "clean it"}] + assert msg.additional_kwargs == {"files": files, "custom": "keep-me"} + + +def test_normalize_input_passes_through_basemessage_instances(): + from langchain_core.messages import HumanMessage + + from app.gateway.services import normalize_input + + msg = HumanMessage(content="hello", id="m-1", additional_kwargs={"files": [{"filename": "x"}]}) + result = normalize_input({"messages": [msg]}) + assert result["messages"][0] is msg + + +def test_normalize_input_rejects_malformed_message_with_400(): + """Boundary validation: ``convert_to_messages`` raises ``ValueError`` when a + message dict is missing ``role``/``type``/``content``. ``normalize_input`` + runs inside the gateway HTTP boundary, so a malformed payload should surface + as a 400 referencing the offending entry — not bubble up as a 500. + + Raised after the Copilot review on PR #3136. + """ + import pytest + from fastapi import HTTPException + + from app.gateway.services import normalize_input + + with pytest.raises(HTTPException) as excinfo: + normalize_input({"messages": [{"role": "human", "content": "ok"}, {"oops": "no role here"}]}) + assert excinfo.value.status_code == 400 + assert "input.messages[1]" in excinfo.value.detail + + +def test_normalize_input_handles_non_human_roles(): + """The previous implementation collapsed every role to HumanMessage with a + `# TODO: handle other message types` comment. Resuming a thread with prior + AI/tool messages would silently rewrite them as human turns — corrupting + the conversation. Use langchain's standard conversion so ai/system/tool + roles round-trip correctly. + """ + from langchain_core.messages import AIMessage, SystemMessage, ToolMessage + + from app.gateway.services import normalize_input + + result = normalize_input( + { + "messages": [ + {"role": "system", "content": "sys"}, + {"role": "ai", "content": "hi", "id": "ai-1"}, + {"role": "tool", "content": "result", "tool_call_id": "call-1"}, + ] + } + ) + types = [type(m) for m in result["messages"]] + assert types == [SystemMessage, AIMessage, ToolMessage] + assert result["messages"][1].id == "ai-1" + assert result["messages"][2].tool_call_id == "call-1" + + def test_build_run_config_basic(): from app.gateway.services import build_run_config