fix(gateway): preserve message additional_kwargs in normalize_input (#3132) (#3136)

* fix(gateway): preserve message additional_kwargs in normalize_input (#3132) The gateway's hand-rolled dict→message coercion only forwarded `content` and collapsed every role to `HumanMessage`, silently dropping the frontend's `additional_kwargs.files` payload (along with `id`, `name`, and ai/system/tool roles). Effect on issue #3132: - `UploadsMiddleware` saw no `files` on the last human message, so the just-uploaded file got bucketed under "previous messages" while the current turn was reported as `(empty)`. - The persisted human message had no `files`, so the attachment chip on the message disappeared the moment the optimistic UI cleared. Delegate the conversion to `langchain_core.messages.utils.convert_to_messages` so `additional_kwargs`, `id`, `name`, and non-human roles round-trip unchanged. * fix(gateway): convert malformed-message ValueError into HTTP 400 normalize_input now sits at the request boundary, so a malformed input.messages[N] dict (missing role/type/content, unsupported role, etc.) should surface as 400 with the offending index — not bubble out of FastAPI as 500. Per Copilot review on #3136.
2026-05-21 15:36:48 +00:00 · 2026-05-21 21:06:19 +08:00
parent 1c5c585741
commit 9c03a71a07
2 changed files with 115 additions and 12 deletions
@@ -15,7 +15,8 @@ from collections.abc import Mapping
 from typing import Any

 from fastapi import HTTPException, Request
-from langchain_core.messages import HumanMessage
+from langchain_core.messages import BaseMessage
+from langchain_core.messages.utils import convert_to_messages

 from app.gateway.deps import get_run_context, get_run_manager, get_stream_bridge
 from app.gateway.utils import sanitize_log_param
@@ -76,21 +77,35 @@ def normalize_stream_modes(raw: list[str] | str | None) -> list[str]:


 def normalize_input(raw_input: dict[str, Any] | None) -> dict[str, Any]:
-    """Convert LangGraph Platform input format to LangChain state dict."""
+    """Convert LangGraph Platform input format to LangChain state dict.
+
+    Delegates dict→message coercion to ``langchain_core.messages.utils.convert_to_messages``
+    so that ``additional_kwargs`` (e.g. uploaded-file metadata — gh #3132), ``id``,
+    ``name``, and non-human roles (ai/system/tool) survive unchanged.  An earlier
+    hand-rolled version only forwarded ``content`` and collapsed every role to
+    ``HumanMessage``, which silently stripped frontend-supplied attachments.
+
+    Malformed message dicts (missing ``role``/``type``/``content``, unsupported
+    role, etc.) raise ``HTTPException(400)`` with the offending index, instead
+    of bubbling up as a 500.  The gateway is a system boundary, so per-entry
+    validation errors are the right shape for clients to retry against.
+    """
    if raw_input is None:
        return {}
    messages = raw_input.get("messages")
    if messages and isinstance(messages, list):
-        converted = []
-        for msg in messages:
-            if isinstance(msg, dict):
-                role = msg.get("role", msg.get("type", "user"))
-                content = msg.get("content", "")
-                if role in ("user", "human"):
-                    converted.append(HumanMessage(content=content))
-                else:
-                    # TODO: handle other message types (system, ai, tool)
-                    converted.append(HumanMessage(content=content))
+        converted: list[Any] = []
+        for index, msg in enumerate(messages):
+            if isinstance(msg, BaseMessage):
+                converted.append(msg)
+            elif isinstance(msg, dict):
+                try:
+                    converted.extend(convert_to_messages([msg]))
+                except (ValueError, TypeError, NotImplementedError) as exc:
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"Invalid message at input.messages[{index}]: {exc}",
+                    ) from exc
            else:
                converted.append(msg)
        return {**raw_input, "messages": converted}
@@ -81,6 +81,94 @@ def test_normalize_input_passthrough():
    assert result == {"custom_key": "value"}


+def test_normalize_input_preserves_additional_kwargs_and_id():
+    """Regression: gh #3132 — frontend ships uploaded-file metadata in
+    additional_kwargs.files (and a client-side message id).  The gateway must
+    not strip them before the graph runs, otherwise UploadsMiddleware reports
+    "(empty)" for new uploads and the frontend message loses its file chip.
+    """
+    from langchain_core.messages import HumanMessage
+
+    from app.gateway.services import normalize_input
+
+    files = [{"filename": "a.csv", "size": 100, "path": "/mnt/user-data/uploads/a.csv", "status": "uploaded"}]
+    result = normalize_input(
+        {
+            "messages": [
+                {
+                    "type": "human",
+                    "id": "client-msg-1",
+                    "name": "user-input",
+                    "content": [{"type": "text", "text": "clean it"}],
+                    "additional_kwargs": {"files": files, "custom": "keep-me"},
+                }
+            ]
+        }
+    )
+    assert len(result["messages"]) == 1
+    msg = result["messages"][0]
+    assert isinstance(msg, HumanMessage)
+    assert msg.id == "client-msg-1"
+    assert msg.name == "user-input"
+    assert msg.content == [{"type": "text", "text": "clean it"}]
+    assert msg.additional_kwargs == {"files": files, "custom": "keep-me"}
+
+
+def test_normalize_input_passes_through_basemessage_instances():
+    from langchain_core.messages import HumanMessage
+
+    from app.gateway.services import normalize_input
+
+    msg = HumanMessage(content="hello", id="m-1", additional_kwargs={"files": [{"filename": "x"}]})
+    result = normalize_input({"messages": [msg]})
+    assert result["messages"][0] is msg
+
+
+def test_normalize_input_rejects_malformed_message_with_400():
+    """Boundary validation: ``convert_to_messages`` raises ``ValueError`` when a
+    message dict is missing ``role``/``type``/``content``.  ``normalize_input``
+    runs inside the gateway HTTP boundary, so a malformed payload should surface
+    as a 400 referencing the offending entry — not bubble up as a 500.
+
+    Raised after the Copilot review on PR #3136.
+    """
+    import pytest
+    from fastapi import HTTPException
+
+    from app.gateway.services import normalize_input
+
+    with pytest.raises(HTTPException) as excinfo:
+        normalize_input({"messages": [{"role": "human", "content": "ok"}, {"oops": "no role here"}]})
+    assert excinfo.value.status_code == 400
+    assert "input.messages[1]" in excinfo.value.detail
+
+
+def test_normalize_input_handles_non_human_roles():
+    """The previous implementation collapsed every role to HumanMessage with a
+    `# TODO: handle other message types` comment.  Resuming a thread with prior
+    AI/tool messages would silently rewrite them as human turns — corrupting
+    the conversation.  Use langchain's standard conversion so ai/system/tool
+    roles round-trip correctly.
+    """
+    from langchain_core.messages import AIMessage, SystemMessage, ToolMessage
+
+    from app.gateway.services import normalize_input
+
+    result = normalize_input(
+        {
+            "messages": [
+                {"role": "system", "content": "sys"},
+                {"role": "ai", "content": "hi", "id": "ai-1"},
+                {"role": "tool", "content": "result", "tool_call_id": "call-1"},
+            ]
+        }
+    )
+    types = [type(m) for m in result["messages"]]
+    assert types == [SystemMessage, AIMessage, ToolMessage]
+    assert result["messages"][1].id == "ai-1"
+    assert result["messages"][2].tool_call_id == "call-1"
+
+
 def test_build_run_config_basic():
    from app.gateway.services import build_run_config