Merge branch 'main' into fix-3127

2026-05-23 00:16:48 +00:00 · 2026-05-22 21:56:04 +08:00
parent 4731605d99 f0bae28636
commit cfd9c61b9a
57 changed files with 4981 additions and 195 deletions
@@ -161,10 +161,16 @@ async def _migrate_orphaned_threads(store, admin_user_id: str) -> int:
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Application lifespan handler."""

-    # Load config and check necessary environment variables at startup
+    # Load config and check necessary environment variables at startup.
+    # `startup_config` is a local snapshot used only for one-shot bootstrap
+    # work (logging level, langgraph_runtime engines, channels). Request-time
+    # config resolution always routes through `get_app_config()` in
+    # `app/gateway/deps.py::get_config()` so `config.yaml` edits become
+    # visible without a process restart. We deliberately do NOT cache this
+    # snapshot on `app.state` to keep that contract enforceable.
    try:
-        app.state.config = get_app_config()
-        apply_logging_level(app.state.config.log_level)
+        startup_config = get_app_config()
+        apply_logging_level(startup_config.log_level)
        logger.info("Configuration loaded successfully")
    except Exception as e:
        error_msg = f"Failed to load configuration during gateway startup: {e}"
@@ -174,7 +180,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    logger.info(f"Starting API Gateway on {config.host}:{config.port}")

    # Initialize LangGraph runtime components (StreamBridge, RunManager, checkpointer, store)
-    async with langgraph_runtime(app):
+    async with langgraph_runtime(app, startup_config):
        logger.info("LangGraph runtime initialised")

        # Check admin bootstrap state and migrate orphan threads after admin exists.
@@ -185,7 +191,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
        try:
            from app.channels.service import start_channel_service

-            channel_service = await start_channel_service(app.state.config)
+            channel_service = await start_channel_service(startup_config)
            logger.info("Channel service started: %s", channel_service.get_status())
        except Exception:
            logger.exception("No IM channels configured or channel service failed to start")
@@ -3,11 +3,21 @@
 **Getters** (used by routers): raise 503 when a required dependency is
 missing, except ``get_store`` which returns ``None``.

+``AppConfig`` is intentionally *not* cached on ``app.state``. Routers and the
+run path resolve it through :func:`deerflow.config.app_config.get_app_config`,
+which performs mtime-based hot reload, so edits to ``config.yaml`` take
+effect on the next request without a process restart. The engines created in
+:func:`langgraph_runtime` (stream bridge, persistence, checkpointer, store,
+run-event store) accept a ``startup_config`` snapshot — they are
+restart-required by design and stay bound to that snapshot to keep the live
+process consistent with itself.
+
 Initialization is handled directly in ``app.py`` via :class:`AsyncExitStack`.
 """

 from __future__ import annotations

+import logging
 from collections.abc import AsyncGenerator, Callable
 from contextlib import AsyncExitStack, asynccontextmanager
 from typing import TYPE_CHECKING, TypeVar, cast
@@ -15,12 +25,14 @@ from typing import TYPE_CHECKING, TypeVar, cast
 from fastapi import FastAPI, HTTPException, Request
 from langgraph.types import Checkpointer

-from deerflow.config.app_config import AppConfig
+from deerflow.config.app_config import AppConfig, get_app_config
 from deerflow.persistence.feedback import FeedbackRepository
 from deerflow.runtime import RunContext, RunManager, StreamBridge
 from deerflow.runtime.events.store.base import RunEventStore
 from deerflow.runtime.runs.store.base import RunStore

+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
    from app.gateway.auth.local_provider import LocalAuthProvider
    from app.gateway.auth.repositories.sqlite import SQLiteUserRepository
@@ -30,21 +42,55 @@ if TYPE_CHECKING:
 T = TypeVar("T")


-def get_config(request: Request) -> AppConfig:
-    """Return the app-scoped ``AppConfig`` stored on ``app.state``."""
-    config = getattr(request.app.state, "config", None)
-    if config is None:
-        raise HTTPException(status_code=503, detail="Configuration not available")
-    return config
+def get_config() -> AppConfig:
+    """Return the freshest ``AppConfig`` for the current request.
+
+    Routes through :func:`deerflow.config.app_config.get_app_config`, which
+    honours runtime ``ContextVar`` overrides and reloads ``config.yaml`` from
+    disk when its mtime changes. ``AppConfig`` is not cached on ``app.state``
+    at all — the only startup-time snapshot lives as a local
+    ``startup_config`` variable inside ``lifespan()`` and is passed
+    explicitly into :func:`langgraph_runtime` for the engines that are
+    restart-required by design. Routing every request through
+    :func:`get_app_config` closes the bytedance/deer-flow issue #3107 BUG-001
+    split-brain where the worker / lead-agent thread saw a stale startup
+    snapshot.
+
+    Any failure to materialise the config (missing file, permission denied,
+    YAML parse error, validation error) is reported as 503 — semantically
+    "the gateway cannot serve requests without a usable configuration" — and
+    logged with the original exception so operators have something to debug.
+    """
+    try:
+        return get_app_config()
+    except Exception as exc:  # noqa: BLE001 - request boundary: log and degrade gracefully
+        logger.exception("Failed to load AppConfig at request time")
+        raise HTTPException(status_code=503, detail="Configuration not available") from exc


@asynccontextmanager
-async def langgraph_runtime(app: FastAPI) -> AsyncGenerator[None, None]:
+async def langgraph_runtime(app: FastAPI, startup_config: AppConfig) -> AsyncGenerator[None, None]:
    """Bootstrap and tear down all LangGraph runtime singletons.

+    ``startup_config`` is the ``AppConfig`` snapshot taken once during
+    ``lifespan()`` for one-shot infrastructure bootstrap. The engines and
+    stores constructed here (stream bridge, persistence engine, checkpointer,
+    store, run-event store) are restart-required by design — they hold live
+    connections, file handles, or singleton providers — so they bind to this
+    snapshot and survive across `config.yaml` edits. Request-time consumers
+    must still go through :func:`get_config` for any field that should be
+    hot-reloadable. See ``backend/CLAUDE.md`` "Config Hot-Reload Boundary".
+
+    The matching ``run_events_config`` is frozen onto ``app.state`` so
+    :func:`get_run_context` pairs a freshly-loaded ``AppConfig`` with the
+    *startup-time* run-events configuration the underlying ``event_store``
+    was built from — otherwise the runtime could end up combining a live
+    new ``run_events_config`` with an event store still bound to the
+    previous backend.
+
    Usage in ``app.py``::

-        async with langgraph_runtime(app):
+        async with langgraph_runtime(app, startup_config):
            yield
    """
    from deerflow.persistence.engine import close_engine, get_session_factory, init_engine_from_config
@@ -53,9 +99,7 @@ async def langgraph_runtime(app: FastAPI) -> AsyncGenerator[None, None]:
    from deerflow.runtime.events.store import make_run_event_store

    async with AsyncExitStack() as stack:
-        config = getattr(app.state, "config", None)
-        if config is None:
-            raise RuntimeError("langgraph_runtime() requires app.state.config to be initialized")
+        config = startup_config

        app.state.stream_bridge = await stack.enter_async_context(make_stream_bridge(config))

@@ -84,8 +128,12 @@ async def langgraph_runtime(app: FastAPI) -> AsyncGenerator[None, None]:

        app.state.thread_store = make_thread_store(sf, app.state.store)

-        # Run event store (has its own factory with config-driven backend selection)
+        # Run event store. The store and the matching ``run_events_config`` are
+        # both frozen at startup so ``get_run_context`` does not combine a
+        # freshly-reloaded ``AppConfig.run_events`` with a store still bound to
+        # the previous backend.
        run_events_config = getattr(config, "run_events", None)
+        app.state.run_events_config = run_events_config
        app.state.run_event_store = make_run_event_store(run_events_config)

        # RunManager with store backing for persistence
@@ -139,16 +187,20 @@ def get_thread_store(request: Request) -> ThreadMetaStore:
 def get_run_context(request: Request) -> RunContext:
    """Build a :class:`RunContext` from ``app.state`` singletons.

-    Returns a *base* context with infrastructure dependencies.
+    Returns a *base* context with infrastructure dependencies. The
+    ``app_config`` field is resolved live so per-run fields (e.g.
+    ``models[*].max_tokens``) follow ``config.yaml`` edits; the
+    ``event_store`` / ``run_events_config`` pair stays frozen to the snapshot
+    captured in :func:`langgraph_runtime` so callers never see a store bound
+    to one backend paired with a config pointing at another.
    """
-    config = get_config(request)
    return RunContext(
        checkpointer=get_checkpointer(request),
        store=get_store(request),
        event_store=get_run_event_store(request),
-        run_events_config=getattr(config, "run_events", None),
+        run_events_config=getattr(request.app.state, "run_events_config", None),
        thread_store=get_thread_store(request),
-        app_config=config,
+        app_config=get_config(),
    )


@@ -66,6 +66,14 @@ class RunResponse(BaseModel):
    multitask_strategy: str = "reject"
    created_at: str = ""
    updated_at: str = ""
+    total_input_tokens: int = 0
+    total_output_tokens: int = 0
+    total_tokens: int = 0
+    llm_call_count: int = 0
+    lead_agent_tokens: int = 0
+    subagent_tokens: int = 0
+    middleware_tokens: int = 0
+    message_count: int = 0


 class ThreadTokenUsageModelBreakdown(BaseModel):
@@ -111,6 +119,14 @@ def _record_to_response(record: RunRecord) -> RunResponse:
        multitask_strategy=record.multitask_strategy,
        created_at=record.created_at,
        updated_at=record.updated_at,
+        total_input_tokens=record.total_input_tokens,
+        total_output_tokens=record.total_output_tokens,
+        total_tokens=record.total_tokens,
+        llm_call_count=record.llm_call_count,
+        lead_agent_tokens=record.lead_agent_tokens,
+        subagent_tokens=record.subagent_tokens,
+        middleware_tokens=record.middleware_tokens,
+        message_count=record.message_count,
    )


@@ -402,8 +418,15 @@ async def list_run_events(

@router.get("/{thread_id}/token-usage", response_model=ThreadTokenUsageResponse)
@require_permission("threads", "read", owner_check=True)
-async def thread_token_usage(thread_id: str, request: Request) -> ThreadTokenUsageResponse:
+async def thread_token_usage(
+    thread_id: str,
+    request: Request,
+    include_active: bool = Query(default=False, description="Include running run progress snapshots"),
+) -> ThreadTokenUsageResponse:
    """Thread-level token usage aggregation."""
    run_store = get_run_store(request)
-    agg = await run_store.aggregate_tokens_by_thread(thread_id)
+    if include_active:
+        agg = await run_store.aggregate_tokens_by_thread(thread_id, include_active=True)
+    else:
+        agg = await run_store.aggregate_tokens_by_thread(thread_id)
    return ThreadTokenUsageResponse(thread_id=thread_id, **agg)
@@ -15,7 +15,8 @@ from collections.abc import Mapping
 from typing import Any

 from fastapi import HTTPException, Request
-from langchain_core.messages import HumanMessage
+from langchain_core.messages import BaseMessage
+from langchain_core.messages.utils import convert_to_messages

 from app.gateway.deps import get_run_context, get_run_manager, get_stream_bridge
 from app.gateway.utils import sanitize_log_param
@@ -76,21 +77,35 @@ def normalize_stream_modes(raw: list[str] | str | None) -> list[str]:


 def normalize_input(raw_input: dict[str, Any] | None) -> dict[str, Any]:
-    """Convert LangGraph Platform input format to LangChain state dict."""
+    """Convert LangGraph Platform input format to LangChain state dict.
+
+    Delegates dict→message coercion to ``langchain_core.messages.utils.convert_to_messages``
+    so that ``additional_kwargs`` (e.g. uploaded-file metadata — gh #3132), ``id``,
+    ``name``, and non-human roles (ai/system/tool) survive unchanged.  An earlier
+    hand-rolled version only forwarded ``content`` and collapsed every role to
+    ``HumanMessage``, which silently stripped frontend-supplied attachments.
+
+    Malformed message dicts (missing ``role``/``type``/``content``, unsupported
+    role, etc.) raise ``HTTPException(400)`` with the offending index, instead
+    of bubbling up as a 500.  The gateway is a system boundary, so per-entry
+    validation errors are the right shape for clients to retry against.
+    """
    if raw_input is None:
        return {}
    messages = raw_input.get("messages")
    if messages and isinstance(messages, list):
-        converted = []
-        for msg in messages:
-            if isinstance(msg, dict):
-                role = msg.get("role", msg.get("type", "user"))
-                content = msg.get("content", "")
-                if role in ("user", "human"):
-                    converted.append(HumanMessage(content=content))
-                else:
-                    # TODO: handle other message types (system, ai, tool)
-                    converted.append(HumanMessage(content=content))
+        converted: list[Any] = []
+        for index, msg in enumerate(messages):
+            if isinstance(msg, BaseMessage):
+                converted.append(msg)
+            elif isinstance(msg, dict):
+                try:
+                    converted.extend(convert_to_messages([msg]))
+                except (ValueError, TypeError, NotImplementedError) as exc:
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"Invalid message at input.messages[{index}]: {exc}",
+                    ) from exc
            else:
                converted.append(msg)
        return {**raw_input, "messages": converted}