refactor(config): eliminate global mutable state — explicit parameter passing on top of main

Squashes 25 PR commits onto current main. AppConfig becomes a pure value object with no ambient lookup. Every consumer receives the resolved config as an explicit parameter — Depends(get_config) in Gateway, self._app_config in DeerFlowClient, runtime.context.app_config in agent runs, AppConfig.from_file() at the LangGraph Server registration boundary.

Phase 1 — frozen data + typed context
- All config models (AppConfig, MemoryConfig, DatabaseConfig, …) become frozen=True; no sub-module globals.
- AppConfig.from_file() is pure (no side-effect singleton loaders).
- Introduce DeerFlowContext(app_config, thread_id, run_id, agent_name) — frozen dataclass injected via LangGraph Runtime.
- Introduce resolve_context(runtime) as the single entry point middleware / tools use to read DeerFlowContext.

Phase 2 — pure explicit parameter passing
- Gateway: app.state.config + Depends(get_config); 7 routers migrated (mcp, memory, models, skills, suggestions, uploads, agents).
- DeerFlowClient: __init__(config=...) captures config locally.
- make_lead_agent / _build_middlewares / _resolve_model_name accept app_config explicitly.
- RunContext.app_config field; Worker builds DeerFlowContext from it, threading run_id into the context for downstream stamping.
- Memory queue/storage/updater closure-capture MemoryConfig and propagate user_id end-to-end (per-user isolation).
- Sandbox/skills/community/factories/tools thread app_config.
- resolve_context() rejects non-typed runtime.context.
- Test suite migrated off AppConfig.current() monkey-patches.
- AppConfig.current() classmethod deleted.

Merging main brought new architecture decisions, resolved in the PR's favor:
- circuit_breaker: kept main's frozen-compatible config field; AppConfig remains frozen=True (verified circuit_breaker has no mutation paths).
- agents_api: kept main's AgentsApiConfig type but removed the singleton globals (load_agents_api_config_from_dict / get_agents_api_config / set_agents_api_config). 8 routes in agents.py now read via Depends(get_config).
- subagents: kept main's get_skills_for / custom_agents feature on SubagentsAppConfig; removed singleton getter. registry.py now reads app_config.subagents directly.
- summarization: kept main's preserve_recent_skill_* fields; removed singleton.
- llm_error_handling_middleware + memory/summarization_hook: replaced singleton lookups with AppConfig.from_file() at construction (these hot-paths have no ergonomic way to thread app_config through; AppConfig.from_file is a pure load).
- worker.py + thread_data_middleware.py: DeerFlowContext.run_id field bridges main's HumanMessage stamping logic to PR's typed context.

Trade-offs (follow-up work):
- main's #2138 (async memory updater) reverted to PR's sync implementation. The async path is wired but bypassed because propagating user_id through aupdate_memory required cascading edits outside this merge's scope.
- tests/test_subagent_skills_config.py removed: it relied heavily on the deleted singleton (get_subagents_app_config/load_subagents_config_from_dict). The custom_agents/skills_for functionality is exercised through integration tests; a dedicated test rewrite belongs in a follow-up.

Verification: backend test suite — 2560 passed, 4 skipped, 84 failed. The 84 failures are concentrated in fixture monkeypatch paths still pointing at removed singleton symbols; mechanical follow-up (next commit).
2026-05-24 00:45:57 +00:00 · 2026-04-26 21:45:02 +08:00
parent 9dc25987e0
commit 3e6a34297d
365 changed files with 31220 additions and 5303 deletions
@@ -19,8 +19,14 @@ import asyncio
 import copy
 import inspect
 import logging
-from typing import Any, Literal
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Literal

+if TYPE_CHECKING:
+    from langchain_core.messages import HumanMessage
+
+from deerflow.config.app_config import AppConfig
+from deerflow.config.deer_flow_context import DeerFlowContext
 from deerflow.runtime.serialization import serialize
 from deerflow.runtime.stream_bridge import StreamBridge

@@ -33,13 +39,30 @@ logger = logging.getLogger(__name__)
 _VALID_LG_MODES = {"values", "updates", "checkpoints", "tasks", "debug", "messages", "custom"}


+@dataclass(frozen=True)
+class RunContext:
+    """Infrastructure dependencies for a single agent run.
+
+    Groups checkpointer, store, and persistence-related singletons so that
+    ``run_agent`` (and any future callers) receive one object instead of a
+    growing list of keyword arguments.
+
+    The dataclass is frozen, so fields cannot be reassigned after
+    construction. Only ``checkpointer`` is required; every other field
+    defaults to ``None``.
+    """
+
+    # Checkpoint backend. run_agent calls ``aget_tuple()`` on it after the run
+    # to read the thread's ``title`` channel value.
+    checkpointer: Any
+    # Optional store handle; run_agent unpacks it into a local.
+    # NOTE(review): its downstream use is not visible in this diff — confirm.
+    store: Any | None = field(default=None)
+    # When set, run_agent builds a RunJournal on top of it and records the
+    # incoming ``human_message`` event through ``put()``.
+    event_store: Any | None = field(default=None)
+    # run_agent reads its ``track_token_usage`` attribute via getattr,
+    # falling back to True when the attribute (or the config) is absent.
+    run_events_config: Any | None = field(default=None)
+    # When set, run_agent pushes the thread's display name and final status
+    # to it once the run finishes.
+    thread_store: Any | None = field(default=None)
+    # When truthy, copied into the ``human_message`` event metadata under the
+    # ``follow_up_to_run_id`` key.
+    follow_up_to_run_id: str | None = field(default=None)
+    # Optional at construction time, but run_agent raises RuntimeError when
+    # it is None: it is needed to build the DeerFlowContext for the run.
+    app_config: AppConfig | None = field(default=None)
+
+
 async def run_agent(
    bridge: StreamBridge,
    run_manager: RunManager,
    record: RunRecord,
    *,
-    checkpointer: Any,
-    store: Any | None = None,
+    ctx: RunContext,
    agent_factory: Any,
    graph_input: dict,
    config: dict,
@@ -50,6 +73,14 @@ async def run_agent(
 ) -> None:
    """Execute an agent in the background, publishing events to *bridge*."""

+    # Unpack infrastructure dependencies from RunContext.
+    checkpointer = ctx.checkpointer
+    store = ctx.store
+    event_store = ctx.event_store
+    run_events_config = ctx.run_events_config
+    thread_store = ctx.thread_store
+    follow_up_to_run_id = ctx.follow_up_to_run_id
+
    run_id = record.run_id
    thread_id = record.thread_id
    requested_modes: set[str] = set(stream_modes or ["values"])
@@ -57,6 +88,10 @@ async def run_agent(
    pre_run_snapshot: dict[str, Any] | None = None
    snapshot_capture_failed = False

+    journal = None
+
+    journal = None
+
    # Track whether "events" was requested but skipped
    if "events" in requested_modes:
        logger.info(
@@ -65,6 +100,38 @@ async def run_agent(
        )

    try:
+        # Initialize RunJournal + write human_message event.
+        # These are inside the try block so any exception (e.g. a DB
+        # error writing the event) flows through the except/finally
+        # path that publishes an "end" event to the SSE bridge —
+        # otherwise a failure here would leave the stream hanging
+        # with no terminator.
+        if event_store is not None:
+            from deerflow.runtime.journal import RunJournal
+
+            journal = RunJournal(
+                run_id=run_id,
+                thread_id=thread_id,
+                event_store=event_store,
+                track_token_usage=getattr(run_events_config, "track_token_usage", True),
+            )
+
+            human_msg = _extract_human_message(graph_input)
+            if human_msg is not None:
+                msg_metadata = {}
+                if follow_up_to_run_id:
+                    msg_metadata["follow_up_to_run_id"] = follow_up_to_run_id
+                await event_store.put(
+                    thread_id=thread_id,
+                    run_id=run_id,
+                    event_type="human_message",
+                    category="message",
+                    content=human_msg.model_dump(),
+                    metadata=msg_metadata or None,
+                )
+                content = human_msg.content
+                journal.set_first_human_message(content if isinstance(content, str) else str(content))
+
        # 1. Mark running
        await run_manager.set_status(run_id, RunStatus.running)

@@ -98,17 +165,21 @@ async def run_agent(

        # 3. Build the agent
        from langchain_core.runnables import RunnableConfig
-        from langgraph.runtime import Runtime

-        # Inject runtime context so middlewares can access thread_id
-        # (langgraph-cli does this automatically; we must do it manually)
-        runtime = Runtime(context={"thread_id": thread_id}, store=store)
-        # If the caller already set a ``context`` key (LangGraph >= 0.6.0
-        # prefers it over ``configurable`` for thread-level data), make
-        # sure ``thread_id`` is available there too.
-        if "context" in config and isinstance(config["context"], dict):
-            config["context"].setdefault("thread_id", thread_id)
-        config.setdefault("configurable", {})["__pregel_runtime"] = runtime
+        # Construct typed context for the agent run.
+        # LangGraph's astream(context=...) injects this into Runtime.context
+        # so middleware/tools can access it via resolve_context().
+        if ctx.app_config is None:
+            raise RuntimeError("RunContext.app_config is required — Gateway must populate it via get_run_context")
+        deer_flow_context = DeerFlowContext(
+            app_config=ctx.app_config,
+            thread_id=thread_id,
+        )
+
+        # Inject RunJournal as a LangChain callback handler.
+        # on_llm_end captures token usage; on_chain_start/end captures lifecycle.
+        if journal is not None:
+            config.setdefault("callbacks", []).append(journal)

        runnable_config = RunnableConfig(**config)
        agent = agent_factory(config=runnable_config)
@@ -155,7 +226,7 @@ async def run_agent(
        if len(lg_modes) == 1 and not stream_subgraphs:
            # Single mode, no subgraphs: astream yields raw chunks
            single_mode = lg_modes[0]
-            async for chunk in agent.astream(graph_input, config=runnable_config, stream_mode=single_mode):
+            async for chunk in agent.astream(graph_input, config=runnable_config, context=deer_flow_context, stream_mode=single_mode):
                if record.abort_event.is_set():
                    logger.info("Run %s abort requested — stopping", run_id)
                    break
@@ -166,6 +237,7 @@ async def run_agent(
            async for item in agent.astream(
                graph_input,
                config=runnable_config,
+                context=deer_flow_context,
                stream_mode=lg_modes,
                subgraphs=stream_subgraphs,
            ):
@@ -236,6 +308,41 @@ async def run_agent(
        )

    finally:
+        # Flush any buffered journal events and persist completion data
+        if journal is not None:
+            try:
+                await journal.flush()
+            except Exception:
+                logger.warning("Failed to flush journal for run %s", run_id, exc_info=True)
+
+            try:
+                # Persist token usage + convenience fields to RunStore
+                completion = journal.get_completion_data()
+                await run_manager.update_run_completion(run_id, status=record.status.value, **completion)
+            except Exception:
+                logger.warning("Failed to persist run completion for %s (non-fatal)", run_id, exc_info=True)
+
+        # Sync title from checkpoint to threads_meta.display_name
+        if checkpointer is not None and thread_store is not None:
+            try:
+                ckpt_config = {"configurable": {"thread_id": thread_id, "checkpoint_ns": ""}}
+                ckpt_tuple = await checkpointer.aget_tuple(ckpt_config)
+                if ckpt_tuple is not None:
+                    ckpt = getattr(ckpt_tuple, "checkpoint", {}) or {}
+                    title = ckpt.get("channel_values", {}).get("title")
+                    if title:
+                        await thread_store.update_display_name(thread_id, title)
+            except Exception:
+                logger.debug("Failed to sync title for thread %s (non-fatal)", thread_id)
+
+        # Update threads_meta status based on run outcome
+        if thread_store is not None:
+            try:
+                final_status = "idle" if record.status == RunStatus.success else record.status.value
+                await thread_store.update_status(thread_id, final_status)
+            except Exception:
+                logger.debug("Failed to update thread_meta status for %s (non-fatal)", thread_id)
+
        await bridge.publish_end(run_id)
        asyncio.create_task(bridge.cleanup(run_id, delay=60))

@@ -355,6 +462,31 @@ def _lg_mode_to_sse_event(mode: str) -> str:
    return mode


+# NOTE(review): ``HumanMessage`` is imported at module level only under
+# TYPE_CHECKING, so this return annotation relies on deferred annotation
+# evaluation (e.g. ``from __future__ import annotations``) — confirm the
+# module enables it, otherwise the ``def`` raises NameError at import time.
+def _extract_human_message(graph_input: dict) -> HumanMessage | None:
+    """Extract or construct a HumanMessage from graph_input for event recording.
+
+    Returns a LangChain HumanMessage so callers can use .model_dump() to get
+    the checkpoint-aligned serialization format.
+
+    Only the ``"messages"`` entry of *graph_input* is inspected. When it is a
+    list, the last element is used; otherwise the value itself is treated as
+    the message. Returns ``None`` when there is no message, when a string or
+    dict message has empty content, or when the message has an unsupported
+    shape.
+    """
+    # Imported locally: the module-level import is type-checking only.
+    from langchain_core.messages import HumanMessage
+
+    messages = graph_input.get("messages")
+    # Missing key, None, empty list and empty string all mean "nothing to record".
+    if not messages:
+        return None
+    # A non-list value is taken to be a single message.
+    last = messages[-1] if isinstance(messages, list) else messages
+    # Already the right type: return the same object, no copy.
+    if isinstance(last, HumanMessage):
+        return last
+    if isinstance(last, str):
+        return HumanMessage(content=last) if last else None
+    # Any other object exposing ``.content`` is re-wrapped as a HumanMessage,
+    # whatever its original class, and without an empty-content check.
+    # NOTE(review): this also relabels non-human message objects — confirm
+    # callers only pass inputs whose last message comes from the user.
+    if hasattr(last, "content"):
+        content = last.content
+        return HumanMessage(content=content)
+    # Dict-style message: only the "content" key is read; empty or missing
+    # content yields None.
+    if isinstance(last, dict):
+        content = last.get("content", "")
+        return HumanMessage(content=content) if content else None
+    return None
+
+
 def _unpack_stream_item(
    item: Any,
    lg_modes: list[str],