diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 11e255777..4732a57ee 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -184,7 +184,17 @@ Setup: Copy `config.example.yaml` to `config.yaml` in the **project root** direc **Config Caching**: `get_app_config()` caches the parsed config, but automatically reloads it when the resolved config path changes or the file's mtime increases. This keeps Gateway and LangGraph reads aligned with `config.yaml` edits without requiring a manual process restart. -**Config Hot-Reload Boundary**: Gateway dependencies route through `get_app_config()` on every request, so per-run fields like `models[*].max_tokens`, `summarization.*`, `title.*`, `memory.*`, `subagents.*`, `tools[*]`, and the agent system prompt pick up `config.yaml` edits on the next message. Infrastructure fields that the gateway captures once at startup are **restart-required**: `database.*`, `checkpointer.*` (including SQLite WAL/journal settings), `run_events.*`, `stream_bridge.*`, `sandbox.use`, `log_level`, and the `channels.*` IM platform credentials. Changing those without restart leaves the live process bound to the old engines. +**Config Hot-Reload Boundary**: Gateway dependencies route through `get_app_config()` on every request, so per-run fields like `models[*].max_tokens`, `summarization.*`, `title.*`, `memory.*`, `subagents.*`, `tools[*]`, and the agent system prompt pick up `config.yaml` edits on the next message. Infrastructure fields that the gateway captures once at startup are **restart-required**: + +| Field | Why a restart is required | +|---|---| +| `database.*` | `init_engine_from_config()` runs once during `langgraph_runtime()` startup; the SQLAlchemy engine holds the connection pool. | +| `checkpointer.*` (including SQLite WAL/journal settings) | `make_checkpointer()` binds the persistent checkpointer once at startup. | +| `run_events.*` | `make_run_event_store()` selects memory- vs. SQL-backed implementation at startup. | +| `stream_bridge.*` | `make_stream_bridge()` constructs the bridge object once. | +| `sandbox.use` | `get_sandbox_provider()` caches the provider singleton (`_default_sandbox_provider`); a new class path takes effect only on next process start. | +| `log_level` | `apply_logging_level()` is called only in `app.py` startup; it mutates the root logger's level, and `get_app_config()` returning a fresh `AppConfig` does not retrigger it. | +| `channels.*` IM platform credentials | `start_channel_service()` is invoked once during startup; live channels are not rebuilt on config change. | Configuration priority: 1. Explicit `config_path` argument diff --git a/backend/app/gateway/deps.py b/backend/app/gateway/deps.py index 51363014b..d81bdb2c9 100644 --- a/backend/app/gateway/deps.py +++ b/backend/app/gateway/deps.py @@ -8,6 +8,7 @@ Initialization is handled directly in ``app.py`` via :class:`AsyncExitStack`. from __future__ import annotations +import logging from collections.abc import AsyncGenerator, Callable from contextlib import AsyncExitStack, asynccontextmanager from typing import TYPE_CHECKING, TypeVar, cast @@ -21,6 +22,8 @@ from deerflow.runtime import RunContext, RunManager, StreamBridge from deerflow.runtime.events.store.base import RunEventStore from deerflow.runtime.runs.store.base import RunStore +logger = logging.getLogger(__name__) + if TYPE_CHECKING: from app.gateway.auth.local_provider import LocalAuthProvider from app.gateway.auth.repositories.sqlite import SQLiteUserRepository @@ -30,7 +33,7 @@ if TYPE_CHECKING: T = TypeVar("T") -def get_config(request: Request) -> AppConfig: # noqa: ARG001 - kept for FastAPI Depends signature +def get_config() -> AppConfig: """Return the freshest ``AppConfig`` for the current request. Routes through :func:`deerflow.config.app_config.get_app_config`, which @@ -41,10 +44,16 @@ def get_config(request: Request) -> AppConfig: # noqa: ARG001 - kept for FastAP Reading from ``get_app_config`` here closes the bytedance/deer-flow issue #3107 BUG-001 split-brain where the worker / lead-agent thread saw a stale startup snapshot. + + Any failure to materialise the config (missing file, permission denied, + YAML parse error, validation error) is reported as 503 — semantically + "the gateway cannot serve requests without a usable configuration" — and + logged with the original exception so operators have something to debug. """ try: return get_app_config() - except FileNotFoundError as exc: + except Exception as exc: # noqa: BLE001 - request boundary: log and degrade gracefully + logger.exception("Failed to load AppConfig at request time") raise HTTPException(status_code=503, detail="Configuration not available") from exc @@ -151,7 +160,7 @@ def get_run_context(request: Request) -> RunContext: Returns a *base* context with infrastructure dependencies. """ - config = get_config(request) + config = get_config() return RunContext( checkpointer=get_checkpointer(request), store=get_store(request), diff --git a/backend/tests/test_gateway_config_freshness.py b/backend/tests/test_gateway_config_freshness.py index 9dc7d05cb..b0d4268af 100644 --- a/backend/tests/test_gateway_config_freshness.py +++ b/backend/tests/test_gateway_config_freshness.py @@ -20,6 +20,7 @@ import pytest from fastapi import Depends, FastAPI from fastapi.testclient import TestClient +from app.gateway import deps as gateway_deps from app.gateway.deps import get_config from deerflow.config.app_config import ( AppConfig, @@ -108,3 +109,35 @@ def test_get_config_respects_test_set_app_config(): app = _build_app() client = TestClient(app) assert client.get("/probe").json() == {"log_level": "warning"} + + +@pytest.mark.parametrize( + "exception", + [ + FileNotFoundError("config.yaml not found"), + PermissionError("config.yaml not readable"), + ValueError("invalid config"), + RuntimeError("yaml parse error"), + ], +) +def test_get_config_returns_503_on_any_load_failure(monkeypatch, exception): + """Any failure to materialise the config must surface as 503, not 500. + + Bytedance/deer-flow issue #3107 BUG-001 review: the original snapshot + contract returned 503 when ``app.state.config is None``. The first cut of + this fix only mapped ``FileNotFoundError`` to 503, which left + ``PermissionError`` / ``yaml.YAMLError`` / ``ValidationError`` etc. bubbling + up as 500. Catch every load failure at the request boundary. + """ + + def _broken_get_app_config(): + raise exception + + monkeypatch.setattr(gateway_deps, "get_app_config", _broken_get_app_config) + + app = _build_app() + client = TestClient(app, raise_server_exceptions=False) + response = client.get("/probe") + + assert response.status_code == 503 + assert response.json() == {"detail": "Configuration not available"}