fix(auth): replace setup-status 429 rate limit with cached response (#2915)

* fix(auth): replace setup-status 429 rate limit with cached response The /api/v1/auth/setup-status endpoint had a 60-second cooldown that returned HTTP 429 for all but the first request per IP. When the service restarted with multiple browser tabs open, all tabs hit this endpoint simultaneously from the same source IP, causing a storm of 429 errors that blocked the login flow. Replace the cooldown-with-429 model with a per-IP response cache that returns the previously computed result within the TTL. The database query (count_admin_users) still only runs once per IP per 60 seconds, preserving the original performance goal while eliminating spurious 429 errors on multi-tab reconnection. Fixes #2902 * fix(auth): address setup-status cache review issues Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/439a0e8c-8b64-41d4-a3cd-fe9a00eec534 Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com> * test(auth): improve readability of setup-status concurrency assertion Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/439a0e8c-8b64-41d4-a3cd-fe9a00eec534 Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> * fix the unit test error --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
2026-05-18 22:07:01 +08:00
parent 39f901d3a5
commit b5108e3520
2 changed files with 133 additions and 36 deletions
@@ -1,5 +1,6 @@
 """Authentication endpoints."""

+import asyncio
 import logging
 import os
 import time
@@ -382,9 +383,15 @@ async def get_me(request: Request):
    return UserResponse(id=str(user.id), email=user.email, system_role=user.system_role, needs_setup=user.needs_setup)


-_SETUP_STATUS_COOLDOWN: dict[str, float] = {}
-_SETUP_STATUS_COOLDOWN_SECONDS = 60
+# Per-IP cache: ip → (timestamp, result_dict).
+# Returns the cached result within the TTL instead of 429, because
+# the answer (whether an admin exists) rarely changes and returning
+# 429 breaks multi-tab / post-restart reconnection storms.
+_SETUP_STATUS_CACHE: dict[str, tuple[float, dict]] = {}
+_SETUP_STATUS_CACHE_TTL_SECONDS = 60
 _MAX_TRACKED_SETUP_STATUS_IPS = 10000
+_SETUP_STATUS_INFLIGHT: dict[str, asyncio.Task[dict]] = {}
+_SETUP_STATUS_INFLIGHT_GUARD = asyncio.Lock()


@router.get("/setup-status")
@@ -392,29 +399,56 @@ async def setup_status(request: Request):
    """Check if an admin account exists. Returns needs_setup=True when no admin exists."""
    client_ip = _get_client_ip(request)
    now = time.time()
-    last_check = _SETUP_STATUS_COOLDOWN.get(client_ip, 0)
-    elapsed = now - last_check
-    if elapsed < _SETUP_STATUS_COOLDOWN_SECONDS:
-        retry_after = max(1, int(_SETUP_STATUS_COOLDOWN_SECONDS - elapsed))
-        raise HTTPException(
-            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
-            detail="Setup status check is rate limited",
-            headers={"Retry-After": str(retry_after)},
-        )
-    # Evict stale entries when dict grows too large to bound memory usage.
-    if len(_SETUP_STATUS_COOLDOWN) >= _MAX_TRACKED_SETUP_STATUS_IPS:
-        cutoff = now - _SETUP_STATUS_COOLDOWN_SECONDS
-        stale = [k for k, t in _SETUP_STATUS_COOLDOWN.items() if t < cutoff]
-        for k in stale:
-            del _SETUP_STATUS_COOLDOWN[k]
-        # If still too large after evicting expired entries, remove oldest half.
-        if len(_SETUP_STATUS_COOLDOWN) >= _MAX_TRACKED_SETUP_STATUS_IPS:
-            by_time = sorted(_SETUP_STATUS_COOLDOWN.items(), key=lambda kv: kv[1])
-            for k, _ in by_time[: len(by_time) // 2]:
-                del _SETUP_STATUS_COOLDOWN[k]
-    _SETUP_STATUS_COOLDOWN[client_ip] = now
-    admin_count = await get_local_provider().count_admin_users()
-    return {"needs_setup": admin_count == 0}
+
+    # Return cached result when within TTL — avoids 429 on multi-tab reconnection.
+    cached = _SETUP_STATUS_CACHE.get(client_ip)
+    if cached is not None:
+        cached_time, cached_result = cached
+        if now - cached_time < _SETUP_STATUS_CACHE_TTL_SECONDS:
+            return cached_result
+
+    async with _SETUP_STATUS_INFLIGHT_GUARD:
+        # Recheck cache after waiting for the inflight guard.
+        now = time.time()
+        cached = _SETUP_STATUS_CACHE.get(client_ip)
+        if cached is not None:
+            cached_time, cached_result = cached
+            if now - cached_time < _SETUP_STATUS_CACHE_TTL_SECONDS:
+                return cached_result
+
+        task = _SETUP_STATUS_INFLIGHT.get(client_ip)
+        if task is None:
+            # Evict stale entries when dict grows too large to bound memory usage.
+            if len(_SETUP_STATUS_CACHE) >= _MAX_TRACKED_SETUP_STATUS_IPS:
+                cutoff = now - _SETUP_STATUS_CACHE_TTL_SECONDS
+                stale = [k for k, (t, _) in _SETUP_STATUS_CACHE.items() if t < cutoff]
+                for k in stale:
+                    del _SETUP_STATUS_CACHE[k]
+                if len(_SETUP_STATUS_CACHE) >= _MAX_TRACKED_SETUP_STATUS_IPS:
+                    by_time = sorted(_SETUP_STATUS_CACHE.items(), key=lambda entry: entry[1][0])
+                    for k, _ in by_time[: len(by_time) // 2]:
+                        del _SETUP_STATUS_CACHE[k]
+
+            async def _compute_setup_status() -> dict:
+                admin_count = await get_local_provider().count_admin_users()
+                return {"needs_setup": admin_count == 0}
+
+            task = asyncio.create_task(_compute_setup_status())
+            _SETUP_STATUS_INFLIGHT[client_ip] = task
+
+    try:
+        result = await task
+    finally:
+        async with _SETUP_STATUS_INFLIGHT_GUARD:
+            if _SETUP_STATUS_INFLIGHT.get(client_ip) is task:
+                del _SETUP_STATUS_INFLIGHT[client_ip]
+
+    # Cache only the stable "initialized" result to avoid stale setup redirects.
+    if result["needs_setup"] is False:
+        _SETUP_STATUS_CACHE[client_ip] = (time.time(), result)
+    else:
+        _SETUP_STATUS_CACHE.pop(client_ip, None)
+    return result


 class InitializeAdminRequest(BaseModel):
@@ -22,7 +22,7 @@ _TEST_SECRET = "test-secret-key-initialize-admin-min-32"
 def _setup_auth(tmp_path):
    """Fresh SQLite engine + auth config per test."""
    from app.gateway import deps
-    from app.gateway.routers.auth import _SETUP_STATUS_COOLDOWN
+    from app.gateway.routers.auth import _SETUP_STATUS_CACHE, _SETUP_STATUS_INFLIGHT
    from deerflow.persistence.engine import close_engine, init_engine

    set_auth_config(AuthConfig(jwt_secret=_TEST_SECRET))
@@ -30,13 +30,15 @@ def _setup_auth(tmp_path):
    asyncio.run(init_engine("sqlite", url=url, sqlite_dir=str(tmp_path)))
    deps._cached_local_provider = None
    deps._cached_repo = None
-    _SETUP_STATUS_COOLDOWN.clear()
+    _SETUP_STATUS_CACHE.clear()
+    _SETUP_STATUS_INFLIGHT.clear()
    try:
        yield
    finally:
        deps._cached_local_provider = None
        deps._cached_repo = None
-        _SETUP_STATUS_COOLDOWN.clear()
+        _SETUP_STATUS_CACHE.clear()
+        _SETUP_STATUS_INFLIGHT.clear()
        asyncio.run(close_engine())


@@ -168,15 +170,76 @@ def test_setup_status_false_when_only_regular_user_exists(client):
    assert resp.json()["needs_setup"] is True


-def test_setup_status_rate_limited_on_second_call(client):
-    """Second /setup-status call within the cooldown window returns 429 with Retry-After."""
-    # First call succeeds.
+def test_setup_status_returns_cached_result_on_rapid_calls(client):
+    """Rapid /setup-status calls return the cached result (200) instead of 429."""
+    client.post("/api/v1/auth/initialize", json=_init_payload())
+
+    # First call succeeds and computes the result.
    resp1 = client.get("/api/v1/auth/setup-status")
    assert resp1.status_code == 200

-    # Immediate second call is rate-limited.
+    # Immediate second call returns cached result, not 429.
    resp2 = client.get("/api/v1/auth/setup-status")
-    assert resp2.status_code == 429
-    assert "Retry-After" in resp2.headers
-    retry_after = int(resp2.headers["Retry-After"])
-    assert 1 <= retry_after <= 60
+    assert resp2.status_code == 200
+    assert resp2.json() == resp1.json()
+    assert resp2.json()["needs_setup"] is False
+
+
+def test_setup_status_does_not_return_stale_true_after_initialize(client):
+    """A pre-initialize setup-status response should not stay cached as True."""
+    before = client.get("/api/v1/auth/setup-status")
+    assert before.status_code == 200
+    assert before.json()["needs_setup"] is True
+
+    init = client.post("/api/v1/auth/initialize", json=_init_payload())
+    assert init.status_code == 201
+
+    after = client.get("/api/v1/auth/setup-status")
+    assert after.status_code == 200
+    assert after.json()["needs_setup"] is False
+
+
+@pytest.mark.asyncio
+async def test_setup_status_single_flight_per_ip(monkeypatch):
+    """Concurrent requests from same IP share one in-flight DB query."""
+    from starlette.requests import Request
+
+    from app.gateway.routers.auth import (
+        _SETUP_STATUS_CACHE,
+        _SETUP_STATUS_INFLIGHT,
+        setup_status,
+    )
+
+    class _Provider:
+        def __init__(self):
+            self.calls = 0
+
+        async def count_admin_users(self):
+            self.calls += 1
+            await asyncio.sleep(0.05)
+            return 0
+
+    provider = _Provider()
+    monkeypatch.setattr("app.gateway.routers.auth.get_local_provider", lambda: provider)
+    _SETUP_STATUS_CACHE.clear()
+    _SETUP_STATUS_INFLIGHT.clear()
+
+    def _request() -> Request:
+        return Request(
+            {
+                "type": "http",
+                "method": "GET",
+                "path": "/api/v1/auth/setup-status",
+                "headers": [],
+                "client": ("127.0.0.1", 12345),
+            }
+        )
+
+    results = await asyncio.gather(
+        setup_status(_request()),
+        setup_status(_request()),
+        setup_status(_request()),
+    )
+
+    assert all(result["needs_setup"] is True for result in results)
+    assert provider.calls == 1