fix(auth): replace setup-status 429 rate limit with cached response (#2915)

* fix(auth): replace setup-status 429 rate limit with cached response

  The /api/v1/auth/setup-status endpoint had a 60-second cooldown that
  returned HTTP 429 for all but the first request per IP. When the service
  restarted with multiple browser tabs open, all tabs hit this endpoint
  simultaneously from the same source IP, causing a storm of 429 errors
  that blocked the login flow.

  Replace the cooldown-with-429 model with a per-IP response cache that
  returns the previously computed result within the TTL. The database
  query (count_admin_users) still only runs once per IP per 60 seconds,
  preserving the original performance goal while eliminating spurious
  429 errors on multi-tab reconnection.

  Fixes #2902

* fix(auth): address setup-status cache review issues

Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/439a0e8c-8b64-41d4-a3cd-fe9a00eec534

Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com>

* test(auth): improve readability of setup-status concurrency assertion

Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/439a0e8c-8b64-41d4-a3cd-fe9a00eec534

Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>

* fix the unit test error

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
This commit is contained in:
Willem Jiang
2026-05-18 22:07:01 +08:00
committed by GitHub
parent 39f901d3a5
commit b5108e3520
2 changed files with 133 additions and 36 deletions
+54 -20
View File
@@ -1,5 +1,6 @@
"""Authentication endpoints.""" """Authentication endpoints."""
import asyncio
import logging import logging
import os import os
import time import time
@@ -382,9 +383,15 @@ async def get_me(request: Request):
return UserResponse(id=str(user.id), email=user.email, system_role=user.system_role, needs_setup=user.needs_setup) return UserResponse(id=str(user.id), email=user.email, system_role=user.system_role, needs_setup=user.needs_setup)
_SETUP_STATUS_COOLDOWN: dict[str, float] = {} # Per-IP cache: ip → (timestamp, result_dict).
_SETUP_STATUS_COOLDOWN_SECONDS = 60 # Returns the cached result within the TTL instead of 429, because
# the answer (whether an admin exists) rarely changes and returning
# 429 breaks multi-tab / post-restart reconnection storms.
_SETUP_STATUS_CACHE: dict[str, tuple[float, dict]] = {}
_SETUP_STATUS_CACHE_TTL_SECONDS = 60
_MAX_TRACKED_SETUP_STATUS_IPS = 10000 _MAX_TRACKED_SETUP_STATUS_IPS = 10000
_SETUP_STATUS_INFLIGHT: dict[str, asyncio.Task[dict]] = {}
_SETUP_STATUS_INFLIGHT_GUARD = asyncio.Lock()
@router.get("/setup-status") @router.get("/setup-status")
@@ -392,30 +399,57 @@ async def setup_status(request: Request):
"""Check if an admin account exists. Returns needs_setup=True when no admin exists.""" """Check if an admin account exists. Returns needs_setup=True when no admin exists."""
client_ip = _get_client_ip(request) client_ip = _get_client_ip(request)
now = time.time() now = time.time()
last_check = _SETUP_STATUS_COOLDOWN.get(client_ip, 0)
elapsed = now - last_check # Return cached result when within TTL — avoids 429 on multi-tab reconnection.
if elapsed < _SETUP_STATUS_COOLDOWN_SECONDS: cached = _SETUP_STATUS_CACHE.get(client_ip)
retry_after = max(1, int(_SETUP_STATUS_COOLDOWN_SECONDS - elapsed)) if cached is not None:
raise HTTPException( cached_time, cached_result = cached
status_code=status.HTTP_429_TOO_MANY_REQUESTS, if now - cached_time < _SETUP_STATUS_CACHE_TTL_SECONDS:
detail="Setup status check is rate limited", return cached_result
headers={"Retry-After": str(retry_after)},
) async with _SETUP_STATUS_INFLIGHT_GUARD:
# Recheck cache after waiting for the inflight guard.
now = time.time()
cached = _SETUP_STATUS_CACHE.get(client_ip)
if cached is not None:
cached_time, cached_result = cached
if now - cached_time < _SETUP_STATUS_CACHE_TTL_SECONDS:
return cached_result
task = _SETUP_STATUS_INFLIGHT.get(client_ip)
if task is None:
# Evict stale entries when dict grows too large to bound memory usage. # Evict stale entries when dict grows too large to bound memory usage.
if len(_SETUP_STATUS_COOLDOWN) >= _MAX_TRACKED_SETUP_STATUS_IPS: if len(_SETUP_STATUS_CACHE) >= _MAX_TRACKED_SETUP_STATUS_IPS:
cutoff = now - _SETUP_STATUS_COOLDOWN_SECONDS cutoff = now - _SETUP_STATUS_CACHE_TTL_SECONDS
stale = [k for k, t in _SETUP_STATUS_COOLDOWN.items() if t < cutoff] stale = [k for k, (t, _) in _SETUP_STATUS_CACHE.items() if t < cutoff]
for k in stale: for k in stale:
del _SETUP_STATUS_COOLDOWN[k] del _SETUP_STATUS_CACHE[k]
# If still too large after evicting expired entries, remove oldest half. if len(_SETUP_STATUS_CACHE) >= _MAX_TRACKED_SETUP_STATUS_IPS:
if len(_SETUP_STATUS_COOLDOWN) >= _MAX_TRACKED_SETUP_STATUS_IPS: by_time = sorted(_SETUP_STATUS_CACHE.items(), key=lambda entry: entry[1][0])
by_time = sorted(_SETUP_STATUS_COOLDOWN.items(), key=lambda kv: kv[1])
for k, _ in by_time[: len(by_time) // 2]: for k, _ in by_time[: len(by_time) // 2]:
del _SETUP_STATUS_COOLDOWN[k] del _SETUP_STATUS_CACHE[k]
_SETUP_STATUS_COOLDOWN[client_ip] = now
async def _compute_setup_status() -> dict:
admin_count = await get_local_provider().count_admin_users() admin_count = await get_local_provider().count_admin_users()
return {"needs_setup": admin_count == 0} return {"needs_setup": admin_count == 0}
task = asyncio.create_task(_compute_setup_status())
_SETUP_STATUS_INFLIGHT[client_ip] = task
try:
result = await task
finally:
async with _SETUP_STATUS_INFLIGHT_GUARD:
if _SETUP_STATUS_INFLIGHT.get(client_ip) is task:
del _SETUP_STATUS_INFLIGHT[client_ip]
# Cache only the stable "initialized" result to avoid stale setup redirects.
if result["needs_setup"] is False:
_SETUP_STATUS_CACHE[client_ip] = (time.time(), result)
else:
_SETUP_STATUS_CACHE.pop(client_ip, None)
return result
class InitializeAdminRequest(BaseModel): class InitializeAdminRequest(BaseModel):
"""Request model for first-boot admin account creation.""" """Request model for first-boot admin account creation."""
+74 -11
View File
@@ -22,7 +22,7 @@ _TEST_SECRET = "test-secret-key-initialize-admin-min-32"
def _setup_auth(tmp_path): def _setup_auth(tmp_path):
"""Fresh SQLite engine + auth config per test.""" """Fresh SQLite engine + auth config per test."""
from app.gateway import deps from app.gateway import deps
from app.gateway.routers.auth import _SETUP_STATUS_COOLDOWN from app.gateway.routers.auth import _SETUP_STATUS_CACHE, _SETUP_STATUS_INFLIGHT
from deerflow.persistence.engine import close_engine, init_engine from deerflow.persistence.engine import close_engine, init_engine
set_auth_config(AuthConfig(jwt_secret=_TEST_SECRET)) set_auth_config(AuthConfig(jwt_secret=_TEST_SECRET))
@@ -30,13 +30,15 @@ def _setup_auth(tmp_path):
asyncio.run(init_engine("sqlite", url=url, sqlite_dir=str(tmp_path))) asyncio.run(init_engine("sqlite", url=url, sqlite_dir=str(tmp_path)))
deps._cached_local_provider = None deps._cached_local_provider = None
deps._cached_repo = None deps._cached_repo = None
_SETUP_STATUS_COOLDOWN.clear() _SETUP_STATUS_CACHE.clear()
_SETUP_STATUS_INFLIGHT.clear()
try: try:
yield yield
finally: finally:
deps._cached_local_provider = None deps._cached_local_provider = None
deps._cached_repo = None deps._cached_repo = None
_SETUP_STATUS_COOLDOWN.clear() _SETUP_STATUS_CACHE.clear()
_SETUP_STATUS_INFLIGHT.clear()
asyncio.run(close_engine()) asyncio.run(close_engine())
@@ -168,15 +170,76 @@ def test_setup_status_false_when_only_regular_user_exists(client):
assert resp.json()["needs_setup"] is True assert resp.json()["needs_setup"] is True
def test_setup_status_rate_limited_on_second_call(client): def test_setup_status_returns_cached_result_on_rapid_calls(client):
"""Second /setup-status call within the cooldown window returns 429 with Retry-After.""" """Rapid /setup-status calls return the cached result (200) instead of 429."""
# First call succeeds. client.post("/api/v1/auth/initialize", json=_init_payload())
# First call succeeds and computes the result.
resp1 = client.get("/api/v1/auth/setup-status") resp1 = client.get("/api/v1/auth/setup-status")
assert resp1.status_code == 200 assert resp1.status_code == 200
# Immediate second call is rate-limited. # Immediate second call returns cached result, not 429.
resp2 = client.get("/api/v1/auth/setup-status") resp2 = client.get("/api/v1/auth/setup-status")
assert resp2.status_code == 429 assert resp2.status_code == 200
assert "Retry-After" in resp2.headers assert resp2.json() == resp1.json()
retry_after = int(resp2.headers["Retry-After"]) assert resp2.json()["needs_setup"] is False
assert 1 <= retry_after <= 60
def test_setup_status_does_not_return_stale_true_after_initialize(client):
"""A pre-initialize setup-status response should not stay cached as True."""
before = client.get("/api/v1/auth/setup-status")
assert before.status_code == 200
assert before.json()["needs_setup"] is True
init = client.post("/api/v1/auth/initialize", json=_init_payload())
assert init.status_code == 201
after = client.get("/api/v1/auth/setup-status")
assert after.status_code == 200
assert after.json()["needs_setup"] is False
@pytest.mark.asyncio
async def test_setup_status_single_flight_per_ip(monkeypatch):
"""Concurrent requests from same IP share one in-flight DB query."""
from starlette.requests import Request
from app.gateway.routers.auth import (
_SETUP_STATUS_CACHE,
_SETUP_STATUS_INFLIGHT,
setup_status,
)
class _Provider:
def __init__(self):
self.calls = 0
async def count_admin_users(self):
self.calls += 1
await asyncio.sleep(0.05)
return 0
provider = _Provider()
monkeypatch.setattr("app.gateway.routers.auth.get_local_provider", lambda: provider)
_SETUP_STATUS_CACHE.clear()
_SETUP_STATUS_INFLIGHT.clear()
def _request() -> Request:
return Request(
{
"type": "http",
"method": "GET",
"path": "/api/v1/auth/setup-status",
"headers": [],
"client": ("127.0.0.1", 12345),
}
)
results = await asyncio.gather(
setup_status(_request()),
setup_status(_request()),
setup_status(_request()),
)
assert all(result["needs_setup"] is True for result in results)
assert provider.calls == 1