fix(security): harden auth system and fix run journal logic bug (#2593)

* fix(security): harden auth system and fix run journal logic bug - Fix inverted condition in RunJournal.on_chat_model_start that prevented first human message capture (not messages → messages) - Pre-hash passwords with SHA-256 before bcrypt to avoid silent 72-byte truncation vulnerability - Move load_dotenv() from module scope into get_auth_config() to prevent import-time os.environ mutation breaking test isolation - Return generic ‘Invalid token’ instead of exposing specific error variants (expired, malformed, invalid_signature) to clients - Make @require_auth independently enforce 401 instead of silently passing through when AuthMiddleware is absent - Rate-limit /setup-status endpoint with per-IP cooldown to mitigate initialization-state information leak - Document in-process rate limiter limitation for multi-worker deployments * fix(security): return 429+Retry-After on setup-status rate limit, bound cooldown dict Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/070d0be8-99a5-46c8-85bb-6b81b5284021 Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com> * fix(security): add versioned password hashes with auto-migration on login The SHA-256 pre-hash change silently broke verification for any existing bcrypt-only password hashes. Introduce a $dfv<N>$ prefix scheme so hashes are self-describing: - v2 (current): bcrypt(b64(sha256(password))) with $dfv2$ prefix - v1 (legacy): plain bcrypt, prefixed $dfv1$ or bare (no prefix) verify_password auto-detects the version and falls back to v1 for older hashes. LocalAuthProvider.authenticate() now rehashes legacy hashes to v2 on successful login via needs_rehash(), so existing users upgrade transparently without a dedicated migration step. 
* fix(auth): harden verify_password, best-effort rehash, update require_auth docstring, downgrade journal logging - password.py: wrap bcrypt.checkpw in try/except → return False for malformed/corrupt hashes instead of crashing - local_provider.py: wrap auto-rehash update_user() in try/except so transient DB errors don't fail valid logins - authz.py: update require_auth docstring to reflect independent 401 enforcement - journal.py: downgrade on_chat_model_start from INFO to DEBUG, log only metadata (batch_count, message_counts) instead of full serialized/messages content Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/48c5cf31-a4ab-418a-982a-6343c37bb299 Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com> * fix(auth): address code review - narrow ValueError catch, add rehash warning log, rename num_batches Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/48c5cf31-a4ab-418a-982a-6343c37bb299 Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
2026-05-23 00:16:48 +00:00 · 2026-04-28 11:34:07 +08:00
parent b8bc4826d8
commit 4e4e4f92a0
10 changed files with 245 additions and 22 deletions
@@ -4,11 +4,8 @@ import logging
 import os
 import secrets

-from dotenv import load_dotenv
 from pydantic import BaseModel, Field

-load_dotenv()
-
 logger = logging.getLogger(__name__)


@@ -37,6 +34,9 @@ def get_auth_config() -> AuthConfig:
    """Get the global AuthConfig instance. Parses from env on first call."""
    global _auth_config
    if _auth_config is None:
+        from dotenv import load_dotenv
+
+        load_dotenv()
        jwt_secret = os.environ.get("AUTH_JWT_SECRET")
        if not jwt_secret:
            jwt_secret = secrets.token_urlsafe(32)
@@ -1,10 +1,14 @@
 """Local email/password authentication provider."""

+import logging
+
 from app.gateway.auth.models import User
-from app.gateway.auth.password import hash_password_async, verify_password_async
+from app.gateway.auth.password import hash_password_async, needs_rehash, verify_password_async
 from app.gateway.auth.providers import AuthProvider
 from app.gateway.auth.repositories.base import UserRepository

+logger = logging.getLogger(__name__)
+

 class LocalAuthProvider(AuthProvider):
    """Email/password authentication provider using local database."""
@@ -43,6 +47,15 @@ class LocalAuthProvider(AuthProvider):
        if not await verify_password_async(password, user.password_hash):
            return None

+        if needs_rehash(user.password_hash):
+            try:
+                user.password_hash = await hash_password_async(password)
+                await self._repo.update_user(user)
+            except Exception:
+                # Rehash is an opportunistic upgrade; a transient DB error must not
+                # prevent an otherwise-valid login from succeeding.
+                logger.warning("Failed to rehash password for user %s; login will still succeed", user.email, exc_info=True)
+
        return user

    async def get_user(self, user_id: str) -> User | None:
@@ -1,18 +1,66 @@
-"""Password hashing utilities using bcrypt directly."""
+"""Password hashing utilities with versioned hash format.
+
+Hash format: ``$dfv<N>$<bcrypt_hash>`` where ``<N>`` is the version.
+
+- **v1** (legacy): ``bcrypt(password)`` — plain bcrypt, susceptible to
+  72-byte silent truncation.
+- **v2** (current): ``bcrypt(b64(sha256(password)))`` — SHA-256 pre-hash
+  avoids the 72-byte truncation limit so the full password contributes
+  to the hash.
+
+Verification auto-detects the version and falls back to v1 for hashes
+without a prefix, so existing deployments upgrade transparently on next
+login.
+"""

 import asyncio
+import base64
+import hashlib

 import bcrypt

+_CURRENT_VERSION = 2
+_PREFIX_V2 = "$dfv2$"
+_PREFIX_V1 = "$dfv1$"
+
+
+def _pre_hash_v2(password: str) -> bytes:
+    """SHA-256 pre-hash to bypass bcrypt's 72-byte limit."""
+    return base64.b64encode(hashlib.sha256(password.encode("utf-8")).digest())
+

 def hash_password(password: str) -> str:
-    """Hash a password using bcrypt."""
-    return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode("utf-8")
+    """Hash a password (current version: v2 — SHA-256 + bcrypt)."""
+    raw = bcrypt.hashpw(_pre_hash_v2(password), bcrypt.gensalt()).decode("utf-8")
+    return f"{_PREFIX_V2}{raw}"


 def verify_password(plain_password: str, hashed_password: str) -> bool:
-    """Verify a password against its hash."""
-    return bcrypt.checkpw(plain_password.encode("utf-8"), hashed_password.encode("utf-8"))
+    """Verify a password, auto-detecting the hash version.
+
+    Accepts v2 (``$dfv2$…``), v1 (``$dfv1$…``), and bare bcrypt hashes
+    (treated as v1 for backward compatibility with pre-versioning data).
+    """
+    try:
+        if hashed_password.startswith(_PREFIX_V2):
+            bcrypt_hash = hashed_password[len(_PREFIX_V2) :]
+            return bcrypt.checkpw(_pre_hash_v2(plain_password), bcrypt_hash.encode("utf-8"))
+
+        if hashed_password.startswith(_PREFIX_V1):
+            bcrypt_hash = hashed_password[len(_PREFIX_V1) :]
+        else:
+            bcrypt_hash = hashed_password
+
+        return bcrypt.checkpw(plain_password.encode("utf-8"), bcrypt_hash.encode("utf-8"))
+    except ValueError:
+        # bcrypt raises ValueError for malformed or corrupt hashes (e.g., invalid salt).
+        # Fail closed rather than crashing the request.
+        return False
+
+
+def needs_rehash(hashed_password: str) -> bool:
+    """Return True if the hash uses an older version and should be rehashed."""
+    return not hashed_password.startswith(_PREFIX_V2)


 async def hash_password_async(password: str) -> str:
@@ -145,7 +145,11 @@ async def _authenticate(request: Request) -> AuthContext:


 def require_auth[**P, T](func: Callable[P, T]) -> Callable[P, T]:
-    """Decorator that authenticates the request and sets AuthContext.
+    """Decorator that authenticates the request and enforces authentication.
+
+    Independently raises HTTP 401 for unauthenticated requests, regardless of
+    whether ``AuthMiddleware`` is present in the ASGI stack. Sets the resolved
+    ``AuthContext`` on ``request.state.auth`` for downstream handlers.

    Must be placed ABOVE other decorators (executes after them).

@@ -158,7 +162,8 @@ def require_auth[**P, T](func: Callable[P, T]) -> Callable[P, T]:
            ...

    Raises:
-        ValueError: If 'request' parameter is missing
+        HTTPException: 401 if the request is unauthenticated.
+        ValueError: If 'request' parameter is missing.
    """

    @functools.wraps(func)
@@ -181,6 +186,9 @@ def require_auth[**P, T](func: Callable[P, T]) -> Callable[P, T]:
        auth_context = await _authenticate(request)
        request.state.auth = auth_context

+        if not auth_context.is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication required")
+
        return await func(*args, **kwargs)

    return wrapper
@@ -73,7 +73,7 @@ async def authenticate(request):
    if isinstance(payload, TokenError):
        raise Auth.exceptions.HTTPException(
            status_code=401,
-            detail=f"Token error: {payload.value}",
+            detail="Invalid token",
        )

    user = await get_local_provider().get_user(payload.sub)
@@ -146,7 +146,13 @@ def _set_session_cookie(response: Response, token: str, request: Request) -> Non


 # ── Rate Limiting ────────────────────────────────────────────────────────
-# In-process dict — not shared across workers. Sufficient for single-worker deployments.
+# In-process dict — not shared across workers.
+#
+# **Limitation**: with multi-worker deployments (e.g., gunicorn -w N), each
+# worker maintains its own lockout table, so an attacker effectively gets
+# N × _MAX_LOGIN_ATTEMPTS guesses before being locked out everywhere. For
+# production multi-worker setups, replace this with a shared store (Redis,
+# database-backed counter) to enforce a true per-IP limit.

 _MAX_LOGIN_ATTEMPTS = 5
 _LOCKOUT_SECONDS = 300  # 5 minutes
@@ -376,9 +382,37 @@ async def get_me(request: Request):
    return UserResponse(id=str(user.id), email=user.email, system_role=user.system_role, needs_setup=user.needs_setup)


+_SETUP_STATUS_COOLDOWN: dict[str, float] = {}
+_SETUP_STATUS_COOLDOWN_SECONDS = 60
+_MAX_TRACKED_SETUP_STATUS_IPS = 10000
+
+
@router.get("/setup-status")
-async def setup_status():
+async def setup_status(request: Request):
    """Check if an admin account exists. Returns needs_setup=True when no admin exists."""
+    client_ip = _get_client_ip(request)
+    now = time.time()
+    last_check = _SETUP_STATUS_COOLDOWN.get(client_ip, 0)
+    elapsed = now - last_check
+    if elapsed < _SETUP_STATUS_COOLDOWN_SECONDS:
+        retry_after = max(1, int(_SETUP_STATUS_COOLDOWN_SECONDS - elapsed))
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail="Setup status check is rate limited",
+            headers={"Retry-After": str(retry_after)},
+        )
+    # Evict stale entries when dict grows too large to bound memory usage.
+    if len(_SETUP_STATUS_COOLDOWN) >= _MAX_TRACKED_SETUP_STATUS_IPS:
+        cutoff = now - _SETUP_STATUS_COOLDOWN_SECONDS
+        stale = [k for k, t in _SETUP_STATUS_COOLDOWN.items() if t < cutoff]
+        for k in stale:
+            del _SETUP_STATUS_COOLDOWN[k]
+        # If still too large after evicting expired entries, remove oldest half.
+        if len(_SETUP_STATUS_COOLDOWN) >= _MAX_TRACKED_SETUP_STATUS_IPS:
+            by_time = sorted(_SETUP_STATUS_COOLDOWN.items(), key=lambda kv: kv[1])
+            for k, _ in by_time[: len(by_time) // 2]:
+                del _SETUP_STATUS_COOLDOWN[k]
+    _SETUP_STATUS_COOLDOWN[client_ip] = now
    admin_count = await get_local_provider().count_admin_users()
    return {"needs_setup": admin_count == 0}