mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-10 17:35:57 +00:00
fix(middleware): offload memory injection off event loop to prevent tiktoken blocking (#3402) (#3411)
* fix(middleware): offload memory injection off event loop to prevent tiktoken blocking (#3402) DynamicContextMiddleware.abefore_agent() called _inject() synchronously on the asyncio event loop. The first time memory is injected (second request), _inject() → format_memory_for_injection() → _count_tokens() → tiktoken.get_encoding("cl100k_base") needs to download the BPE data from openaipublic.blob.core.windows.net. In network-restricted environments this download blocks until the OS TCP timeout (~26 min), starving ALL concurrent handlers including /api/v1/auth/me. Fix: - abefore_agent now uses asyncio.to_thread(self._inject, state) so file I/O and tiktoken never block the event loop. - Extract _get_tiktoken_encoding() with a module-level cache so tiktoken.get_encoding() is called at most once per encoding name. - Add warm_tiktoken_cache() startup helper; gateway lifespan pre-warms the cache via asyncio.to_thread so the first request never triggers a cold download. - _count_tokens falls back to len(text) // 4 on any encoding failure. Tests: - tests/test_tiktoken_cache_and_count_tokens.py (12 tests): cache hit/miss, fallback paths, warm-up helper. - tests/blocking_io/test_dynamic_context_middleware.py (2 tests): Blockbuster gate verifies abefore_agent does not block the event loop; async/sync parity check. Fixes #3402 * Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> * fix the lint error * fix(memory): use future annotations to avoid NameError when tiktoken is absent Add `from __future__ import annotations` to prompt.py so that tiktoken.Encoding type hints are never evaluated at runtime. Without this, environments where tiktoken is not installed could raise NameError on the module-level cache and function return annotations. Addresses Copilot review comment on PR #3411. * fix(middleware): bound abefore_agent injection with timeout to prevent hung requests Wrap the asyncio.to_thread(self._inject) offload in asyncio.wait_for() with a 5-second cap. If the startup warm-up failed silently (e.g. network blip during deploy), a cold tiktoken BPE download on the first request can block until the OS TCP timeout (~26 min). The bounded timeout ensures the request degrades gracefully (no memory/date context for that turn) rather than hanging. Adds test_abefore_agent_returns_none_on_timeout to the blocking-IO regression anchors. Addresses review feedback from xg-gh-25 on PR #3411. --------- Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -1,9 +1,14 @@
|
||||
"""Prompt templates for memory update and injection."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import tiktoken
|
||||
|
||||
@@ -160,6 +165,39 @@ Rules:
|
||||
Return ONLY valid JSON."""
|
||||
|
||||
|
||||
# Module-level tiktoken encoding cache. Populated lazily on first use;
|
||||
# subsequent calls are a dict lookup (no network I/O). Pre-warming at
|
||||
# startup via :func:`warm_tiktoken_cache` avoids blocking a request on the
|
||||
# (potentially slow) first ``get_encoding`` call.
|
||||
_tiktoken_encoding_cache: dict[str, tiktoken.Encoding] = {}
|
||||
|
||||
|
||||
def _get_tiktoken_encoding(encoding_name: str = "cl100k_base") -> tiktoken.Encoding | None:
|
||||
"""Return a cached tiktoken encoding, or ``None`` on failure / unavailability.
|
||||
|
||||
On the very first call for a given *encoding_name*, tiktoken may need to
|
||||
download the BPE data from ``openaipublic.blob.core.windows.net``. In
|
||||
network-restricted environments (e.g. deployments behind the GFW) this
|
||||
download can block for tens of minutes before the OS TCP timeout kicks in.
|
||||
The caller must therefore be prepared for this to block and should run it
|
||||
off the event loop (e.g. via ``asyncio.to_thread``).
|
||||
"""
|
||||
if not TIKTOKEN_AVAILABLE:
|
||||
return None
|
||||
|
||||
cached = _tiktoken_encoding_cache.get(encoding_name)
|
||||
if cached is not None:
|
||||
return cached
|
||||
|
||||
try:
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
_tiktoken_encoding_cache[encoding_name] = encoding
|
||||
return encoding
|
||||
except Exception:
|
||||
logger.warning("Failed to load tiktoken encoding %r; falling back to char-based estimation", encoding_name, exc_info=True)
|
||||
return None
|
||||
|
||||
|
||||
def _count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
|
||||
"""Count tokens in text using tiktoken.
|
||||
|
||||
@@ -170,18 +208,30 @@ def _count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
|
||||
Returns:
|
||||
The number of tokens in the text.
|
||||
"""
|
||||
if not TIKTOKEN_AVAILABLE:
|
||||
encoding = _get_tiktoken_encoding(encoding_name)
|
||||
if encoding is None:
|
||||
# Fallback to character-based estimation if tiktoken is not available
|
||||
# or the encoding failed to load.
|
||||
return len(text) // 4
|
||||
|
||||
try:
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
return len(encoding.encode(text))
|
||||
except Exception:
|
||||
# Fallback to character-based estimation on error
|
||||
return len(text) // 4
|
||||
|
||||
|
||||
def warm_tiktoken_cache() -> bool:
|
||||
"""Pre-warm the tiktoken encoding cache.
|
||||
|
||||
Call at startup (off the event loop) so the first request never blocks
|
||||
on the BPE download. Returns ``True`` if the encoding was loaded
|
||||
successfully (or was already cached), ``False`` if tiktoken is
|
||||
unavailable or the download failed.
|
||||
"""
|
||||
return _get_tiktoken_encoding("cl100k_base") is not None
|
||||
|
||||
|
||||
def _coerce_confidence(value: Any, default: float = 0.0) -> float:
|
||||
"""Coerce a confidence-like value to a bounded float in [0, 1].
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ Date-update format:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
@@ -43,6 +44,12 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Upper bound (seconds) for a single _inject() offload. If the warm-up at
|
||||
# gateway startup failed silently, the first request may still hit a cold
|
||||
# tiktoken BPE download that blocks until the OS TCP timeout (~26 min).
|
||||
# This cap ensures the request degrades gracefully instead of hanging.
|
||||
_INJECT_TIMEOUT_SECONDS = 5.0
|
||||
|
||||
_DATE_RE = re.compile(r"<current_date>([^<]+)</current_date>")
|
||||
_DYNAMIC_CONTEXT_REMINDER_KEY = "dynamic_context_reminder"
|
||||
_SUMMARY_MESSAGE_NAME = "summary"
|
||||
@@ -201,4 +208,25 @@ class DynamicContextMiddleware(AgentMiddleware):
|
||||
|
||||
@override
|
||||
async def abefore_agent(self, state, runtime: Runtime) -> dict | None:
|
||||
return self._inject(state)
|
||||
# _inject() performs synchronous file I/O (memory JSON loading) and
|
||||
# potentially blocking network calls (tiktoken encoding download on
|
||||
# first use). Offload to a thread so the event loop is never blocked
|
||||
# — a blocking call here starves all concurrent HTTP handlers (auth,
|
||||
# SSE heartbeats, etc.). See issue #3402.
|
||||
#
|
||||
# Bounded timeout: if startup warm-up failed silently (e.g. network
|
||||
# blip during deploy), the first request's cold tiktoken download can
|
||||
# block for tens of minutes (OS TCP timeout). Time-box injection so
|
||||
# the request degrades gracefully (no memory context) rather than
|
||||
# hanging.
|
||||
try:
|
||||
return await asyncio.wait_for(
|
||||
asyncio.to_thread(self._inject, state),
|
||||
timeout=_INJECT_TIMEOUT_SECONDS,
|
||||
)
|
||||
except TimeoutError:
|
||||
logger.warning(
|
||||
"DynamicContextMiddleware: injection timed out (%.1fs); skipping memory/date injection for this turn",
|
||||
_INJECT_TIMEOUT_SECONDS,
|
||||
)
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user