feat(memory): add memory.token_counting config to avoid tiktoken network dependency (#3429) (#3465)

* feat(memory): add memory.token_counting config to avoid tiktoken network dependency (#3429)

  Add a `memory.token_counting` option (`tiktoken` | `char`) so deployments in network-restricted environments can opt out of tiktoken entirely. In `char` mode the memory-injection budget uses a network-free character-based estimate and never triggers the BPE download from openaipublic.blob.core.windows.net, which could otherwise block for tens of minutes (see #3402).

  Also harden the default `tiktoken` path:
  - cache an in-flight LOADING sentinel so concurrent callers fall back immediately instead of spawning more blocking get_encoding threads when the first load is still running (e.g. under the 5s startup warm-up timeout);
  - cache failures with a timestamp and retry after a cooldown so a transient network outage self-heals back to accurate counting without a restart;
  - skip startup warm-up entirely in char mode.

  The new config is surfaced via the memory config API and config.example.yaml (config_version bumped). Default remains `tiktoken`, so existing deployments are unaffected.

* fix(memory): use CJK-aware char token estimate and address review feedback

  - Replace the flat len(text)//4 fallback with a CJK-aware estimate so Chinese/Japanese/Korean memory content does not over-fill the injection budget
  - Document the internal tiktoken retry cooldown and char-mode escape hatch
  - Sync CLAUDE.md / config.example.yaml / MEMORY_IMPROVEMENTS.md wording
  - Fix MemoryConfigResponse mocks/assertions and add CJK estimate tests
2026-06-11 01:45:58 +00:00 · 2026-06-10 23:26:15 +08:00
parent ba9cc5e972
commit 167ef4512f
13 changed files with 364 additions and 43 deletions
@@ -184,21 +184,27 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    # Pre-warm tiktoken encoding cache so the first memory-injection request
    # never blocks on the BPE data download (which hits an OpenAI/Azure URL
    # that may be unreachable in restricted networks — see issue #3402).
-    try:
-        from deerflow.agents.memory.prompt import warm_tiktoken_cache
+    # When memory.token_counting is "char", token counting never touches
+    # tiktoken, so skip the warm-up entirely (avoids even the 5s probe in
+    # network-restricted deployments — see issue #3429).
+    if startup_config.memory.token_counting == "char":
+        logger.info("memory.token_counting='char'; skipping tiktoken warm-up (network-free token estimation)")
+    else:
+        try:
+            from deerflow.agents.memory.prompt import warm_tiktoken_cache

-        warmed = await asyncio.wait_for(
-            asyncio.to_thread(warm_tiktoken_cache),
-            timeout=5,
-        )
-        if warmed:
-            logger.info("tiktoken encoding cache warmed successfully")
-        else:
-            logger.warning("tiktoken encoding cache warm-up failed; token counting will use character-based fallback")
-    except TimeoutError:
-        logger.warning("tiktoken encoding cache warm-up timed out; token counting will use character-based fallback")
-    except Exception:
-        logger.warning("tiktoken warm-up skipped", exc_info=True)
+            warmed = await asyncio.wait_for(
+                asyncio.to_thread(warm_tiktoken_cache),
+                timeout=5,
+            )
+            if warmed:
+                logger.info("tiktoken encoding cache warmed successfully")
+            else:
+                logger.warning("tiktoken encoding cache warm-up failed; token counting will use character-based fallback until tiktoken loads successfully")
+        except TimeoutError:
+            logger.warning("tiktoken encoding cache warm-up timed out; token counting will use character-based fallback until tiktoken loads successfully")
+        except Exception:
+            logger.warning("tiktoken warm-up skipped", exc_info=True)

    # Initialize LangGraph runtime components (StreamBridge, RunManager, checkpointer, store)
    async with langgraph_runtime(app, startup_config):
@@ -98,6 +98,7 @@ class MemoryConfigResponse(BaseModel):
    fact_confidence_threshold: float = Field(..., description="Minimum confidence threshold for facts")
    injection_enabled: bool = Field(..., description="Whether memory injection is enabled")
    max_injection_tokens: int = Field(..., description="Maximum tokens for memory injection")
+    token_counting: str = Field(..., description="Token counting strategy for memory injection ('tiktoken' or 'char')")


 class MemoryStatusResponse(BaseModel):
@@ -310,7 +311,8 @@ async def get_memory_config_endpoint() -> MemoryConfigResponse:
            "max_facts": 100,
            "fact_confidence_threshold": 0.7,
            "injection_enabled": true,
-            "max_injection_tokens": 2000
+            "max_injection_tokens": 2000,
+            "token_counting": "tiktoken"
        }
        ```
    """
@@ -323,6 +325,7 @@ async def get_memory_config_endpoint() -> MemoryConfigResponse:
        fact_confidence_threshold=config.fact_confidence_threshold,
        injection_enabled=config.injection_enabled,
        max_injection_tokens=config.max_injection_tokens,
+        token_counting=config.token_counting,
    )


@@ -351,6 +354,7 @@ async def get_memory_status() -> MemoryStatusResponse:
            fact_confidence_threshold=config.fact_confidence_threshold,
            injection_enabled=config.injection_enabled,
            max_injection_tokens=config.max_injection_tokens,
+            token_counting=config.token_counting,
        ),
        data=MemoryResponse(**memory_data),
    )