mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-18 21:55:59 +00:00
fix(channels): add operational guardrails (#3584)
* fix(channels): add operational guardrails

* make format

* fix(channels): converge with #3582 to avoid merge-order conflicts

Drop this PR's DingTalk INFO-log redaction and hand it to #3582, which already restructures that handler and will redact the same log there. This PR no longer touches dingtalk.py, so the two PRs can merge to main in any order without a conflict.

For WeChat, drop the contested thread_ts priority reorder (review #3) and keep only what inbound dedupe needs: a server-stable message_id in the inbound metadata (message_id/msg_id, no client_id per review #6). This is a single added line inside the metadata dict, a region #3582 never touches, so it auto-merges regardless of order.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* fix(channels): address three correctness review findings

1. Connect-code cap was racy (willem #1): _create_state ran delete-expired, count, and insert as three separate transactions, so concurrent connect POSTs from one owner could each see count < cap and all insert past it. Add ChannelConnectionRepository.create_oauth_state_within_cap which does delete+count+insert in a single transaction serialized per (owner, provider) — Postgres via pg_advisory_xact_lock, SQLite via the write lock the leading DELETE takes — and have the router use it.

2. Inbound dedupe key fell back to "" workspace (willem #3): two workspaces delivering without team/guild/aibotid would collapse to the same key and dedupe each other's messages. _inbound_dedupe_key now fails closed (returns None) when no workspace identifier is present.

3. Dedupe key was recorded on receipt and never released on failure (ShenAC #1): a transient error (DB blip, Gateway 503) left the key in place for the full TTL, so a provider redelivery of the same message_id — exactly the retry dedupe should absorb — was silently dropped. _handle_message now releases the key in the unexpected-exception branch so redelivery can recover, while keeping record-on-receipt so retries during handling are still deduped.

Tests: repo cap enforcement incl. concurrent-issuance non-leak; dedupe fail-closed; dedupe key release-on-failure redelivery recovery.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* fix(channels): address cleanup/efficiency and test review findings

Efficiency / cleanup:
- Dedupe key set drops client-generated ids (client_msg_id, client_id); keep only server-stable event_id/message_id/msg_id, which a provider's own redelivery preserves (ShenAC #6). Every provider already emits message_id.
- TTL/overflow pruning of _recent_inbound_events is now O(k): switch to an OrderedDict and popitem(last=False) from the front instead of scanning all 4096 entries on every inbound (willem #4).
- Log "received inbound" only after the dedupe check so a provider retrying N times no longer logs N accepts; document that manager dedupe covers the agent run/final answer, not provider ack side-effects (willem #5, ShenAC #2).
- Slack drops the redundant `team_id or event.get("team")` fallback the caller already resolved (willem #6).
- create_oauth_state_within_cap prunes only this owner/provider's expired codes instead of a global DELETE on every connect POST; global cleanup still runs on consume_oauth_state (willem #7).

Tests:
- Dedupe test uses tmp_path instead of a leaked mkdtemp, uses distinct objects per publish, and adds a negative control: a different message_id is still processed, catching over-dedupe regressions (willem #8, ShenAC #4).
- Slack HTTP-mode rejection test supplies app_token so the missing-token early return can't mask the guard, giving the state assertions teeth (ShenAC #3).
- count_oauth_states test pins that the active row survives, not just the count (ShenAC #5).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* make format

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import logging
|
||||
import mimetypes
|
||||
import re
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Awaitable, Callable, Mapping
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -62,6 +63,12 @@ MESSAGE_STREAM_EVENTS = ("messages-tuple", "messages")
|
||||
THREAD_BUSY_MESSAGE = "This conversation is already processing another request. Please wait for it to finish and try again."
|
||||
BOUND_IDENTITY_REQUIRED_MESSAGE = "Connect this channel from DeerFlow Settings, complete the in-channel connect step, then send your message again."
|
||||
BOUND_IDENTITY_UNAVAILABLE_MESSAGE = "Channel connection verification is temporarily unavailable. Please try again later or contact the DeerFlow operator."
|
||||
INBOUND_DEDUPE_TTL_SECONDS = 10 * 60
|
||||
INBOUND_DEDUPE_MAX_ENTRIES = 4096
|
||||
# Only server-stable provider message ids: client-generated ids (client_msg_id,
|
||||
# client_id) are not guaranteed identical across a provider's own redelivery, so
|
||||
# keying dedupe on them would miss exactly the retries we want to absorb.
|
||||
INBOUND_DEDUPE_METADATA_KEYS = ("event_id", "message_id", "msg_id")
|
||||
|
||||
CHANNEL_CAPABILITIES = {
|
||||
"dingtalk": {"supports_streaming": False},
|
||||
@@ -774,6 +781,10 @@ class ChannelManager:
|
||||
self._semaphore: asyncio.Semaphore | None = None
|
||||
self._running = False
|
||||
self._task: asyncio.Task | None = None
|
||||
# Insertion order == chronological (keys are never re-inserted), so an
|
||||
# OrderedDict lets us evict expired/overflow entries from the front in
|
||||
# O(k) instead of scanning all entries on every inbound message.
|
||||
self._recent_inbound_events: OrderedDict[tuple[str, str, str, str], float] = OrderedDict()
|
||||
|
||||
@staticmethod
|
||||
def _channel_supports_streaming(channel_name: str) -> bool:
|
||||
@@ -919,16 +930,94 @@ class ChannelManager:
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
|
||||
# Dedupe before logging "received" so a provider retrying an event N
|
||||
# times does not log N accepts; duplicates are logged once as ignored.
|
||||
# Note: this manager-level dedupe only guards the agent run / final
|
||||
# answer. Provider adapters may emit ack side-effects (a "Working on
|
||||
# it…" reply, an "eyes" reaction) before publish_inbound, so those are
|
||||
# intentionally not deduped here.
|
||||
if self._is_duplicate_inbound(msg):
|
||||
continue
|
||||
logger.info(
|
||||
"[Manager] received inbound: channel=%s, chat_id=%s, type=%s, text=%r",
|
||||
"[Manager] received inbound: channel=%s, chat_id=%s, type=%s, text_len=%d, files=%d",
|
||||
msg.channel_name,
|
||||
msg.chat_id,
|
||||
msg.msg_type.value,
|
||||
msg.text[:100] if msg.text else "",
|
||||
len(msg.text or ""),
|
||||
len(msg.files),
|
||||
)
|
||||
task = asyncio.create_task(self._handle_message(msg))
|
||||
task.add_done_callback(self._log_task_error)
|
||||
|
||||
@staticmethod
def _inbound_dedupe_key(msg: InboundMessage) -> tuple[str, str, str, str] | None:
    """Derive the dedupe key for an inbound message, or ``None`` to skip dedupe.

    The key is ``(channel_name, workspace_id, chat_id, message_id)``.

    ``message_id`` is the first truthy value stored under one of
    ``INBOUND_DEDUPE_METADATA_KEYS``. ``msg.metadata`` is searched first;
    ``metadata["raw_message"]`` is searched only if nothing was found there
    and that entry is a mapping.

    Returns:
        The four-part key, or ``None`` when no message id or no workspace
        identifier can be found.
    """
    meta = msg.metadata or {}

    def _id_sources():
        # Top-level metadata takes precedence; the raw payload is consulted
        # lazily, only once the top-level lookup came up empty.
        yield meta
        raw_payload = meta.get("raw_message")
        if isinstance(raw_payload, Mapping):
            yield raw_payload

    provider_message_id = next(
        (str(found) for source in _id_sources() for field in INBOUND_DEDUPE_METADATA_KEYS if (found := source.get(field))),
        None,
    )
    if provider_message_id is None:
        return None

    # Fail closed: with no workspace/team/guild identifier, two workspaces
    # cannot be told apart (e.g. Slack channel ids are not globally unique),
    # so dedupe is skipped instead of merging their messages under one key.
    scope_id = msg.workspace_id
    if not scope_id:
        scope_id = next(
            (hit for name in ("workspace_id", "team_id", "guild_id", "aibotid") if (hit := meta.get(name))),
            None,
        )
    if not scope_id:
        return None

    return (msg.channel_name, str(scope_id), msg.chat_id, provider_message_id)
|
||||
|
||||
def _is_duplicate_inbound(self, msg: InboundMessage) -> bool:
    """Report whether *msg* was already received, recording it if it was not.

    Messages for which ``_inbound_dedupe_key`` yields no key are never
    treated as duplicates and are not recorded.

    Args:
        msg: The inbound message to check.

    Returns:
        ``True`` when the message's key is still present in
        ``_recent_inbound_events`` (a duplicate within the TTL); ``False``
        otherwise, in which case the key has been recorded with the current
        monotonic timestamp.
    """
    key = self._inbound_dedupe_key(msg)
    if key is None:
        return False

    recent = self._recent_inbound_events
    now = time.monotonic()

    # Entries are in chronological insertion order, so expired ones cluster at
    # the front: pop from the front until we hit a still-live entry.
    while recent:
        oldest_at = next(iter(recent.values()))
        if now - oldest_at <= INBOUND_DEDUPE_TTL_SECONDS:
            break
        recent.popitem(last=False)

    if key in recent:
        logger.info(
            "[Manager] duplicate inbound ignored: channel=%s, chat_id=%s, message_id=%s",
            msg.channel_name,
            msg.chat_id,
            key[-1],
        )
        return True

    recent[key] = now

    # Enforce the size cap *after* inserting. Pruning before the insert lets
    # the map settle at MAX_ENTRIES + 1, and pruning before the membership
    # check could evict the very key being tested.
    while len(recent) > INBOUND_DEDUPE_MAX_ENTRIES:
        recent.popitem(last=False)

    return False
|
||||
|
||||
def _release_inbound_dedupe_key(self, msg: InboundMessage) -> None:
    """Forget *msg*'s dedupe key so a provider redelivery is handled again.

    Intended for transient/unexpected handling failures only. The key is
    recorded on receipt so that retries arriving *while* the message is
    being handled stay deduplicated; if handling then fails, leaving the key
    in place would drop every redelivery of the same message_id until the
    dedupe TTL runs out.

    Messages for which no dedupe key can be derived are left untouched.
    """
    dedupe_key = self._inbound_dedupe_key(msg)
    if dedupe_key is None:
        return
    # pop() with a default: the entry may no longer be present.
    self._recent_inbound_events.pop(dedupe_key, None)
|
||||
|
||||
@staticmethod
|
||||
def _log_task_error(task: asyncio.Task) -> None:
|
||||
"""Surface unhandled exceptions from background tasks."""
|
||||
@@ -979,6 +1068,10 @@ class ChannelManager:
|
||||
msg.channel_name,
|
||||
msg.chat_id,
|
||||
)
|
||||
# Transient/unexpected failure: release the dedupe key so a provider
|
||||
# redelivery of the same message can recover instead of being dropped
|
||||
# for the dedupe TTL.
|
||||
self._release_inbound_dedupe_key(msg)
|
||||
await self._send_error(msg, "An internal error occurred. Please try again.")
|
||||
|
||||
# -- chat handling -----------------------------------------------------
|
||||
@@ -1169,7 +1262,7 @@ class ChannelManager:
|
||||
)
|
||||
return
|
||||
|
||||
logger.info("[Manager] invoking runs.wait(thread_id=%s, text=%r)", thread_id, msg.text[:100])
|
||||
logger.info("[Manager] invoking runs.wait(thread_id=%s, text_len=%d)", thread_id, len(msg.text or ""))
|
||||
run_kwargs: dict[str, Any] = {
|
||||
"input": {"messages": [human_message]},
|
||||
"config": run_config,
|
||||
@@ -1236,7 +1329,7 @@ class ChannelManager:
|
||||
run_context: dict[str, Any],
|
||||
human_message: dict[str, Any],
|
||||
) -> None:
|
||||
logger.info("[Manager] invoking runs.stream(thread_id=%s, text=%r)", thread_id, msg.text[:100])
|
||||
logger.info("[Manager] invoking runs.stream(thread_id=%s, text_len=%d)", thread_id, len(msg.text or ""))
|
||||
|
||||
last_values: dict[str, Any] | list | None = None
|
||||
streamed_buffers: dict[str, str] = {}
|
||||
|
||||
Reference in New Issue
Block a user