mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-24 17:06:00 +00:00
fix(actor): harden lifecycle, supervision, Redis mailbox, and add comprehensive tests
- Fix spawn() zombie cell: clean up registry on start() failure - Fix shutdown(): cancel + await tasks that exceed graceful timeout - Fix _shutdown(): await mailbox.close() to release backend resources - Fix escalate directive: stop failing child before propagating to grandparent - Fix RedisMailbox.put(): wrap Redis errors in try/except, return False on failure - Fix retry.py: replace assert with proper raise for last_exc - Add put_batch() to Mailbox abstraction for single-roundtrip bulk enqueue - Add RedisMailbox.put_batch() with atomic Lua script for bounded queues - Add MailboxFullError exception type for semantic backpressure handling - Add redis>=7.4.0 dependency with public PyPI sources in uv.lock Tests added (31 total, up from 27): - test_middleware_on_restart_hook: verifies middleware.on_restart() on supervision restart - test_ask_propagates_actor_exception: ask() re-raises original exception type - test_ask_propagates_exception_while_supervised: exception propagates; root actor survives - test_ask_timeout_late_reply_no_exception: late reply after timeout is silent no-op - test_actor_backpressure.py: MailboxFullError + dead letter on full mailbox - test_actor_retry.py: ask_with_retry with exponential backoff - test_mailbox_redis.py: RedisMailbox put/get/batch/close - bench_actor_redis.py: RedisMailbox throughput benchmarks
This commit is contained in:
@@ -11,7 +11,7 @@ from typing import Any
|
||||
from .actor import Actor, ActorContext
|
||||
from .mailbox import Empty, Mailbox, MemoryMailbox
|
||||
from .middleware import ActorMailboxContext, Middleware, NextFn, build_middleware_chain
|
||||
from .ref import ActorRef, ActorStoppedError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
|
||||
from .ref import ActorRef, ActorStoppedError, MailboxFullError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
|
||||
from .supervision import Directive, SupervisorStrategy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -87,7 +87,11 @@ class ActorSystem:
|
||||
middlewares=middlewares or [],
|
||||
)
|
||||
self._root_cells[name] = cell
|
||||
await cell.start()
|
||||
try:
|
||||
await cell.start()
|
||||
except Exception:
|
||||
del self._root_cells[name]
|
||||
raise
|
||||
return cell.ref
|
||||
|
||||
async def shutdown(self, *, timeout: float = 10.0) -> None:
|
||||
@@ -99,7 +103,12 @@ class ActorSystem:
|
||||
if cell.task is not None:
|
||||
tasks.append(cell.task)
|
||||
if tasks:
|
||||
await asyncio.wait(tasks, timeout=timeout)
|
||||
_, pending = await asyncio.wait(tasks, timeout=timeout)
|
||||
# Cancel tasks that didn't finish within the timeout to prevent zombie tasks
|
||||
for t in pending:
|
||||
t.cancel()
|
||||
if pending:
|
||||
await asyncio.wait(pending, timeout=2.0)
|
||||
self._root_cells.clear()
|
||||
self._replies.reject_all(ActorStoppedError("ActorSystem shutting down"))
|
||||
await self._reply_channel.stop_listener()
|
||||
@@ -188,16 +197,25 @@ class _ActorCell:
|
||||
self.task = asyncio.create_task(self._run(), name=f"actor:{self.path}")
|
||||
|
||||
async def enqueue(self, msg: _Envelope | _Stop) -> None:
|
||||
if not self.mailbox.put_nowait(msg):
|
||||
# Try non-blocking first (fast path for MemoryMailbox)
|
||||
if self.mailbox.put_nowait(msg):
|
||||
return
|
||||
# Fallback to async put (required for Redis and other async backends)
|
||||
if not await self.mailbox.put(msg):
|
||||
if isinstance(msg, _Envelope) and msg.correlation_id is not None:
|
||||
self.system._replies.reject(msg.correlation_id, RuntimeError(f"Mailbox full: {self.path}"))
|
||||
self.system._replies.reject(msg.correlation_id, MailboxFullError(f"Mailbox full: {self.path}"))
|
||||
elif isinstance(msg, _Envelope):
|
||||
self.system._dead_letter(self.ref, msg.payload, msg.sender)
|
||||
|
||||
def request_stop(self) -> None:
|
||||
"""Request graceful shutdown. Falls back to task.cancel() if mailbox full."""
|
||||
"""Request graceful shutdown.
|
||||
|
||||
Tries put_nowait first. If that fails (full or unsupported backend),
|
||||
cancels the task directly so _run exits via CancelledError → finally → _shutdown.
|
||||
"""
|
||||
if not self.stopped:
|
||||
if not self.mailbox.put_nowait(_Stop()):
|
||||
# Redis/async backends can't put_nowait — cancel the task
|
||||
if self.task is not None and not self.task.done():
|
||||
self.task.cancel()
|
||||
else:
|
||||
@@ -223,7 +241,11 @@ class _ActorCell:
|
||||
middlewares=middlewares or [],
|
||||
)
|
||||
self.children[name] = child
|
||||
await child.start()
|
||||
try:
|
||||
await child.start()
|
||||
except Exception:
|
||||
del self.children[name]
|
||||
raise
|
||||
return child.ref
|
||||
|
||||
# -- Processing loop -------------------------------------------------------
|
||||
@@ -310,6 +332,11 @@ class _ActorCell:
|
||||
# Remove from parent
|
||||
if self.parent is not None:
|
||||
self.parent.children.pop(self.name, None)
|
||||
# Close mailbox to release backend resources (e.g. Redis connections)
|
||||
try:
|
||||
await self.mailbox.close()
|
||||
except Exception:
|
||||
logger.exception("Error closing mailbox for %s", self.path)
|
||||
|
||||
# -- Supervision -----------------------------------------------------------
|
||||
|
||||
@@ -337,8 +364,16 @@ class _ActorCell:
|
||||
return
|
||||
|
||||
if directive == Directive.escalate:
|
||||
logger.info("Supervisor %s: escalate %s", self.path, type(error).__name__)
|
||||
raise error
|
||||
# Stop the failing child, then propagate failure up the supervision chain.
|
||||
# We cannot use `raise error` here — that would crash the child's _run
|
||||
# loop instead of notifying the grandparent's supervisor.
|
||||
child.request_stop()
|
||||
if self.parent is not None:
|
||||
logger.info("Supervisor %s: escalate %s to grandparent %s", self.path, type(error).__name__, self.parent.path)
|
||||
await self.parent._handle_child_failure(self, error)
|
||||
else:
|
||||
logger.error("Uncaught escalation at root actor %s: %s", self.path, error)
|
||||
return
|
||||
|
||||
if directive == Directive.restart:
|
||||
for name in affected:
|
||||
|
||||
Reference in New Issue
Block a user