fix(actor): harden lifecycle, supervision, Redis mailbox, and add comprehensive tests

- Fix spawn() zombie cell: clean up registry on start() failure - Fix shutdown(): cancel + await tasks that exceed graceful timeout - Fix _shutdown(): await mailbox.close() to release backend resources - Fix escalate directive: stop failing child before propagating to grandparent - Fix RedisMailbox.put(): wrap Redis errors in try/except, return False on failure - Fix retry.py: replace assert with proper raise for last_exc - Add put_batch() to Mailbox abstraction for single-roundtrip bulk enqueue - Add RedisMailbox.put_batch() with atomic Lua script for bounded queues - Add MailboxFullError exception type for semantic backpressure handling - Add redis>=7.4.0 dependency with public PyPI sources in uv.lock Tests added (31 total, up from 27): - test_middleware_on_restart_hook: verifies middleware.on_restart() on supervision restart - test_ask_propagates_actor_exception: ask() re-raises original exception type - test_ask_propagates_exception_while_supervised: exception propagates; root actor survives - test_ask_timeout_late_reply_no_exception: late reply after timeout is silent no-op - test_actor_backpressure.py: MailboxFullError + dead letter on full mailbox - test_actor_retry.py: ask_with_retry with exponential backoff - test_mailbox_redis.py: RedisMailbox put/get/batch/close - bench_actor_redis.py: RedisMailbox throughput benchmarks
feat: asyncio-native Actor framework with supervision, middleware, and pluggable mailbox
2026-05-20 07:01:03 +00:00 · 2026-03-31 10:09:05 +08:00 · 2026-03-30 23:50:54 +08:00
18 changed files with 4981 additions and 2263 deletions
@@ -2,6 +2,8 @@
 docker/.cache/
 # oh-my-claudecode state
 .omc/
 # Collaborator plugin state
 .collaborator/
 # OS generated files
 .DS_Store
 *.local
@@ -0,0 +1,46 @@
 """Async Actor framework — lightweight, asyncio-native, supervision-ready.
 Usage::
    from deerflow.actor import Actor, ActorSystem
    class Greeter(Actor):
        async def on_receive(self, message):
            return f"Hello, {message}!"
    async def main():
        system = ActorSystem("app")
        ref = await system.spawn(Greeter, "greeter")
        reply = await ref.ask("World", timeout=5.0)
        print(reply)  # Hello, World!
        await system.shutdown()
 """
 from .actor import Actor, ActorContext
 from .mailbox import Mailbox, MemoryMailbox
 from .middleware import Middleware
 from .ref import ActorRef, MailboxFullError, ReplyChannel
 from .retry import IdempotentActorMixin, IdempotencyStore, RetryEnvelope, ask_with_retry
 from .supervision import AllForOneStrategy, Directive, OneForOneStrategy, SupervisorStrategy
 from .system import ActorSystem, DeadLetter
 __all__ = [
    "Actor",
    "ActorContext",
    "ActorRef",
    "ActorSystem",
    "AllForOneStrategy",
    "DeadLetter",
    "Directive",
    "Mailbox",
    "MailboxFullError",
    "MemoryMailbox",
    "Middleware",
    "OneForOneStrategy",
    "ReplyChannel",
    "RetryEnvelope",
    "SupervisorStrategy",
    "IdempotentActorMixin",
    "IdempotencyStore",
    "ask_with_retry",
 ]
@@ -0,0 +1,109 @@
 """Actor base class and per-actor context."""
 from __future__ import annotations
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any, Generic, TypeVar
 from .supervision import OneForOneStrategy, SupervisorStrategy
 if TYPE_CHECKING:
    from .ref import ActorRef
 # Message type variable — use Actor[MyMsg] for typed actors
 M = TypeVar("M")
 R = TypeVar("R")
 class ActorContext:
    """Per-actor runtime context, injected before ``on_started``.
    Provides access to the actor's identity, parent, children,
    and the ability to spawn child actors.
    """
    __slots__ = ("_cell",)
    def __init__(self, cell: Any) -> None:
        self._cell = cell
    @property
    def self_ref(self) -> ActorRef:
        return self._cell.ref
    @property
    def parent(self) -> ActorRef | None:
        p = self._cell.parent
        return p.ref if p is not None else None
    @property
    def children(self) -> dict[str, ActorRef]:
        return {name: c.ref for name, c in self._cell.children.items()}
    @property
    def system(self) -> Any:
        return self._cell.system
    async def spawn(
        self,
        actor_cls: type[Actor],
        name: str,
        *,
        mailbox_size: int = 256,
        middlewares: list | None = None,
    ) -> ActorRef:
        """Spawn a child actor supervised by this actor."""
        return await self._cell.spawn_child(actor_cls, name, mailbox_size=mailbox_size, middlewares=middlewares)
    async def run_in_executor(self, fn: Callable[..., Any], *args: Any) -> Any:
        """Run a blocking function in the system's thread pool.
        Usage::
            result = await self.context.run_in_executor(requests.get, url)
        """
        import asyncio
        executor = self._cell.system._executor
        return await asyncio.get_running_loop().run_in_executor(executor, fn, *args)
 class Actor(Generic[M]):
    """Base class for all actors.
    Type parameter ``M`` constrains the message type::
        class Greeter(Actor[str]):
            async def on_receive(self, message: str) -> str:
                return f"Hello, {message}!"
        class Calculator(Actor[int | tuple[str, int, int]]):
            async def on_receive(self, message: int | tuple[str, int, int]) -> int:
                ...
    Unparameterized ``Actor`` accepts ``Any`` (backward-compatible).
    """
    context: ActorContext
    async def on_receive(self, message: M) -> Any:
        """Handle an incoming message.
        Return value is sent back as reply for ``ask`` calls.
        For ``tell`` calls, the return value is discarded.
        """
    async def on_started(self) -> None:
        """Called after creation, before receiving messages."""
    async def on_stopped(self) -> None:
        """Called on graceful shutdown. Release resources here."""
    async def on_restart(self, error: Exception) -> None:
        """Called on the *new* instance before resuming after a crash."""
    def supervisor_strategy(self) -> SupervisorStrategy:
        """Override to customize how this actor supervises its children.
        Default: OneForOne, up to 3 restarts per 60 seconds, always restart.
        """
        return OneForOneStrategy()
@@ -0,0 +1,121 @@
 """Pluggable mailbox abstraction — Akka-inspired enqueue/dequeue interface.
 Built-in implementations:
 - ``MemoryMailbox``: asyncio.Queue backed (default)
 - Extend ``Mailbox`` for Redis, RabbitMQ, Kafka, etc.
 """
 from __future__ import annotations
 import abc
 import asyncio
 from typing import Any
 BACKPRESSURE_BLOCK = "block"
 BACKPRESSURE_DROP_NEW = "drop_new"
 BACKPRESSURE_FAIL = "fail"
 BACKPRESSURE_POLICIES = {BACKPRESSURE_BLOCK, BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL}
 class Mailbox(abc.ABC):
    """Abstract mailbox — the message queue for an actor.
    Implementations must be async-safe for single-consumer usage.
    Multiple producers may call ``put`` concurrently.
    """
    @abc.abstractmethod
    async def put(self, msg: Any) -> bool:
        """Enqueue a message. Returns True if accepted, False if dropped."""
    @abc.abstractmethod
    def put_nowait(self, msg: Any) -> bool:
        """Non-blocking enqueue. Returns True if accepted, False if dropped."""
    @abc.abstractmethod
    async def get(self) -> Any:
        """Dequeue the next message. Blocks until available."""
    @abc.abstractmethod
    def get_nowait(self) -> Any:
        """Non-blocking dequeue. Raises ``Empty`` if no message."""
    @abc.abstractmethod
    def empty(self) -> bool:
        """Return True if no messages are queued."""
    @property
    @abc.abstractmethod
    def full(self) -> bool:
        """Return True if mailbox is at capacity."""
    async def put_batch(self, msgs: list[Any]) -> int:
        """Enqueue multiple messages. Returns count accepted.
        Default implementation falls back to sequential ``put`` calls.
        Backends like Redis should override this for efficient bulk push.
        """
        count = 0
        for msg in msgs:
            if await self.put(msg):
                count += 1
        return count
    async def close(self) -> None:
        """Release resources. Default is no-op."""
 class Empty(Exception):
    """Raised by ``get_nowait`` when mailbox is empty."""
 class MemoryMailbox(Mailbox):
    """In-process mailbox backed by ``asyncio.Queue``."""
    def __init__(self, maxsize: int = 256, *, backpressure_policy: str = BACKPRESSURE_BLOCK) -> None:
        if backpressure_policy not in BACKPRESSURE_POLICIES:
            raise ValueError(
                f"Invalid backpressure_policy={backpressure_policy!r}, "
                f"expected one of {sorted(BACKPRESSURE_POLICIES)}"
            )
        self._queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=maxsize)
        self._maxsize = maxsize
        self._backpressure_policy = backpressure_policy
    async def put(self, msg: Any) -> bool:
        if self._backpressure_policy == BACKPRESSURE_BLOCK:
            await self._queue.put(msg)
            return True
        if self._backpressure_policy in (BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL):
            if self._queue.full():
                return False
            self._queue.put_nowait(msg)
            return True
        return False
    def put_nowait(self, msg: Any) -> bool:
        if self._queue.full():
            return False
        self._queue.put_nowait(msg)
        return True
    async def get(self) -> Any:
        return await self._queue.get()
    def get_nowait(self) -> Any:
        try:
            return self._queue.get_nowait()
        except asyncio.QueueEmpty:
            raise Empty("mailbox empty")
    def empty(self) -> bool:
        return self._queue.empty()
    @property
    def full(self) -> bool:
        return self._queue.full()
 # Type alias for mailbox factory
 MailboxFactory = type[Mailbox] | Any  # Callable[[], Mailbox]
@@ -0,0 +1,184 @@
 """Redis-backed mailbox — persistent, survives process restart.
 Requires ``redis[hiredis]`` (``uv add redis[hiredis]``).
 Usage::
    import redis.asyncio as redis
    from deerflow.actor import ActorSystem
    from deerflow.actor.mailbox_redis import RedisMailbox
    pool = redis.ConnectionPool.from_url("redis://localhost:6379")
    system = ActorSystem("app")
    ref = await system.spawn(
        MyActor, "worker",
        mailbox=RedisMailbox(pool, "actor:inbox:worker"),
    )
 """
 from __future__ import annotations
 import json
 import logging
 from typing import Any
 from .mailbox import Empty, Mailbox
 from .ref import _Envelope, _Stop
 logger = logging.getLogger(__name__)
 def _serialize(msg: _Envelope | _Stop) -> str:
    """Serialize an envelope to JSON for Redis storage.
    Raises ``TypeError`` if the payload is not JSON-serializable.
    """
    if isinstance(msg, _Stop):
        return json.dumps({"__type__": "stop"})
    try:
        return json.dumps({
            "__type__": "envelope",
            "payload": msg.payload,
            "correlation_id": msg.correlation_id,
            "reply_to": msg.reply_to,
        })
    except (TypeError, ValueError) as e:
        raise TypeError(f"Payload is not JSON-serializable: {e}. RedisMailbox requires JSON-compatible messages.") from e
 def _deserialize(data: str | bytes) -> _Envelope | _Stop:
    """Deserialize a JSON string back to an envelope or stop sentinel."""
    if isinstance(data, bytes):
        data = data.decode("utf-8")
    d = json.loads(data)
    if d.get("__type__") == "stop":
        return _Stop()
    return _Envelope(
        payload=d.get("payload"),
        sender=None,
        correlation_id=d.get("correlation_id"),
        reply_to=d.get("reply_to"),
    )
 class RedisMailbox(Mailbox):
    """Mailbox backed by a Redis LIST.
    Each actor gets its own Redis key (the ``queue_name``).
    Messages are serialized as JSON, so payloads must be JSON-compatible.
    Args:
        pool: A ``redis.asyncio.ConnectionPool`` instance.
        queue_name: Redis key for this actor's inbox (e.g. ``"actor:inbox:worker"``).
        maxlen: Maximum queue length. 0 = unbounded. When exceeded, ``put_nowait`` returns False.
        brpop_timeout: Seconds to block on ``get()`` before retrying. Default 1s.
    """
    def __init__(
        self,
        pool: Any,
        queue_name: str,
        *,
        maxlen: int = 0,
        brpop_timeout: float = 1.0,
    ) -> None:
        self._queue_name = queue_name
        self._maxlen = maxlen
        self._brpop_timeout = brpop_timeout
        self._closed = False
        # Lazy import to avoid hard dependency on redis
        try:
            import redis.asyncio as aioredis
            self._redis: aioredis.Redis = aioredis.Redis(connection_pool=pool)
        except ImportError:
            raise ImportError("RedisMailbox requires 'redis' package. Install with: uv add redis[hiredis]")
    # Lua script for atomic bounded push: check length then push
    _LUA_BOUNDED_PUSH = """
    if tonumber(ARGV[2]) > 0 and redis.call('llen', KEYS[1]) >= tonumber(ARGV[2]) then
        return 0
    end
    redis.call('lpush', KEYS[1], ARGV[1])
    return 1
    """
    async def put(self, msg: Any) -> bool:
        if self._closed:
            return False
        data = _serialize(msg)
        try:
            if self._maxlen > 0:
                # Atomic check+push via Lua script to avoid TOCTOU race
                result = await self._redis.eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
                return bool(result)
            await self._redis.lpush(self._queue_name, data)
            return True
        except Exception as e:
            logger.warning("RedisMailbox.put failed for %s: %s", self._queue_name, e)
            return False
    def put_nowait(self, msg: Any) -> bool:
        """Redis cannot do synchronous non-blocking enqueue reliably.
        Returns False so the caller uses dead-letter or task.cancel() fallback.
        Use ``put()`` (async) for reliable delivery.
        """
        return False
    async def put_batch(self, msgs: list[Any]) -> int:
        """Push multiple messages in a single LPUSH command (one round-trip).
        Unbounded queues: all messages sent atomically in one LPUSH.
        Bounded queues: sequential puts to respect maxlen (no batch Lua script needed).
        """
        if self._closed or not msgs:
            return 0
        data_list = []
        for msg in msgs:
            try:
                data_list.append(_serialize(msg))
            except TypeError as e:
                logger.warning("Skipping non-serializable message in put_batch: %s", e)
        if not data_list:
            return 0
        if self._maxlen > 0:
            count = 0
            for data in data_list:
                # Reuse the Lua script for TOCTOU-safe bounded check (same as put())
                result = await self._redis.eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
                if result:
                    count += 1
                else:
                    break  # queue full — stop early
            return count
        # Unbounded: single LPUSH with all values — one network round-trip
        await self._redis.lpush(self._queue_name, *data_list)
        return len(data_list)
    async def get(self) -> Any:
        """Blocking dequeue via BRPOP. Retries until a message arrives."""
        while not self._closed:
            result = await self._redis.brpop(self._queue_name, timeout=self._brpop_timeout)
            if result is not None:
                _, data = result
                return _deserialize(data)
        raise Empty("mailbox closed")
    def get_nowait(self) -> Any:
        raise Empty("Redis mailbox does not support synchronous get_nowait")
    def empty(self) -> bool:
        # Cannot query Redis synchronously. Return True so drain loops
        # terminate immediately and rely on get_nowait raising Empty.
        return True
    @property
    def full(self) -> bool:
        # Cannot query Redis synchronously. Backpressure enforced
        # atomically inside put() via Lua script.
        return False
    async def close(self) -> None:
        self._closed = True
        await self._redis.aclose()
@@ -0,0 +1,79 @@
 """Middleware pipeline — cross-cutting concerns for actors.
 Inspired by Proto.Actor's sender/receiver middleware model.
 Middleware intercepts messages before/after the actor processes them.
 Usage::
    class LoggingMiddleware(Middleware):
        async def on_receive(self, ctx, message, next_fn):
            logger.info("Received: %s", message)
            result = await next_fn(ctx, message)
            logger.info("Replied: %s", result)
            return result
    system = ActorSystem("app")
    ref = await system.spawn(MyActor, "a", middlewares=[LoggingMiddleware()])
 """
 from __future__ import annotations
 from collections.abc import Awaitable, Callable
 from typing import Any
 class ActorMailboxContext:
    """Context passed to middleware on each message."""
    __slots__ = ("actor_ref", "sender", "message_type")
    def __init__(self, actor_ref: Any, sender: Any, message_type: str) -> None:
        self.actor_ref = actor_ref
        self.sender = sender
        self.message_type = message_type  # "tell" or "ask"
 # The inner handler signature: (ctx, message) -> result
 NextFn = Callable[[ActorMailboxContext, Any], Awaitable[Any]]
 class Middleware:
    """Base class for actor middleware.
    Override ``on_receive`` to intercept inbound messages.
    Must call ``await next_fn(ctx, message)`` to continue the chain.
    """
    async def on_receive(self, ctx: ActorMailboxContext, message: Any, next_fn: NextFn) -> Any:
        """Intercept a message. Call next_fn to continue the chain."""
        return await next_fn(ctx, message)
    async def on_started(self, actor_ref: Any) -> None:
        """Called when the actor starts."""
    async def on_stopped(self, actor_ref: Any) -> None:
        """Called when the actor stops."""
    async def on_restart(self, actor_ref: Any, error: Exception) -> None:
        """Called when the actor restarts after a crash.
        Override to reset per-actor-instance state (caches, counters, etc.)
        that should not bleed across restarts.
        """
 def build_middleware_chain(middlewares: list[Middleware], handler: NextFn) -> NextFn:
    """Build a nested middleware chain ending with *handler*.
    Execution order: first middleware in list wraps outermost.
    ``[A, B, C]`` → ``A(B(C(handler)))``
    """
    chain = handler
    for mw in reversed(middlewares):
        outer = chain
        async def _wrap(ctx: ActorMailboxContext, msg: Any, _mw: Middleware = mw, _next: NextFn = outer) -> Any:
            return await _mw.on_receive(ctx, msg, _next)
        chain = _wrap
    return chain
@@ -0,0 +1,220 @@
 """ActorRef — immutable, serializable reference to an actor."""
 from __future__ import annotations
 import asyncio
 import uuid
 from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
    from .system import _ActorCell
 class ActorRef:
    """Immutable handle for sending messages to an actor.
    Users never construct this directly — it is returned by
    ``ActorSystem.spawn`` or ``ActorContext.spawn``.
    """
    __slots__ = ("_cell",)
    def __init__(self, cell: _ActorCell) -> None:
        self._cell = cell
    @property
    def name(self) -> str:
        return self._cell.name
    @property
    def path(self) -> str:
        return self._cell.path
    @property
    def is_alive(self) -> bool:
        return not self._cell.stopped
    async def tell(self, message: Any, *, sender: ActorRef | None = None) -> None:
        """Fire-and-forget message delivery."""
        if self._cell.stopped:
            self._cell.system._dead_letter(self, message, sender)
            return
        await self._cell.enqueue(_Envelope(message, sender))
    async def ask(self, message: Any, *, timeout: float = 5.0) -> Any:
        """Request-response with timeout.
        Uses correlation ID + ReplyRegistry instead of passing a Future
        through the mailbox. This makes ask work with any Mailbox backend
        (memory, Redis, RabbitMQ, etc.).
        Raises ``asyncio.TimeoutError`` if the actor doesn't reply in time.
        Raises the actor's exception if ``on_receive`` fails.
        """
        if self._cell.stopped:
            raise ActorStoppedError(f"Actor {self.path} is stopped")
        corr_id = uuid.uuid4().hex
        future = self._cell.system._replies.register(corr_id)
        try:
            envelope = _Envelope(message, sender=None, correlation_id=corr_id, reply_to=self._cell.system.system_id)
            await self._cell.enqueue(envelope)
            return await asyncio.wait_for(future, timeout=timeout)
        finally:
            self._cell.system._replies.discard(corr_id)
    def stop(self) -> None:
        """Request graceful shutdown."""
        self._cell.request_stop()
    def __repr__(self) -> str:
        alive = "alive" if self.is_alive else "dead"
        return f"ActorRef({self.path}, {alive})"
    def __eq__(self, other: object) -> bool:
        if isinstance(other, ActorRef):
            return self._cell is other._cell
        return NotImplemented
    def __hash__(self) -> int:
        return id(self._cell)
 class ActorStoppedError(Exception):
    """Raised when sending to a stopped actor via ask."""
 class MailboxFullError(RuntimeError):
    """Raised when a message is rejected because the mailbox is at capacity."""
 # ---------------------------------------------------------------------------
 # Internal message wrappers (serializable — no Future objects)
 # ---------------------------------------------------------------------------
 class _Envelope:
    """Message envelope flowing through mailboxes.
    All fields are serializable (no asyncio.Future). This is what
    enables ask() to work across MQ-backed mailboxes.
    """
    __slots__ = ("payload", "sender", "correlation_id", "reply_to")
    def __init__(
        self,
        payload: Any,
        sender: ActorRef | None = None,
        correlation_id: str | None = None,
        reply_to: str | None = None,
    ) -> None:
        self.payload = payload
        self.sender = sender
        self.correlation_id = correlation_id
        self.reply_to = reply_to  # System ID of the caller (for cross-process reply routing)
 class _Stop:
    """Sentinel placed on the mailbox to trigger graceful shutdown."""
 # ---------------------------------------------------------------------------
 # ReplyRegistry — maps correlation_id → Future (lives on ActorSystem)
 # ---------------------------------------------------------------------------
 class _ReplyRegistry:
    """In-memory registry mapping correlation IDs to Futures.
    Used by ask() to receive replies without putting Futures in the mailbox.
    """
    def __init__(self) -> None:
        self._pending: dict[str, asyncio.Future[Any]] = {}
    def register(self, corr_id: str) -> asyncio.Future[Any]:
        """Create and register a Future for a correlation ID."""
        future: asyncio.Future[Any] = asyncio.get_running_loop().create_future()
        self._pending[corr_id] = future
        return future
    def resolve(self, corr_id: str, result: Any) -> None:
        """Complete a pending ask with a result."""
        future = self._pending.pop(corr_id, None)
        if future is not None and not future.done():
            future.set_result(result)
    def reject(self, corr_id: str, error: Exception) -> None:
        """Complete a pending ask with an error."""
        future = self._pending.pop(corr_id, None)
        if future is not None and not future.done():
            future.set_exception(error)
    def discard(self, corr_id: str) -> None:
        """Remove a pending entry (e.g. on timeout)."""
        self._pending.pop(corr_id, None)
    def reject_all(self, error: Exception) -> None:
        """Reject all pending asks (e.g. on system shutdown)."""
        for future in self._pending.values():
            if not future.done():
                future.set_exception(error)
        self._pending.clear()
 # ---------------------------------------------------------------------------
 # ReplyChannel — abstraction for routing replies (local or cross-process)
 # ---------------------------------------------------------------------------
 class _ReplyMessage:
    """Reply payload sent through ReplyChannel.
    Carries the original exception object for local delivery (preserves type).
    For cross-process serialization, use ``to_dict``/``from_dict``.
    """
    __slots__ = ("correlation_id", "result", "error", "exception")
    def __init__(self, correlation_id: str, result: Any = None, error: str | None = None, exception: Exception | None = None) -> None:
        self.correlation_id = correlation_id
        self.result = result
        self.error = error
        self.exception = exception  # Original exception (local only, not serializable)
    def to_dict(self) -> dict[str, Any]:
        """Serialize for cross-process transport (exception becomes string)."""
        return {"correlation_id": self.correlation_id, "result": self.result, "error": self.error}
    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> _ReplyMessage:
        return cls(d["correlation_id"], d.get("result"), d.get("error"))
 class ReplyChannel:
    """Routes replies from actor back to the caller's ReplyRegistry.
    Default implementation: resolve locally (same process).
    Override ``send_reply`` for cross-process routing (e.g. via Redis pub/sub).
    """
    async def send_reply(self, reply_to: str, reply: _ReplyMessage, local_registry: _ReplyRegistry) -> None:
        """Deliver a reply to the system identified by *reply_to*.
        Default: assumes reply_to is the local system → resolve directly.
        Override for MQ-backed cross-process delivery.
        """
        if reply.exception is not None:
            # Local: preserve original exception type
            local_registry.reject(reply.correlation_id, reply.exception)
        elif reply.error is not None:
            # Cross-process: exception was serialized to string
            local_registry.reject(reply.correlation_id, RuntimeError(reply.error))
        else:
            local_registry.resolve(reply.correlation_id, reply.result)
    async def start_listener(self, system_id: str, registry: _ReplyRegistry) -> None:
        """Start listening for inbound replies (no-op for local)."""
    async def stop_listener(self) -> None:
        """Stop the reply listener (no-op for local)."""
@@ -0,0 +1,142 @@
 """Retry + idempotency helpers for Actor ask/tell patterns.
 This module provides:
 - Message envelope carrying retry/idempotency metadata
 - In-memory idempotency store (process-local)
 - ask_with_retry helper (bounded retries + exponential backoff + jitter)
 Design notes:
 - Keep transport-agnostic; works with current in-memory mailbox.
 - Business handlers must opt in by using ``IdempotentActorMixin`` and
  wrapping logic with ``handle_idempotent``.
 """
 from __future__ import annotations
 import asyncio
 import random
 import time
 import uuid
 from dataclasses import dataclass, field
 from typing import Any
@dataclass(slots=True)
 class RetryEnvelope:
    """Metadata wrapper for idempotent/retriable messages."""
    payload: Any
    message_id: str = field(default_factory=lambda: uuid.uuid4().hex)
    idempotency_key: str | None = None
    attempt: int = 1
    max_attempts: int = 1
    created_at_ms: int = field(default_factory=lambda: int(time.time() * 1000))
    @classmethod
    def wrap(
        cls,
        payload: Any,
        *,
        idempotency_key: str | None = None,
        attempt: int = 1,
        max_attempts: int = 1,
    ) -> "RetryEnvelope":
        return cls(
            payload=payload,
            idempotency_key=idempotency_key,
            attempt=attempt,
            max_attempts=max_attempts,
        )
 class IdempotencyStore:
    """Process-local idempotency result store."""
    def __init__(self) -> None:
        self._results: dict[str, Any] = {}
    def has(self, key: str) -> bool:
        return key in self._results
    def get(self, key: str) -> Any:
        return self._results[key]
    def set(self, key: str, value: Any) -> None:
        self._results[key] = value
 class IdempotentActorMixin:
    """Mixin adding idempotent handling utility for actors.
    Usage in actor::
        class MyActor(IdempotentActorMixin, Actor):
            async def on_receive(self, message):
                return await self.handle_idempotent(message, self._handle)
            async def _handle(self, payload):
                ...
    """
    def _idempotency_store(self) -> IdempotencyStore:
        store = getattr(self, "_idem_store", None)
        if store is None:
            store = IdempotencyStore()
            setattr(self, "_idem_store", store)
        return store
    async def handle_idempotent(self, message: Any, handler):
        if not isinstance(message, RetryEnvelope):
            return await handler(message)
        key = message.idempotency_key
        if not key:
            return await handler(message.payload)
        store = self._idempotency_store()
        if store.has(key):
            return store.get(key)
        result = await handler(message.payload)
        store.set(key, result)
        return result
 async def ask_with_retry(
    ref,
    payload: Any,
    *,
    timeout: float = 5.0,
    max_attempts: int = 3,
    base_backoff_s: float = 0.1,
    max_backoff_s: float = 5.0,
    jitter_ratio: float = 0.3,
    retry_exceptions: tuple[type[BaseException], ...] = (asyncio.TimeoutError,),
    idempotency_key: str | None = None,
 ) -> Any:
    """Ask actor with bounded retries and envelope metadata."""
    if max_attempts < 1:
        raise ValueError("max_attempts must be >= 1")
    key = idempotency_key or uuid.uuid4().hex
    last_exc: BaseException | None = None
    for attempt in range(1, max_attempts + 1):
        msg = RetryEnvelope.wrap(
            payload,
            idempotency_key=key,
            attempt=attempt,
            max_attempts=max_attempts,
        )
        try:
            return await ref.ask(msg, timeout=timeout)
        except retry_exceptions as exc:
            last_exc = exc
            if attempt >= max_attempts:
                break
            backoff = min(max_backoff_s, base_backoff_s * (2 ** (attempt - 1)))
            jitter = backoff * jitter_ratio * random.random()
            await asyncio.sleep(backoff + jitter)
    raise last_exc  # type: ignore[misc]  # always set: loop runs ≥1 time and sets on last iteration
@@ -0,0 +1,75 @@
 """Supervision strategies — Erlang/Akka-inspired fault tolerance."""
 from __future__ import annotations
 import enum
 import time
 from collections import deque
 from collections.abc import Callable
 from typing import Any
 class Directive(enum.Enum):
    """What a supervisor should do when a child fails."""
    resume = "resume"  # ignore error, keep processing
    restart = "restart"  # discard state, create fresh instance
    stop = "stop"  # terminate the child permanently
    escalate = "escalate"  # propagate to grandparent
 class SupervisorStrategy:
    """Base class for supervision strategies.
    Args:
        max_restarts: Maximum restarts allowed within *within_seconds*.
            Exceeding this limit stops the child permanently.
        within_seconds: Time window for restart counting.
        decider: Maps exception → Directive. Default: always restart.
    """
    def __init__(
        self,
        *,
        max_restarts: int = 3,
        within_seconds: float = 60.0,
        decider: Callable[[Exception], Directive] | None = None,
    ) -> None:
        self.max_restarts = max_restarts
        self.within_seconds = within_seconds
        self.decider = decider or (lambda _: Directive.restart)
        self._restart_timestamps: dict[str, deque[float]] = {}
    def decide(self, error: Exception) -> Directive:
        return self.decider(error)
    def record_restart(self, child_name: str) -> bool:
        """Record a restart and return True if within limits."""
        now = time.monotonic()
        if child_name not in self._restart_timestamps:
            self._restart_timestamps[child_name] = deque()
        ts = self._restart_timestamps[child_name]
        # Purge old entries outside the window
        cutoff = now - self.within_seconds
        while ts and ts[0] < cutoff:
            ts.popleft()
        ts.append(now)
        return len(ts) <= self.max_restarts
    def apply_to_children(self, failed_child: str, all_children: list[str]) -> list[str]:
        """Return which children should be affected by the directive."""
        raise NotImplementedError
 class OneForOneStrategy(SupervisorStrategy):
    """Only the failed child is affected."""
    def apply_to_children(self, failed_child: str, all_children: list[str]) -> list[str]:
        return [failed_child]
 class AllForOneStrategy(SupervisorStrategy):
    """All children are affected when any one fails."""
    def apply_to_children(self, failed_child: str, all_children: list[str]) -> list[str]:
        return list(all_children)
@@ -0,0 +1,416 @@
 """ActorSystem — top-level actor container and lifecycle manager."""
 from __future__ import annotations
 import asyncio
 import logging
 from collections import deque
 from dataclasses import dataclass
 from typing import Any
 from .actor import Actor, ActorContext
 from .mailbox import Empty, Mailbox, MemoryMailbox
 from .middleware import ActorMailboxContext, Middleware, NextFn, build_middleware_chain
 from .ref import ActorRef, ActorStoppedError, MailboxFullError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
 from .supervision import Directive, SupervisorStrategy
 logger = logging.getLogger(__name__)
 # Timeout for middleware lifecycle hooks (on_started/on_stopped)
 _MIDDLEWARE_HOOK_TIMEOUT = 10.0
 # Maximum dead letters kept in memory
 _MAX_DEAD_LETTERS = 10000
 # Maximum consecutive failures before a root actor poison-quarantines a message
 _MAX_CONSECUTIVE_FAILURES = 10
@dataclass
 class DeadLetter:
    """A message that could not be delivered."""
    recipient: ActorRef
    message: Any
    sender: ActorRef | None
 class ActorSystem:
    """Top-level actor container.
    Manages root actors and provides the dead letter sink.
    """
    def __init__(
        self,
        name: str = "system",
        *,
        max_dead_letters: int = _MAX_DEAD_LETTERS,
        executor_workers: int | None = 4,
        reply_channel: ReplyChannel | None = None,
    ) -> None:
        import uuid as _uuid
        self.name = name
        self.system_id = f"{name}-{_uuid.uuid4().hex[:8]}"
        self._root_cells: dict[str, _ActorCell] = {}
        self._dead_letters: deque[DeadLetter] = deque(maxlen=max_dead_letters)
        self._on_dead_letter: list[Any] = []
        self._shutting_down = False
        self._replies = _ReplyRegistry()
        self._reply_channel = reply_channel or ReplyChannel()
        # Shared thread pool for actors to run blocking I/O
        from concurrent.futures import ThreadPoolExecutor
        self._executor = ThreadPoolExecutor(max_workers=executor_workers, thread_name_prefix=f"actor-{name}") if executor_workers else None
    async def spawn(
        self,
        actor_cls: type[Actor],
        name: str,
        *,
        mailbox_size: int = 256,
        mailbox: Mailbox | None = None,
        middlewares: list[Middleware] | None = None,
    ) -> ActorRef:
        """Spawn a root-level actor.
        Args:
            mailbox: Custom mailbox instance. If None, uses MemoryMailbox(mailbox_size).
        """
        if name in self._root_cells:
            raise ValueError(f"Root actor '{name}' already exists")
        cell = _ActorCell(
            actor_cls=actor_cls,
            name=name,
            parent=None,
            system=self,
            mailbox=mailbox or MemoryMailbox(mailbox_size),
            middlewares=middlewares or [],
        )
        self._root_cells[name] = cell
        try:
            await cell.start()
        except Exception:
            del self._root_cells[name]
            raise
        return cell.ref
    async def shutdown(self, *, timeout: float = 10.0) -> None:
        """Gracefully stop all actors."""
        self._shutting_down = True
        tasks = []
        for cell in list(self._root_cells.values()):
            cell.request_stop()
            if cell.task is not None:
                tasks.append(cell.task)
        if tasks:
            _, pending = await asyncio.wait(tasks, timeout=timeout)
            # Cancel tasks that didn't finish within the timeout to prevent zombie tasks
            for t in pending:
                t.cancel()
            if pending:
                await asyncio.wait(pending, timeout=2.0)
        self._root_cells.clear()
        self._replies.reject_all(ActorStoppedError("ActorSystem shutting down"))
        await self._reply_channel.stop_listener()
        if self._executor is not None:
            self._executor.shutdown(wait=False)
        logger.info("ActorSystem '%s' shut down (%d dead letters)", self.name, len(self._dead_letters))
    def _dead_letter(self, recipient: ActorRef, message: Any, sender: ActorRef | None) -> None:
        dl = DeadLetter(recipient=recipient, message=message, sender=sender)
        self._dead_letters.append(dl)
        for cb in self._on_dead_letter:
            try:
                cb(dl)
            except Exception:
                pass
        logger.debug("Dead letter: %s → %s", type(message).__name__, recipient.path)
    def on_dead_letter(self, callback: Any) -> None:
        """Register a dead letter listener."""
        self._on_dead_letter.append(callback)
    @property
    def dead_letters(self) -> list[DeadLetter]:
        return list(self._dead_letters)
 # ---------------------------------------------------------------------------
 # _ActorCell — internal runtime wrapper
 # ---------------------------------------------------------------------------
 class _ActorCell:
    """Runtime container for a single actor instance.
    Manages the mailbox, processing loop, children, and supervision.
    Not part of the public API.
    """
    def __init__(
        self,
        actor_cls: type[Actor],
        name: str,
        parent: _ActorCell | None,
        system: ActorSystem,
        mailbox: Mailbox,
        middlewares: list[Middleware] | None = None,
    ) -> None:
        self.actor_cls = actor_cls
        self.name = name
        self.parent = parent
        self.system = system
        self.children: dict[str, _ActorCell] = {}
        self.mailbox = mailbox
        self.ref = ActorRef(self)
        self.actor: Actor | None = None
        self.task: asyncio.Task[None] | None = None
        self.stopped = False
        self._supervisor_strategy: SupervisorStrategy | None = None
        self._middlewares = middlewares or []
        self._receive_chain: NextFn | None = None
        # Cache path (immutable after init — parent never changes)
        parts: list[str] = []
        cell: _ActorCell | None = self
        while cell is not None:
            parts.append(cell.name)
            cell = cell.parent
        parts.append(system.name)
        self.path = "/" + "/".join(reversed(parts))
    async def start(self) -> None:
        self.actor = self.actor_cls()
        self.actor.context = ActorContext(self)
        async def _inner_handler(_ctx: ActorMailboxContext, message: Any) -> Any:
            return await self.actor.on_receive(message)  # type: ignore[union-attr]
        if self._middlewares:
            self._receive_chain = build_middleware_chain(self._middlewares, _inner_handler)
        else:
            self._receive_chain = _inner_handler
        # Notify middleware of start (with timeout to prevent blocking)
        for mw in self._middlewares:
            try:
                await asyncio.wait_for(mw.on_started(self.ref), timeout=_MIDDLEWARE_HOOK_TIMEOUT)
            except asyncio.TimeoutError:
                logger.warning("Middleware %s.on_started timed out for %s", type(mw).__name__, self.path)
        await self.actor.on_started()
        self.task = asyncio.create_task(self._run(), name=f"actor:{self.path}")
    async def enqueue(self, msg: _Envelope | _Stop) -> None:
        # Try non-blocking first (fast path for MemoryMailbox)
        if self.mailbox.put_nowait(msg):
            return
        # Fallback to async put (required for Redis and other async backends)
        if not await self.mailbox.put(msg):
            if isinstance(msg, _Envelope) and msg.correlation_id is not None:
                self.system._replies.reject(msg.correlation_id, MailboxFullError(f"Mailbox full: {self.path}"))
            elif isinstance(msg, _Envelope):
                self.system._dead_letter(self.ref, msg.payload, msg.sender)
    def request_stop(self) -> None:
        """Request graceful shutdown.
        Tries put_nowait first. If that fails (full or unsupported backend),
        cancels the task directly so _run exits via CancelledError → finally → _shutdown.
        """
        if not self.stopped:
            if not self.mailbox.put_nowait(_Stop()):
                # Redis/async backends can't put_nowait — cancel the task
                if self.task is not None and not self.task.done():
                    self.task.cancel()
                else:
                    self.stopped = True
    async def spawn_child(
        self,
        actor_cls: type[Actor],
        name: str,
        *,
        mailbox_size: int = 256,
        mailbox: Mailbox | None = None,
        middlewares: list[Middleware] | None = None,
    ) -> ActorRef:
        if name in self.children:
            raise ValueError(f"Child '{name}' already exists under {self.path}")
        child = _ActorCell(
            actor_cls=actor_cls,
            name=name,
            parent=self,
            system=self.system,
            mailbox=mailbox or MemoryMailbox(mailbox_size),
            middlewares=middlewares or [],
        )
        self.children[name] = child
        try:
            await child.start()
        except Exception:
            del self.children[name]
            raise
        return child.ref
    # -- Processing loop -------------------------------------------------------
    async def _run(self) -> None:
        consecutive_failures = 0
        try:
            while not self.stopped:
                try:
                    msg = await self.mailbox.get()
                except asyncio.CancelledError:
                    break
                if isinstance(msg, _Stop):
                    break
                try:
                    if not isinstance(msg, _Envelope):
                        continue
                    msg_type = "ask" if msg.correlation_id else "tell"
                    ctx = ActorMailboxContext(self.ref, msg.sender, msg_type)
                    result = await self._receive_chain(ctx, msg.payload)  # type: ignore[misc]
                    if msg.correlation_id is not None:
                        reply = _ReplyMessage(msg.correlation_id, result=result)
                        await self.system._reply_channel.send_reply(msg.reply_to or self.system.system_id, reply, self.system._replies)
                    consecutive_failures = 0
                except Exception as exc:
                    if isinstance(msg, _Envelope) and msg.correlation_id is not None:
                        reply = _ReplyMessage(msg.correlation_id, error=str(exc), exception=exc)
                        await self.system._reply_channel.send_reply(msg.reply_to or self.system.system_id, reply, self.system._replies)
                    if self.parent is not None:
                        await self.parent._handle_child_failure(self, exc)
                    else:
                        consecutive_failures += 1
                        logger.error("Uncaught error in root actor %s (%d/%d): %s", self.path, consecutive_failures, _MAX_CONSECUTIVE_FAILURES, exc)
                        if consecutive_failures >= _MAX_CONSECUTIVE_FAILURES:
                            logger.error("Root actor %s hit consecutive failure limit — stopping", self.path)
                            break
        except asyncio.CancelledError:
            pass  # Fall through to _shutdown
        finally:
            await self._shutdown()
    async def _shutdown(self) -> None:
        self.stopped = True
        # Parallel child shutdown prevents cascading timeouts.
        child_tasks = []
        for child in list(self.children.values()):
            child.request_stop()
            if child.task is not None:
                child_tasks.append(child.task)
        if child_tasks:
            _, pending = await asyncio.wait(child_tasks, timeout=10.0)
            for t in pending:
                t.cancel()
                # Mark leaked children as stopped
                for child in self.children.values():
                    if child.task is t:
                        child.stopped = True
        # Drain mailbox → dead letters (use try/except to handle all backends)
        while True:
            try:
                msg = self.mailbox.get_nowait()
            except Empty:
                break
            if isinstance(msg, _Envelope):
                if msg.correlation_id is not None:
                    self.system._replies.reject(msg.correlation_id, ActorStoppedError(f"Actor {self.path} stopped"))
                else:
                    self.system._dead_letter(self.ref, msg.payload, msg.sender)
        # Lifecycle hook
        for mw in self._middlewares:
            try:
                await asyncio.wait_for(mw.on_stopped(self.ref), timeout=_MIDDLEWARE_HOOK_TIMEOUT)
            except asyncio.TimeoutError:
                logger.warning("Middleware %s.on_stopped timed out for %s", type(mw).__name__, self.path)
            except Exception:
                logger.exception("Error in middleware on_stopped for %s", self.path)
        if self.actor is not None:
            try:
                await self.actor.on_stopped()
            except Exception:
                logger.exception("Error in on_stopped for %s", self.path)
        # Remove from parent
        if self.parent is not None:
            self.parent.children.pop(self.name, None)
        # Close mailbox to release backend resources (e.g. Redis connections)
        try:
            await self.mailbox.close()
        except Exception:
            logger.exception("Error closing mailbox for %s", self.path)
    # -- Supervision -----------------------------------------------------------
    def _get_supervisor_strategy(self) -> SupervisorStrategy:
        if self._supervisor_strategy is None:
            self._supervisor_strategy = self.actor.supervisor_strategy()  # type: ignore[union-attr]
        return self._supervisor_strategy
    async def _handle_child_failure(self, child: _ActorCell, error: Exception) -> None:
        strategy = self._get_supervisor_strategy()
        directive = strategy.decide(error)
        affected = strategy.apply_to_children(child.name, list(self.children.keys()))
        if directive == Directive.resume:
            logger.info("Supervisor %s: resume %s after %s", self.path, child.path, type(error).__name__)
            return
        if directive == Directive.stop:
            for name in affected:
                c = self.children.get(name)
                if c is not None:
                    c.request_stop()
            logger.info("Supervisor %s: stop %s after %s", self.path, [self.children[n].path for n in affected if n in self.children], type(error).__name__)
            return
        if directive == Directive.escalate:
            # Stop the failing child, then propagate failure up the supervision chain.
            # We cannot use `raise error` here — that would crash the child's _run
            # loop instead of notifying the grandparent's supervisor.
            child.request_stop()
            if self.parent is not None:
                logger.info("Supervisor %s: escalate %s to grandparent %s", self.path, type(error).__name__, self.parent.path)
                await self.parent._handle_child_failure(self, error)
            else:
                logger.error("Uncaught escalation at root actor %s: %s", self.path, error)
            return
        if directive == Directive.restart:
            for name in affected:
                c = self.children.get(name)
                if c is None:
                    continue
                if not strategy.record_restart(name):
                    logger.warning("Supervisor %s: child %s exceeded restart limit — stopping", self.path, c.path)
                    c.request_stop()
                    continue
                await self._restart_child(c, error)
    async def _restart_child(self, child: _ActorCell, error: Exception) -> None:
        logger.info("Supervisor %s: restarting %s after %s", self.path, child.path, type(error).__name__)
        # Stop the old actor (but keep the cell and mailbox)
        old_actor = child.actor
        if old_actor is not None:
            try:
                await old_actor.on_stopped()
            except Exception:
                logger.exception("Error in on_stopped during restart of %s", child.path)
        # Notify middleware of restart (reset per-instance state)
        for mw in child._middlewares:
            try:
                await asyncio.wait_for(mw.on_restart(child.ref, error), timeout=_MIDDLEWARE_HOOK_TIMEOUT)
            except asyncio.TimeoutError:
                logger.warning("Middleware %s.on_restart timed out for %s", type(mw).__name__, child.path)
            except Exception:
                logger.exception("Error in middleware on_restart for %s", child.path)
        # Create fresh instance
        new_actor = child.actor_cls()
        new_actor.context = ActorContext(child)
        child.actor = new_actor
        try:
            await new_actor.on_restart(error)
            await new_actor.on_started()
        except Exception:
            logger.exception("Error during restart initialization of %s", child.path)
            child.request_stop()
@@ -19,7 +19,11 @@ dependencies = [
 ]
 [dependency-groups]
-dev = ["pytest>=8.0.0", "ruff>=0.14.11"]
+dev = [
    "pytest>=8.0.0",
    "redis>=7.4.0",
    "ruff>=0.14.11",
 ]
 [tool.uv.workspace]
 members = ["packages/harness"]
@@ -0,0 +1,268 @@
 """Actor framework benchmarks — throughput, latency, concurrency."""
 import asyncio
 import time
 from deerflow.actor import Actor, ActorSystem, Middleware
 class NoopActor(Actor):
    async def on_receive(self, message):
        return message
 class CounterActor(Actor):
    async def on_started(self):
        self.count = 0
    async def on_receive(self, message):
        if message == "inc":
            self.count += 1
            return self.count
        if message == "get":
            return self.count
        return self.count
 class ChainActor(Actor):
    """Forwards message to next actor in chain."""
    next_ref = None
    async def on_receive(self, message):
        if self.next_ref is not None:
            return await self.next_ref.ask(message)
        return message
 class ComputeActor(Actor):
    """Simulates CPU work via thread pool."""
    async def on_receive(self, message):
        def fib(n):
            a, b = 0, 1
            for _ in range(n):
                a, b = b, a + b
            return a
        return await self.context.run_in_executor(fib, message)
 class CountMiddleware(Middleware):
    def __init__(self):
        self.count = 0
    async def on_receive(self, ctx, message, next_fn):
        self.count += 1
        return await next_fn(ctx, message)
 def fmt(n):
    if n >= 1_000_000:
        return f"{n/1_000_000:.1f}M"
    if n >= 1_000:
        return f"{n/1_000:.0f}K"
    return str(n)
 async def bench_tell_throughput(n=100_000):
    """Measure tell (fire-and-forget) throughput."""
    system = ActorSystem("bench")
    ref = await system.spawn(CounterActor, "counter", mailbox_size=n + 10)
    start = time.perf_counter()
    for _ in range(n):
        await ref.tell("inc")
    # Wait for all messages to be processed
    count = await ref.ask("get", timeout=30.0)
    if count != n:
        print(f"  warning: expected {n} processed, got {count}")
    elapsed = time.perf_counter() - start
    await system.shutdown()
    rate = n / elapsed
    print(f"  tell throughput:     {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s")
 async def bench_ask_throughput(n=50_000):
    """Measure ask (request-response) throughput."""
    system = ActorSystem("bench")
    ref = await system.spawn(NoopActor, "echo")
    start = time.perf_counter()
    for _ in range(n):
        await ref.ask("ping")
    elapsed = time.perf_counter() - start
    await system.shutdown()
    rate = n / elapsed
    print(f"  ask throughput:      {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s")
 async def bench_ask_latency(n=10_000):
    """Measure ask round-trip latency percentiles."""
    system = ActorSystem("bench")
    ref = await system.spawn(NoopActor, "echo")
    # Warmup
    for _ in range(100):
        await ref.ask("warmup")
    latencies = []
    for _ in range(n):
        t0 = time.perf_counter()
        await ref.ask("ping")
        latencies.append((time.perf_counter() - t0) * 1_000_000)  # microseconds
    await system.shutdown()
    latencies.sort()
    p50 = latencies[len(latencies) // 2]
    p99 = latencies[int(len(latencies) * 0.99)]
    p999 = latencies[int(len(latencies) * 0.999)]
    print(f"  ask latency:        p50={p50:.0f}µs  p99={p99:.0f}µs  p99.9={p999:.0f}µs")
 async def bench_concurrent_actors(num_actors=1000, msgs_per_actor=100):
    """Measure throughput with many concurrent actors."""
    system = ActorSystem("bench")
    refs = []
    for i in range(num_actors):
        refs.append(await system.spawn(CounterActor, f"a{i}", mailbox_size=msgs_per_actor + 10))
    start = time.perf_counter()
    async def send_batch(ref, n):
        for i in range(n):
            await ref.tell("inc")
            # Yield control every 50 msgs so actor loops can drain
            if i % 50 == 49:
                await asyncio.sleep(0)
        return await ref.ask("get", timeout=30.0)
    results = await asyncio.gather(*[send_batch(r, msgs_per_actor) for r in refs])
    elapsed = time.perf_counter() - start
    total = num_actors * msgs_per_actor
    delivered = sum(results)
    rate = total / elapsed
    loss = total - delivered
    print(f"  {num_actors} actors × {msgs_per_actor} msgs: {fmt(total)} in {elapsed:.2f}s = {fmt(int(rate))}/s (loss: {loss})")
    await system.shutdown()
 async def bench_actor_chain(depth=100):
    """Measure ask latency through a chain of actors (hop overhead)."""
    system = ActorSystem("bench")
    refs = []
    for i in range(depth):
        refs.append(await system.spawn(ChainActor, f"c{i}"))
    # Link chain: c0 → c1 → ... → c99
    for i in range(depth - 1):
        refs[i]._cell.actor.next_ref = refs[i + 1]
    start = time.perf_counter()
    result = await refs[0].ask("ping", timeout=30.0)
    elapsed = time.perf_counter() - start
    assert result == "ping"
    per_hop = elapsed / depth * 1_000_000  # µs
    print(f"  chain {depth} hops:     {elapsed*1000:.1f}ms total, {per_hop:.0f}µs/hop")
    await system.shutdown()
 async def bench_middleware_overhead(n=50_000):
    """Measure overhead of middleware pipeline."""
    mw = CountMiddleware()
    system_plain = ActorSystem("plain")
    ref_plain = await system_plain.spawn(NoopActor, "echo")
    system_mw = ActorSystem("mw")
    ref_mw = await system_mw.spawn(NoopActor, "echo", middlewares=[mw])
    # Plain
    t0 = time.perf_counter()
    for _ in range(n):
        await ref_plain.ask("p")
    plain_elapsed = time.perf_counter() - t0
    # With middleware
    t0 = time.perf_counter()
    for _ in range(n):
        await ref_mw.ask("p")
    mw_elapsed = time.perf_counter() - t0
    overhead = ((mw_elapsed - plain_elapsed) / plain_elapsed) * 100
    print(f"  middleware overhead: {overhead:+.1f}% ({fmt(n)} ask calls, 1 middleware)")
    await system_plain.shutdown()
    await system_mw.shutdown()
 async def bench_executor_parallel(num_tasks=16):
    """Measure thread pool parallelism with CPU work."""
    system = ActorSystem("bench", executor_workers=8)
    refs = [await system.spawn(ComputeActor, f"cpu{i}") for i in range(num_tasks)]
    start = time.perf_counter()
    results = await asyncio.gather(*[r.ask(10_000, timeout=30.0) for r in refs])
    elapsed = time.perf_counter() - start
    print(f"  executor parallel:  {num_tasks} fib(10K) in {elapsed*1000:.0f}ms ({num_tasks/elapsed:.0f} tasks/s)")
    await system.shutdown()
 async def bench_spawn_teardown(n=5000):
    """Measure actor spawn + shutdown speed."""
    system = ActorSystem("bench")
    start = time.perf_counter()
    refs = []
    for i in range(n):
        refs.append(await system.spawn(NoopActor, f"a{i}"))
    spawn_elapsed = time.perf_counter() - start
    start = time.perf_counter()
    await system.shutdown()
    shutdown_elapsed = time.perf_counter() - start
    print(f"  spawn {n}:          {spawn_elapsed*1000:.0f}ms ({n/spawn_elapsed:.0f}/s)")
    print(f"  shutdown {n}:       {shutdown_elapsed*1000:.0f}ms")
 async def main():
    print("=" * 60)
    print("  Actor Framework Benchmarks")
    print("=" * 60)
    print()
    print("[Throughput]")
    await bench_tell_throughput()
    await bench_ask_throughput()
    print()
    print("[Latency]")
    await bench_ask_latency()
    await bench_actor_chain()
    print()
    print("[Concurrency]")
    await bench_concurrent_actors()
    await bench_executor_parallel()
    print()
    print("[Overhead]")
    await bench_middleware_overhead()
    print()
    print("[Lifecycle]")
    await bench_spawn_teardown()
    print()
    print("=" * 60)
    print("  Done")
    print("=" * 60)
 if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,273 @@
 """RedisMailbox benchmark: throughput, latency, concurrency, backpressure."""
 import asyncio
 import time
 import redis.asyncio as redis
 from deerflow.actor import Actor, ActorSystem
 from deerflow.actor.mailbox_redis import RedisMailbox
 class EchoActor(Actor):
    async def on_receive(self, message):
        return message
 class CounterActor(Actor):
    async def on_started(self):
        self.count = 0
    async def on_receive(self, message):
        if message == "inc":
            self.count += 1
            return self.count
        if message == "get":
            return self.count
        return self.count
 def fmt(n):
    if n >= 1_000_000:
        return f"{n/1_000_000:.1f}M"
    if n >= 1_000:
        return f"{n/1_000:.0f}K"
    return str(n)
 async def _redis_client():
    client = redis.Redis(host="127.0.0.1", port=6379, decode_responses=False)
    await client.ping()
    return client
 async def bench_redis_ask_throughput(n=20_000):
    client = await _redis_client()
    queue = "deerflow:bench:redis:ask"
    await client.delete(queue)
    mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
    system = ActorSystem("bench-redis")
    ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
    start = time.perf_counter()
    for _ in range(n):
        await ref.ask("ping", timeout=5.0)
    elapsed = time.perf_counter() - start
    await system.shutdown()
    rate = n / elapsed
    print(f"  redis ask throughput:  {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s")
 async def bench_redis_tell_throughput(n=50_000):
    client = await _redis_client()
    queue = "deerflow:bench:redis:tell"
    await client.delete(queue)
    mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
    system = ActorSystem("bench-redis")
    ref = await system.spawn(CounterActor, "counter", mailbox=mailbox)
    start = time.perf_counter()
    for _ in range(n):
        await ref.tell("inc")
    count = await ref.ask("get", timeout=30.0)
    elapsed = time.perf_counter() - start
    await system.shutdown()
    rate = n / elapsed
    loss = n - count
    print(f"  redis tell throughput: {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s (loss: {loss})")
 async def bench_redis_ask_latency(n=5_000):
    client = await _redis_client()
    queue = "deerflow:bench:redis:latency"
    await client.delete(queue)
    mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
    system = ActorSystem("bench-redis")
    ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
    for _ in range(100):
        await ref.ask("warmup", timeout=5.0)
    latencies = []
    for _ in range(n):
        t0 = time.perf_counter()
        await ref.ask("ping", timeout=5.0)
        latencies.append((time.perf_counter() - t0) * 1_000_000)
    await system.shutdown()
    latencies.sort()
    p50 = latencies[len(latencies) // 2]
    p99 = latencies[int(len(latencies) * 0.99)]
    p999 = latencies[int(len(latencies) * 0.999)]
    print(f"  redis ask latency:     p50={p50:.0f}µs  p99={p99:.0f}µs  p99.9={p999:.0f}µs")
 async def bench_redis_concurrent_actors(num_actors=200, msgs_per_actor=100):
    client = await _redis_client()
    system = ActorSystem("bench-redis")
    refs = []
    for i in range(num_actors):
        q = f"deerflow:bench:redis:conc:{i}"
        await client.delete(q)
        mailbox = RedisMailbox(client.connection_pool, q, brpop_timeout=0.05)
        refs.append(await system.spawn(CounterActor, f"a{i}", mailbox=mailbox))
    start = time.perf_counter()
    async def send_batch(ref, n):
        for i in range(n):
            await ref.tell("inc")
            if i % 50 == 49:
                await asyncio.sleep(0)
        return await ref.ask("get", timeout=30.0)
    results = await asyncio.gather(*[send_batch(r, msgs_per_actor) for r in refs])
    elapsed = time.perf_counter() - start
    total = num_actors * msgs_per_actor
    delivered = sum(results)
    rate = total / elapsed
    loss = total - delivered
    print(
        f"  redis concurrency:     {num_actors} actors × {msgs_per_actor} msgs = {fmt(total)} in {elapsed:.2f}s = {fmt(int(rate))}/s (loss: {loss})"
    )
    await system.shutdown()
 async def bench_redis_maxlen_backpressure(total_messages=20_000, maxlen=100, ask_timeout=0.01, ask_concurrency=200):
    client = await _redis_client()
    queue_tell = "deerflow:bench:redis:bp:tell"
    await client.delete(queue_tell)
    mailbox_tell = RedisMailbox(client.connection_pool, queue_tell, maxlen=maxlen, brpop_timeout=0.05)
    system_tell = ActorSystem("bench-redis-bp-tell")
    ref_tell = await system_tell.spawn(CounterActor, "counter", mailbox=mailbox_tell)
    # Saturate with tell: dropped messages become dead letters
    for _ in range(total_messages):
        await ref_tell.tell("inc")
    await asyncio.sleep(0.2)
    processed = await ref_tell.ask("get", timeout=10.0)
    dropped = len(system_tell.dead_letters)
    drop_rate = dropped / total_messages if total_messages else 0.0
    print(
        f"  redis maxlen tell:     maxlen={maxlen}, sent={fmt(total_messages)}, processed={fmt(processed)}, dropped={fmt(dropped)} ({drop_rate:.1%})"
    )
    await system_tell.shutdown()
    # Ask timeout rate under pressure
    queue_ask = "deerflow:bench:redis:bp:ask"
    await client.delete(queue_ask)
    mailbox_ask = RedisMailbox(client.connection_pool, queue_ask, maxlen=maxlen, brpop_timeout=0.05)
    system_ask = ActorSystem("bench-redis-bp-ask")
    ref_ask = await system_ask.spawn(EchoActor, "echo", mailbox=mailbox_ask)
    async def one_ask(i):
        try:
            await ref_ask.ask(i, timeout=ask_timeout)
            return True, None
        except asyncio.TimeoutError:
            return False, "timeout"
        except Exception:  # MailboxFullError or other rejection
            return False, "rejected"
    sem = asyncio.Semaphore(ask_concurrency)
    async def one_ask_limited(i):
        async with sem:
            return await one_ask(i)
    results = await asyncio.gather(*[one_ask_limited(i) for i in range(total_messages)])
    ok = sum(1 for r, _ in results if r)
    timeout_count = sum(1 for _, reason in results if reason == "timeout")
    rejected_count = sum(1 for _, reason in results if reason == "rejected")
    fail_rate = (total_messages - ok) / total_messages if total_messages else 0.0
    print(
        f"  redis maxlen ask:      maxlen={maxlen}, total={fmt(total_messages)}, ok={fmt(ok)}, "
        f"timeout={fmt(timeout_count)}, rejected={fmt(rejected_count)} (fail: {fail_rate:.1%}), "
        f"ask_timeout={ask_timeout}s, concurrency={ask_concurrency}"
    )
    await system_ask.shutdown()
 async def bench_redis_put_batch(n=50_000, batch_size=100):
    """put_batch: N messages in N/batch_size round-trips instead of N."""
    client = await _redis_client()
    queue = "deerflow:bench:redis:batch"
    await client.delete(queue)
    mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
    system = ActorSystem("bench-redis-batch")
    ref = await system.spawn(CounterActor, "counter", mailbox=mailbox)
    from deerflow.actor.ref import _Envelope
    batches = [
        [_Envelope(payload="inc") for _ in range(batch_size)]
        for _ in range(n // batch_size)
    ]
    t0 = time.perf_counter()
    for batch in batches:
        await mailbox.put_batch(batch)
    enqueue_elapsed = time.perf_counter() - t0
    count = await ref.ask("get", timeout=60.0)
    total_elapsed = time.perf_counter() - t0
    loss = n - count
    enqueue_rate = n / enqueue_elapsed
    print(
        f"  redis put_batch push:  {fmt(n)} msgs in {enqueue_elapsed:.3f}s = {fmt(int(enqueue_rate))}/s "
        f"(batch={batch_size}, round-trips={n // batch_size})"
    )
    print(
        f"  redis put_batch total: end-to-end {total_elapsed:.2f}s = {fmt(int(n / total_elapsed))}/s "
        f"(consume bottleneck, loss={loss})"
    )
    await system.shutdown()
 async def main():
    print("=" * 72)
    print("  RedisMailbox Benchmarks")
    print("=" * 72)
    print()
    await bench_redis_tell_throughput()
    await bench_redis_ask_throughput()
    await bench_redis_ask_latency()
    await bench_redis_concurrent_actors()
    await bench_redis_put_batch()
    await bench_redis_maxlen_backpressure()
    print()
    print("=" * 72)
    print("  Done")
    print("=" * 72)
 if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,534 @@
 """Tests for the async Actor framework."""
 import asyncio
 import pytest
 from deerflow.actor import (
    Actor,
    ActorRef,
    ActorSystem,
    AllForOneStrategy,
    Directive,
    Middleware,
    OneForOneStrategy,
 )
 from deerflow.actor.ref import ActorStoppedError
 # ---------------------------------------------------------------------------
 # Basic actors for testing
 # ---------------------------------------------------------------------------
 class EchoActor(Actor):
    async def on_receive(self, message):
        return message
 class CounterActor(Actor):
    async def on_started(self):
        self.count = 0
    async def on_receive(self, message):
        if message == "inc":
            self.count += 1
        elif message == "get":
            return self.count
 class CrashActor(Actor):
    async def on_receive(self, message):
        if message == "crash":
            raise ValueError("boom")
        return "ok"
 class ParentActor(Actor):
    def __init__(self):
        self.child_ref: ActorRef | None = None
        self.restarts = 0
    def supervisor_strategy(self):
        return OneForOneStrategy(max_restarts=3, within_seconds=60)
    async def on_started(self):
        self.child_ref = await self.context.spawn(CrashActor, "child")
    async def on_receive(self, message):
        if message == "get_child":
            return self.child_ref
 class StopOnCrashParent(Actor):
    def supervisor_strategy(self):
        return OneForOneStrategy(decider=lambda _: Directive.stop)
    async def on_started(self):
        self.child_ref = await self.context.spawn(CrashActor, "child")
    async def on_receive(self, message):
        if message == "get_child":
            return self.child_ref
 class AllForOneParent(Actor):
    def supervisor_strategy(self):
        return AllForOneStrategy(max_restarts=2, within_seconds=60)
    async def on_started(self):
        self.c1 = await self.context.spawn(CounterActor, "c1")
        self.c2 = await self.context.spawn(CrashActor, "c2")
    async def on_receive(self, message):
        if message == "get_children":
            return (self.c1, self.c2)
 class LifecycleActor(Actor):
    started = False
    stopped = False
    restarted_with: Exception | None = None
    async def on_started(self):
        LifecycleActor.started = True
    async def on_stopped(self):
        LifecycleActor.stopped = True
    async def on_restart(self, error):
        LifecycleActor.restarted_with = error
    async def on_receive(self, message):
        if message == "crash":
            raise RuntimeError("lifecycle crash")
        return "alive"
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 class TestBasicMessaging:
    @pytest.mark.anyio
    async def test_tell_and_ask(self):
        system = ActorSystem("test")
        ref = await system.spawn(EchoActor, "echo")
        result = await ref.ask("hello")
        assert result == "hello"
        await system.shutdown()
    @pytest.mark.anyio
    async def test_ask_timeout(self):
        class SlowActor(Actor):
            async def on_receive(self, message):
                await asyncio.sleep(10)
        system = ActorSystem("test")
        ref = await system.spawn(SlowActor, "slow")
        with pytest.raises(asyncio.TimeoutError):
            await ref.ask("hi", timeout=0.1)
        await system.shutdown()
    @pytest.mark.anyio
    async def test_tell_fire_and_forget(self):
        system = ActorSystem("test")
        ref = await system.spawn(CounterActor, "counter")
        await ref.tell("inc")
        await ref.tell("inc")
        await ref.tell("inc")
        # Give the actor time to process
        await asyncio.sleep(0.05)
        count = await ref.ask("get")
        assert count == 3
        await system.shutdown()
    @pytest.mark.anyio
    async def test_ask_stopped_actor(self):
        system = ActorSystem("test")
        ref = await system.spawn(EchoActor, "echo")
        ref.stop()
        await asyncio.sleep(0.05)
        with pytest.raises(ActorStoppedError):
            await ref.ask("hello")
        await system.shutdown()
    @pytest.mark.anyio
    async def test_tell_stopped_actor_goes_to_dead_letters(self):
        system = ActorSystem("test")
        ref = await system.spawn(EchoActor, "echo")
        ref.stop()
        await asyncio.sleep(0.05)
        await ref.tell("orphan")
        assert len(system.dead_letters) >= 1
        await system.shutdown()
 class TestActorPath:
    @pytest.mark.anyio
    async def test_root_actor_path(self):
        system = ActorSystem("app")
        ref = await system.spawn(EchoActor, "echo")
        assert ref.path == "/app/echo"
        await system.shutdown()
    @pytest.mark.anyio
    async def test_child_actor_path(self):
        system = ActorSystem("app")
        parent = await system.spawn(ParentActor, "parent")
        child: ActorRef = await parent.ask("get_child")
        assert child.path == "/app/parent/child"
        await system.shutdown()
 class TestLifecycle:
    @pytest.mark.anyio
    async def test_on_started_called(self):
        LifecycleActor.started = False
        system = ActorSystem("test")
        await system.spawn(LifecycleActor, "lc")
        assert LifecycleActor.started is True
        await system.shutdown()
    @pytest.mark.anyio
    async def test_on_stopped_called(self):
        LifecycleActor.stopped = False
        system = ActorSystem("test")
        ref = await system.spawn(LifecycleActor, "lc")
        ref.stop()
        await asyncio.sleep(0.1)
        assert LifecycleActor.stopped is True
        await system.shutdown()
    @pytest.mark.anyio
    async def test_shutdown_stops_all(self):
        system = ActorSystem("test")
        r1 = await system.spawn(EchoActor, "a")
        r2 = await system.spawn(EchoActor, "b")
        await system.shutdown()
        assert not r1.is_alive
        assert not r2.is_alive
 class TestSupervision:
    @pytest.mark.anyio
    async def test_restart_on_crash(self):
        system = ActorSystem("test")
        parent = await system.spawn(ParentActor, "parent")
        child: ActorRef = await parent.ask("get_child")
        # Crash the child
        with pytest.raises(ValueError, match="boom"):
            await child.ask("crash")
        await asyncio.sleep(0.1)
        # Child should still be alive (restarted)
        assert child.is_alive
        result = await child.ask("safe")
        assert result == "ok"
        await system.shutdown()
    @pytest.mark.anyio
    async def test_stop_directive(self):
        system = ActorSystem("test")
        parent = await system.spawn(StopOnCrashParent, "parent")
        child: ActorRef = await parent.ask("get_child")
        with pytest.raises(ValueError, match="boom"):
            await child.ask("crash")
        await asyncio.sleep(0.1)
        assert not child.is_alive
        await system.shutdown()
    @pytest.mark.anyio
    async def test_restart_limit_exceeded(self):
        system = ActorSystem("test")
        class StrictParent(Actor):
            def supervisor_strategy(self):
                return OneForOneStrategy(max_restarts=2, within_seconds=60)
            async def on_started(self):
                self.child_ref = await self.context.spawn(CrashActor, "child")
            async def on_receive(self, message):
                return self.child_ref
        parent = await system.spawn(StrictParent, "parent")
        child: ActorRef = await parent.ask("any")
        # Exhaust restart limit
        for _ in range(3):
            try:
                await child.ask("crash")
            except (ValueError, ActorStoppedError):
                pass
            await asyncio.sleep(0.05)
        # After exceeding limit, child should be stopped
        assert not child.is_alive
        await system.shutdown()
    @pytest.mark.anyio
    async def test_all_for_one_restarts_siblings(self):
        system = ActorSystem("test")
        parent = await system.spawn(AllForOneParent, "parent")
        c1, c2 = await parent.ask("get_children")
        # Increment counter on c1
        await c1.tell("inc")
        await asyncio.sleep(0.05)
        count_before = await c1.ask("get")
        assert count_before == 1
        # Crash c2 → AllForOne should restart both
        try:
            await c2.ask("crash")
        except ValueError:
            pass
        await asyncio.sleep(0.1)
        # c1 was restarted, counter should be 0
        count_after = await c1.ask("get")
        assert count_after == 0
        await system.shutdown()
 class TestDeadLetters:
    @pytest.mark.anyio
    async def test_dead_letter_callback(self):
        received = []
        system = ActorSystem("test")
        system.on_dead_letter(lambda dl: received.append(dl))
        ref = await system.spawn(EchoActor, "echo")
        ref.stop()
        await asyncio.sleep(0.05)
        await ref.tell("orphan")
        assert len(received) >= 1
        assert received[-1].message == "orphan"
        await system.shutdown()
 class TestDuplicateNames:
    @pytest.mark.anyio
    async def test_duplicate_root_name_raises(self):
        system = ActorSystem("test")
        await system.spawn(EchoActor, "echo")
        with pytest.raises(ValueError, match="already exists"):
            await system.spawn(EchoActor, "echo")
        await system.shutdown()
 # ---------------------------------------------------------------------------
 # Middleware tests
 # ---------------------------------------------------------------------------
 class LogMiddleware(Middleware):
    def __init__(self):
        self.log: list[str] = []
    async def on_receive(self, ctx, message, next_fn):
        self.log.append(f"before:{message}")
        result = await next_fn(ctx, message)
        self.log.append(f"after:{result}")
        return result
    async def on_started(self, actor_ref):
        self.log.append("started")
    async def on_stopped(self, actor_ref):
        self.log.append("stopped")
 class TransformMiddleware(Middleware):
    """Uppercases string messages before passing to actor."""
    async def on_receive(self, ctx, message, next_fn):
        if isinstance(message, str):
            message = message.upper()
        return await next_fn(ctx, message)
 class TestExecutor:
    @pytest.mark.anyio
    async def test_run_in_executor(self):
        """Blocking function runs in thread pool without blocking event loop."""
        import time
        class BlockingActor(Actor):
            async def on_receive(self, message):
                # Simulate blocking I/O via thread pool
                result = await self.context.run_in_executor(time.sleep, 0.01)
                return "done"
        system = ActorSystem("test", executor_workers=2)
        ref = await system.spawn(BlockingActor, "blocker")
        result = await ref.ask("go", timeout=5.0)
        assert result == "done"
        await system.shutdown()
    @pytest.mark.anyio
    async def test_concurrent_blocking_calls(self):
        """Multiple actors can run blocking I/O concurrently via shared pool."""
        import time
        class SlowActor(Actor):
            async def on_receive(self, message):
                await self.context.run_in_executor(time.sleep, 0.1)
                return "ok"
        system = ActorSystem("test", executor_workers=4)
        refs = [await system.spawn(SlowActor, f"s{i}") for i in range(4)]
        start = time.monotonic()
        results = await asyncio.gather(*[r.ask("go", timeout=5.0) for r in refs])
        elapsed = time.monotonic() - start
        assert all(r == "ok" for r in results)
        # 4 parallel × 0.1s should finish in ~0.1-0.2s, not 0.4s
        assert elapsed < 0.3
        await system.shutdown()
 class TestMiddleware:
    @pytest.mark.anyio
    async def test_middleware_intercepts_messages(self):
        mw = LogMiddleware()
        system = ActorSystem("test")
        ref = await system.spawn(EchoActor, "echo", middlewares=[mw])
        result = await ref.ask("hello")
        assert result == "hello"
        assert "before:hello" in mw.log
        assert "after:hello" in mw.log
        await system.shutdown()
    @pytest.mark.anyio
    async def test_middleware_lifecycle_hooks(self):
        mw = LogMiddleware()
        system = ActorSystem("test")
        ref = await system.spawn(EchoActor, "echo", middlewares=[mw])
        assert "started" in mw.log
        ref.stop()
        await asyncio.sleep(0.1)
        assert "stopped" in mw.log
        await system.shutdown()
    @pytest.mark.anyio
    async def test_middleware_chain_order(self):
        """First middleware wraps outermost — sees original message."""
        mw1 = LogMiddleware()
        mw2 = TransformMiddleware()
        system = ActorSystem("test")
        # Chain: mw1(mw2(actor)). mw1 logs original, mw2 uppercases, actor echoes
        ref = await system.spawn(EchoActor, "echo", middlewares=[mw1, mw2])
        result = await ref.ask("hello")
        assert result == "HELLO"  # TransformMiddleware uppercased
        assert "before:hello" in mw1.log  # LogMiddleware saw original
        assert "after:HELLO" in mw1.log  # LogMiddleware saw transformed result
        await system.shutdown()
    @pytest.mark.anyio
    async def test_middleware_with_tell(self):
        mw = LogMiddleware()
        system = ActorSystem("test")
        await system.spawn(CounterActor, "counter", middlewares=[mw])
        # tell goes through middleware too
        assert any("before:" in entry for entry in mw.log) is False
        await system.shutdown()
    @pytest.mark.anyio
    async def test_middleware_on_restart_hook(self):
        """on_restart is called on the middleware when a child actor is restarted."""
        class RestartTrackingMiddleware(Middleware):
            def __init__(self):
                self.restart_errors: list[Exception] = []
            async def on_restart(self, actor_ref, error):
                self.restart_errors.append(error)
        mw = RestartTrackingMiddleware()
        class ChildSpawningParent(Actor):
            async def on_receive(self, message):
                if message == "spawn":
                    ref = await self.context.spawn(CrashActor, "child", middlewares=[mw])
                    return ref
        system = ActorSystem("test")
        parent = await system.spawn(ChildSpawningParent, "parent")
        child = await parent.ask("spawn")
        # Crash the child — parent supervisor will restart it
        try:
            await child.ask("crash")
        except ValueError:
            pass
        await asyncio.sleep(0.1)
        assert len(mw.restart_errors) == 1
        assert isinstance(mw.restart_errors[0], ValueError)
        await system.shutdown()
 class TestAskErrorPropagation:
    @pytest.mark.anyio
    async def test_ask_propagates_actor_exception(self):
        """ask() re-raises the original exception type when on_receive crashes."""
        class BoomActor(Actor):
            async def on_receive(self, message):
                raise ValueError("intentional crash")
        system = ActorSystem("test")
        ref = await system.spawn(BoomActor, "boom")
        with pytest.raises(ValueError, match="intentional crash"):
            await ref.ask("trigger")
        await system.shutdown()
    @pytest.mark.anyio
    async def test_ask_propagates_exception_while_supervised(self):
        """ask() gets the exception even when the actor is supervised (not stopped)."""
        class SometimesCrashActor(Actor):
            async def on_receive(self, message):
                if message == "crash":
                    raise RuntimeError("supervised crash")
                return "ok"
        system = ActorSystem("test")
        ref = await system.spawn(SometimesCrashActor, "sca")
        with pytest.raises(RuntimeError, match="supervised crash"):
            await ref.ask("crash")
        # Root actor keeps running after a crash (consecutive_failures, not restart)
        result = await ref.ask("hello", timeout=2.0)
        assert result == "ok"
        await system.shutdown()
    @pytest.mark.anyio
    async def test_ask_timeout_late_reply_no_exception(self):
        """Late reply arriving after ask() timeout is silently dropped — no exception, no orphaned future."""
        class SlowActor(Actor):
            async def on_receive(self, message):
                await asyncio.sleep(0.3)
                return "late"
        system = ActorSystem("test")
        ref = await system.spawn(SlowActor, "slow")
        with pytest.raises(asyncio.TimeoutError):
            await ref.ask("go", timeout=0.05)
        # Wait for actor to finish processing — late reply arrives, should be a no-op
        await asyncio.sleep(0.4)
        # System still functional: no orphaned futures, no leaked state
        assert ref.is_alive
        result = await ref.ask("go", timeout=2.0)
        assert result == "late"
        await system.shutdown()
@@ -0,0 +1,89 @@
 import asyncio
 import pytest
 from deerflow.actor import Actor, ActorSystem, MailboxFullError
 from deerflow.actor.mailbox import BACKPRESSURE_BLOCK, BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL, MemoryMailbox
 class SlowActor(Actor):
    async def on_started(self):
        self.count = 0
    async def on_receive(self, message):
        if message == 'inc':
            await asyncio.sleep(0.01)
            self.count += 1
            return None
        if message == 'get':
            return self.count
        return None
@pytest.mark.anyio
 async def test_memory_mailbox_drop_new_policy_drops_tell_to_dead_letters():
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_DROP_NEW),
    )
    # Overfill quickly
    for _ in range(20):
        await ref.tell('inc')
    await asyncio.sleep(0.4)
    count = await ref.ask('get', timeout=2.0)
    await system.shutdown()
    # Some messages should be dropped under drop_new
    assert count < 20
    assert len(system.dead_letters) > 0
@pytest.mark.anyio
 async def test_memory_mailbox_fail_policy_rejects_ask_when_full():
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_FAIL),
    )
    # Fill queue with tell first
    await ref.tell('inc')
    # Then ask may be rejected when queue still full
    got_reject = False
    for _ in range(30):
        try:
            await ref.ask('inc', timeout=0.02)
        except MailboxFullError:
            got_reject = True
            break
        except asyncio.TimeoutError:
            pass
    await system.shutdown()
    assert got_reject
@pytest.mark.anyio
 async def test_memory_mailbox_block_policy_eventually_accepts():
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_BLOCK),
    )
    for _ in range(10):
        await ref.tell('inc')
    await asyncio.sleep(0.25)
    count = await ref.ask('get', timeout=2.0)
    await system.shutdown()
    # Block policy should avoid dropping on tell path
    assert count == 10
@@ -0,0 +1,62 @@
 import asyncio
 import pytest
 from deerflow.actor import Actor, ActorSystem, IdempotentActorMixin, RetryEnvelope, ask_with_retry
 class FlakyIdempotentActor(IdempotentActorMixin, Actor):
    async def on_started(self):
        self.calls = 0
    async def on_receive(self, message):
        return await self.handle_idempotent(message, self._handle)
    async def _handle(self, payload):
        self.calls += 1
        if payload == 'flaky' and self.calls == 1:
            await asyncio.sleep(0.02)
            return 'late'
        return f"ok:{payload}"
@pytest.mark.anyio
 async def test_ask_with_retry_timeout_raises():
    system = ActorSystem('retry')
    ref = await system.spawn(FlakyIdempotentActor, 'a')
    with pytest.raises(asyncio.TimeoutError):
        await ask_with_retry(
            ref,
            'flaky',
            timeout=0.005,
            max_attempts=3,
            base_backoff_s=0.001,
            max_backoff_s=0.005,
            jitter_ratio=0.0,
            idempotency_key='k1',
        )
    # This helper retries timeout, but if each attempt times out it should raise.
    assert ref.is_alive
    await system.shutdown()
@pytest.mark.anyio
 async def test_idempotent_envelope_returns_cached_result():
    system = ActorSystem('retry')
    ref = await system.spawn(FlakyIdempotentActor, 'a')
    m1 = RetryEnvelope.wrap('x', idempotency_key='same-key')
    m2 = RetryEnvelope.wrap('x', idempotency_key='same-key', attempt=2, max_attempts=3)
    r1 = await ref.ask(m1, timeout=1.0)
    r2 = await ref.ask(m2, timeout=1.0)
    assert r1 == 'ok:x'
    assert r2 == 'ok:x'
    # handler should run once due to idempotency cache
    actor = ref._cell.actor
    assert actor.calls == 1
    await system.shutdown()
@@ -0,0 +1,83 @@
 import asyncio
 import pytest
 redis = pytest.importorskip("redis.asyncio")
 from deerflow.actor.mailbox_redis import RedisMailbox
 from deerflow.actor.ref import _Envelope, _Stop
 pytestmark = pytest.mark.anyio
 async def _make_mailbox(queue_name: str, *, maxlen: int = 0) -> RedisMailbox:
    client = redis.Redis(host="127.0.0.1", port=6379, decode_responses=False)
    await client.ping()
    await client.delete(queue_name)
    mailbox = RedisMailbox(client.connection_pool, queue_name, maxlen=maxlen, brpop_timeout=0.2)
    return mailbox
 async def test_roundtrip_envelope_and_stop():
    queue = "deerflow:test:redis-mailbox:roundtrip"
    mailbox = await _make_mailbox(queue)
    try:
        msg = _Envelope(payload={"k": "v"}, correlation_id="c1", reply_to="sysA")
        ok = await mailbox.put(msg)
        assert ok is True
        got = await mailbox.get()
        assert isinstance(got, _Envelope)
        assert got.payload == {"k": "v"}
        assert got.correlation_id == "c1"
        assert got.reply_to == "sysA"
        ok = await mailbox.put(_Stop())
        assert ok is True
        stop = await mailbox.get()
        assert isinstance(stop, _Stop)
    finally:
        await mailbox.close()
 async def test_bounded_queue_rejects_when_full():
    queue = "deerflow:test:redis-mailbox:bounded"
    mailbox = await _make_mailbox(queue, maxlen=1)
    try:
        assert await mailbox.put(_Envelope("m1")) is True
        assert await mailbox.put(_Envelope("m2")) is False
    finally:
        await mailbox.close()
 async def test_put_nowait_and_get_nowait_contract():
    queue = "deerflow:test:redis-mailbox:nowait"
    mailbox = await _make_mailbox(queue)
    try:
        assert mailbox.put_nowait(_Envelope("x")) is False
        with pytest.raises(Exception, match="does not support synchronous get_nowait"):
            mailbox.get_nowait()
    finally:
        await mailbox.close()
 async def test_system_enqueue_fallback_with_async_mailbox():
    from deerflow.actor import Actor, ActorSystem
    class EchoActor(Actor):
        async def on_receive(self, message):
            return message
    queue = "deerflow:test:redis-mailbox:system-fallback"
    mailbox = await _make_mailbox(queue)
    system = ActorSystem("redis-test")
    ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
    try:
        # This exercises _ActorCell.enqueue fallback path:
        # put_nowait() -> False, then await put() -> True
        result = await ref.ask("hello", timeout=3.0)
        assert result == "hello"
    finally:
        await system.shutdown()