fix(actor): harden lifecycle, supervision, Redis mailbox, and add comprehensive tests

- Fix spawn() zombie cell: clean up registry on start() failure
- Fix shutdown(): cancel + await tasks that exceed graceful timeout
- Fix _shutdown(): await mailbox.close() to release backend resources
- Fix escalate directive: stop failing child before propagating to grandparent
- Fix RedisMailbox.put(): wrap Redis errors in try/except, return False on failure
- Fix retry.py: replace assert with proper raise for last_exc
- Add put_batch() to Mailbox abstraction for single-roundtrip bulk enqueue
- Add RedisMailbox.put_batch() with atomic Lua script for bounded queues
- Add MailboxFullError exception type for semantic backpressure handling
- Add redis>=7.4.0 dependency with public PyPI sources in uv.lock

Tests added (31 total, up from 27):
- test_middleware_on_restart_hook: verifies middleware.on_restart() on supervision restart
- test_ask_propagates_actor_exception: ask() re-raises original exception type
- test_ask_propagates_exception_while_supervised: exception propagates; root actor survives
- test_ask_timeout_late_reply_no_exception: late reply after timeout is silent no-op
- test_actor_backpressure.py: MailboxFullError + dead letter on full mailbox
- test_actor_retry.py: ask_with_retry with exponential backoff
- test_mailbox_redis.py: RedisMailbox put/get/batch/close
- bench_actor_redis.py: RedisMailbox throughput benchmarks
This commit is contained in:
greatmengqi
2026-03-31 10:09:05 +08:00
parent 3e17417122
commit 228a2a66e3
14 changed files with 3156 additions and 2289 deletions
+7 -2
View File
@@ -2,7 +2,6 @@
import asyncio
import time
import statistics
from deerflow.actor import Actor, ActorSystem, Middleware
@@ -17,7 +16,11 @@ class CounterActor(Actor):
self.count = 0
async def on_receive(self, message):
self.count += 1
if message == "inc":
self.count += 1
return self.count
if message == "get":
return self.count
return self.count
@@ -69,6 +72,8 @@ async def bench_tell_throughput(n=100_000):
await ref.tell("inc")
# Wait for all messages to be processed
count = await ref.ask("get", timeout=30.0)
if count != n:
print(f" warning: expected {n} processed, got {count}")
elapsed = time.perf_counter() - start
await system.shutdown()
+273
View File
@@ -0,0 +1,273 @@
"""RedisMailbox benchmark: throughput, latency, concurrency, backpressure."""
import asyncio
import time
import redis.asyncio as redis
from deerflow.actor import Actor, ActorSystem
from deerflow.actor.mailbox_redis import RedisMailbox
class EchoActor(Actor):
    """Actor that replies to every message with the message itself."""

    async def on_receive(self, message):
        # The return value is what ask() callers in the benchmarks await.
        reply = message
        return reply
class CounterActor(Actor):
    """Actor that counts "inc" messages and always replies with the count."""

    async def on_started(self):
        # Counter is (re)initialised whenever the actor is started.
        self.count = 0

    async def on_receive(self, message):
        # Only "inc" mutates state; "get" and any other message just read it.
        if message == "inc":
            self.count += 1
        return self.count
def fmt(n):
    """Format a count compactly for benchmark output.

    Args:
        n: A non-negative number (message count or rate).

    Returns:
        ``"<x.y>M"`` for values that display as a million or more,
        ``"<x>K"`` for thousands, and ``str(n)`` below 1000.

    Values just under one million (>= 999_500) would round up to
    ``"1000K"`` with the ``.0f`` format, so they are promoted to the
    millions form and render as ``"1.0M"`` instead.
    """
    if n >= 1_000:
        if n < 1_000_000:
            thousands = f"{n / 1_000:.0f}"
            # "1000" only appears when rounding pushed us across the boundary.
            if thousands != "1000":
                return f"{thousands}K"
        return f"{n / 1_000_000:.1f}M"
    return str(n)
async def _redis_client():
    """Create a client for the Redis at 127.0.0.1:6379 and ping it before returning.

    The ping makes an unreachable server fail here rather than in the
    middle of a benchmark. Responses are left undecoded.
    """
    conn = redis.Redis(
        host="127.0.0.1",
        port=6379,
        decode_responses=False,
    )
    await conn.ping()
    return conn
async def bench_redis_ask_throughput(n=20_000):
    """Measure sequential ask() round-trips through a RedisMailbox.

    Sends ``n`` "ping" asks one after another to an EchoActor and prints
    the number of messages, the elapsed time and the resulting rate.
    """
    redis_conn = await _redis_client()
    queue_key = "deerflow:bench:redis:ask"
    # Start from an empty queue so leftovers from earlier runs are not consumed.
    await redis_conn.delete(queue_key)

    system = ActorSystem("bench-redis")
    echo_ref = await system.spawn(
        EchoActor,
        "echo",
        mailbox=RedisMailbox(redis_conn.connection_pool, queue_key, brpop_timeout=0.05),
    )

    began = time.perf_counter()
    for _ in range(n):
        await echo_ref.ask("ping", timeout=5.0)
    duration = time.perf_counter() - began

    await system.shutdown()

    per_second = n / duration
    print(f" redis ask throughput: {fmt(n)} msgs in {duration:.2f}s = {fmt(int(per_second))}/s")
async def bench_redis_tell_throughput(n=50_000):
    """Measure fire-and-forget tell() throughput through a RedisMailbox.

    Sends ``n`` "inc" tells to a CounterActor, then asks for the counter.
    The timed span includes that final ask. The difference between ``n``
    and the reported counter is printed as loss.
    """
    redis_conn = await _redis_client()
    queue_key = "deerflow:bench:redis:tell"
    # Start from an empty queue so leftovers from earlier runs are not consumed.
    await redis_conn.delete(queue_key)

    system = ActorSystem("bench-redis")
    counter_ref = await system.spawn(
        CounterActor,
        "counter",
        mailbox=RedisMailbox(redis_conn.connection_pool, queue_key, brpop_timeout=0.05),
    )

    began = time.perf_counter()
    for _ in range(n):
        await counter_ref.tell("inc")
    processed = await counter_ref.ask("get", timeout=30.0)
    duration = time.perf_counter() - began

    await system.shutdown()

    per_second = n / duration
    missing = n - processed
    print(f" redis tell throughput: {fmt(n)} msgs in {duration:.2f}s = {fmt(int(per_second))}/s (loss: {missing})")
async def bench_redis_ask_latency(n=5_000):
    """Measure per-request ask() latency through a RedisMailbox.

    Runs 100 untimed warm-up asks, then times ``n`` individual asks and
    prints the p50, p99 and p99.9 latencies in microseconds.
    """
    redis_conn = await _redis_client()
    queue_key = "deerflow:bench:redis:latency"
    # Start from an empty queue so leftovers from earlier runs are not consumed.
    await redis_conn.delete(queue_key)

    system = ActorSystem("bench-redis")
    echo_ref = await system.spawn(
        EchoActor,
        "echo",
        mailbox=RedisMailbox(redis_conn.connection_pool, queue_key, brpop_timeout=0.05),
    )

    # Warm-up requests are excluded from the samples.
    for _ in range(100):
        await echo_ref.ask("warmup", timeout=5.0)

    samples_us = []
    for _ in range(n):
        sent_at = time.perf_counter()
        await echo_ref.ask("ping", timeout=5.0)
        # perf_counter() is in seconds; convert to microseconds.
        samples_us.append((time.perf_counter() - sent_at) * 1_000_000)

    await system.shutdown()

    ordered = sorted(samples_us)
    size = len(ordered)
    p50 = ordered[size // 2]
    p99 = ordered[int(size * 0.99)]
    p999 = ordered[int(size * 0.999)]
    print(f" redis ask latency: p50={p50:.0f}µs p99={p99:.0f}µs p99.9={p999:.0f}µs")
async def bench_redis_concurrent_actors(num_actors=200, msgs_per_actor=100):
    """Measure throughput with many actors, each on its own Redis queue.

    Spawns ``num_actors`` CounterActors, concurrently sends each one
    ``msgs_per_actor`` "inc" tells followed by a "get" ask, and prints the
    aggregate rate plus the gap between sent and counted messages.
    """
    redis_conn = await _redis_client()
    system = ActorSystem("bench-redis")

    async def send_batch(ref, n):
        for sent in range(n):
            await ref.tell("inc")
            # Yield to the event loop after every 50th tell.
            if (sent + 1) % 50 == 0:
                await asyncio.sleep(0)
        return await ref.ask("get", timeout=30.0)

    refs = []
    for idx in range(num_actors):
        queue_key = f"deerflow:bench:redis:conc:{idx}"
        # Start each queue empty so leftovers from earlier runs are not consumed.
        await redis_conn.delete(queue_key)
        actor_mailbox = RedisMailbox(redis_conn.connection_pool, queue_key, brpop_timeout=0.05)
        actor_ref = await system.spawn(CounterActor, f"a{idx}", mailbox=actor_mailbox)
        refs.append(actor_ref)

    began = time.perf_counter()
    counts = await asyncio.gather(*(send_batch(ref, msgs_per_actor) for ref in refs))
    duration = time.perf_counter() - began

    total = num_actors * msgs_per_actor
    per_second = total / duration
    missing = total - sum(counts)
    print(
        f" redis concurrency: {num_actors} actors × {msgs_per_actor} msgs = {fmt(total)} in {duration:.2f}s = {fmt(int(per_second))}/s (loss: {missing})"
    )
    await system.shutdown()
async def bench_redis_maxlen_backpressure(total_messages=20_000, maxlen=100, ask_timeout=0.01, ask_concurrency=200):
    """Report drop and failure rates when a bounded RedisMailbox is saturated.

    Phase 1 floods a CounterActor with tells and reports how many were
    processed versus how many ended up in the system's dead letters.
    Phase 2 issues ``total_messages`` asks (at most ``ask_concurrency`` in
    flight) with a short timeout and reports successes, timeouts and
    rejections.
    """
    redis_conn = await _redis_client()

    # Phase 1: saturate with tell; dropped messages are counted via dead letters.
    tell_queue = "deerflow:bench:redis:bp:tell"
    await redis_conn.delete(tell_queue)
    system_tell = ActorSystem("bench-redis-bp-tell")
    ref_tell = await system_tell.spawn(
        CounterActor,
        "counter",
        mailbox=RedisMailbox(redis_conn.connection_pool, tell_queue, maxlen=maxlen, brpop_timeout=0.05),
    )
    for _ in range(total_messages):
        await ref_tell.tell("inc")
    # Short pause before querying the counter.
    await asyncio.sleep(0.2)
    processed = await ref_tell.ask("get", timeout=10.0)
    dropped = len(system_tell.dead_letters)
    if total_messages:
        drop_rate = dropped / total_messages
    else:
        drop_rate = 0.0
    print(
        f" redis maxlen tell: maxlen={maxlen}, sent={fmt(total_messages)}, processed={fmt(processed)}, dropped={fmt(dropped)} ({drop_rate:.1%})"
    )
    await system_tell.shutdown()

    # Phase 2: ask timeout rate under pressure.
    ask_queue = "deerflow:bench:redis:bp:ask"
    await redis_conn.delete(ask_queue)
    system_ask = ActorSystem("bench-redis-bp-ask")
    ref_ask = await system_ask.spawn(
        EchoActor,
        "echo",
        mailbox=RedisMailbox(redis_conn.connection_pool, ask_queue, maxlen=maxlen, brpop_timeout=0.05),
    )
    gate = asyncio.Semaphore(ask_concurrency)

    async def one_ask(i):
        # Classify each ask as (succeeded, failure_reason).
        try:
            await ref_ask.ask(i, timeout=ask_timeout)
        except asyncio.TimeoutError:
            return False, "timeout"
        except Exception:  # MailboxFullError or other rejection
            return False, "rejected"
        return True, None

    async def one_ask_limited(i):
        async with gate:
            return await one_ask(i)

    outcomes = await asyncio.gather(*(one_ask_limited(i) for i in range(total_messages)))
    ok = len([1 for succeeded, _ in outcomes if succeeded])
    timeout_count = len([1 for _, reason in outcomes if reason == "timeout"])
    rejected_count = len([1 for _, reason in outcomes if reason == "rejected"])
    if total_messages:
        fail_rate = (total_messages - ok) / total_messages
    else:
        fail_rate = 0.0
    print(
        f" redis maxlen ask: maxlen={maxlen}, total={fmt(total_messages)}, ok={fmt(ok)}, "
        f"timeout={fmt(timeout_count)}, rejected={fmt(rejected_count)} (fail: {fail_rate:.1%}), "
        f"ask_timeout={ask_timeout}s, concurrency={ask_concurrency}"
    )
    await system_ask.shutdown()
async def bench_redis_put_batch(n=50_000, batch_size=100):
    """put_batch: N messages in ceil(N/batch_size) round-trips instead of N.

    Envelopes are pushed straight into the mailbox with ``put_batch`` and
    the CounterActor is then asked for its count, so both the enqueue-only
    rate and the end-to-end rate are reported.

    Args:
        n: Total number of "inc" messages to enqueue.
        batch_size: Maximum number of envelopes per ``put_batch`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    from deerflow.actor.ref import _Envelope

    client = await _redis_client()
    queue = "deerflow:bench:redis:batch"
    await client.delete(queue)
    mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
    system = ActorSystem("bench-redis-batch")
    ref = await system.spawn(CounterActor, "counter", mailbox=mailbox)

    # Include a final short batch when n is not a multiple of batch_size so
    # exactly n messages are sent and the reported rate and loss stay honest.
    batches = [
        [_Envelope(payload="inc") for _ in range(min(batch_size, n - offset))]
        for offset in range(0, n, batch_size)
    ]

    t0 = time.perf_counter()
    for batch in batches:
        await mailbox.put_batch(batch)
    enqueue_elapsed = time.perf_counter() - t0

    count = await ref.ask("get", timeout=60.0)
    total_elapsed = time.perf_counter() - t0

    loss = n - count
    # Guard against a zero interval when there is nothing (or almost nothing) to send.
    enqueue_rate = n / enqueue_elapsed if enqueue_elapsed else 0.0
    total_rate = n / total_elapsed if total_elapsed else 0.0
    print(
        f" redis put_batch push: {fmt(n)} msgs in {enqueue_elapsed:.3f}s = {fmt(int(enqueue_rate))}/s "
        f"(batch={batch_size}, round-trips={len(batches)})"
    )
    print(
        f" redis put_batch total: end-to-end {total_elapsed:.2f}s = {fmt(int(total_rate))}/s "
        f"(consume bottleneck, loss={loss})"
    )
    await system.shutdown()
async def main():
    """Run every RedisMailbox benchmark in sequence between two banners."""
    rule = "=" * 72
    print(rule)
    print(" RedisMailbox Benchmarks")
    print(rule)
    print()

    # Order matters only for the printed report; each benchmark uses its own queue.
    benchmarks = (
        bench_redis_tell_throughput,
        bench_redis_ask_throughput,
        bench_redis_ask_latency,
        bench_redis_concurrent_actors,
        bench_redis_put_batch,
        bench_redis_maxlen_backpressure,
    )
    for run_benchmark in benchmarks:
        await run_benchmark()

    print()
    print(rule)
    print(" Done")
    print(rule)


if __name__ == "__main__":
    asyncio.run(main())
+92
View File
@@ -440,3 +440,95 @@ class TestMiddleware:
# tell goes through middleware too
assert any("before:" in entry for entry in mw.log) is False
await system.shutdown()
@pytest.mark.anyio
async def test_middleware_on_restart_hook(self):
    """A child's middleware receives on_restart with the error after the child crashes."""

    class RestartTrackingMiddleware(Middleware):
        """Records every error passed to on_restart."""

        def __init__(self):
            self.restart_errors: list[Exception] = []

        async def on_restart(self, actor_ref, error):
            self.restart_errors.append(error)

    tracker = RestartTrackingMiddleware()

    class ChildSpawningParent(Actor):
        """Spawns a CrashActor child with the tracking middleware on request."""

        async def on_receive(self, message):
            if message != "spawn":
                return None
            return await self.context.spawn(CrashActor, "child", middlewares=[tracker])

    system = ActorSystem("test")
    parent_ref = await system.spawn(ChildSpawningParent, "parent")
    child_ref = await parent_ref.ask("spawn")

    # Crash the child; the ValueError reaching the asker is expected and ignored.
    try:
        await child_ref.ask("crash")
    except ValueError:
        pass

    # Give the restart a moment to happen before inspecting the middleware.
    await asyncio.sleep(0.1)

    recorded = tracker.restart_errors
    assert len(recorded) == 1
    assert isinstance(recorded[0], ValueError)
    await system.shutdown()
class TestAskErrorPropagation:
    """How ask() surfaces handler exceptions and late replies to the caller."""

    @pytest.mark.anyio
    async def test_ask_propagates_actor_exception(self):
        """ask() raises the same exception type that on_receive raised."""

        class BoomActor(Actor):
            async def on_receive(self, message):
                raise ValueError("intentional crash")

        system = ActorSystem("test")
        boom_ref = await system.spawn(BoomActor, "boom")

        with pytest.raises(ValueError, match="intentional crash"):
            await boom_ref.ask("trigger")

        await system.shutdown()

    @pytest.mark.anyio
    async def test_ask_propagates_exception_while_supervised(self):
        """The caller sees the exception and the actor still answers afterwards."""

        class SometimesCrashActor(Actor):
            async def on_receive(self, message):
                if message != "crash":
                    return "ok"
                raise RuntimeError("supervised crash")

        system = ActorSystem("test")
        actor_ref = await system.spawn(SometimesCrashActor, "sca")

        with pytest.raises(RuntimeError, match="supervised crash"):
            await actor_ref.ask("crash")

        # The root actor keeps running after a crash, so a follow-up ask succeeds.
        follow_up = await actor_ref.ask("hello", timeout=2.0)
        assert follow_up == "ok"

        await system.shutdown()

    @pytest.mark.anyio
    async def test_ask_timeout_late_reply_no_exception(self):
        """A reply produced after ask() timed out does not break later asks."""

        class SlowActor(Actor):
            async def on_receive(self, message):
                await asyncio.sleep(0.3)
                return "late"

        system = ActorSystem("test")
        slow_ref = await system.spawn(SlowActor, "slow")

        with pytest.raises(asyncio.TimeoutError):
            await slow_ref.ask("go", timeout=0.05)

        # Wait past the handler's 0.3s sleep so the late reply has been produced.
        await asyncio.sleep(0.4)

        # The actor is still alive and a fresh ask gets its answer.
        assert slow_ref.is_alive
        answer = await slow_ref.ask("go", timeout=2.0)
        assert answer == "late"

        await system.shutdown()
+89
View File
@@ -0,0 +1,89 @@
import asyncio
import pytest
from deerflow.actor import Actor, ActorSystem, MailboxFullError
from deerflow.actor.mailbox import BACKPRESSURE_BLOCK, BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL, MemoryMailbox
class SlowActor(Actor):
    """Counter actor that takes 10 ms per 'inc' so its mailbox can fill up."""

    async def on_started(self):
        self.count = 0

    async def on_receive(self, message):
        if message == 'get':
            return self.count
        if message == 'inc':
            # Slow handling keeps messages queued while the tests overfill the mailbox.
            await asyncio.sleep(0.01)
            self.count += 1
        return None
@pytest.mark.anyio
async def test_memory_mailbox_drop_new_policy_drops_tell_to_dead_letters():
    """With drop_new and capacity 1, a burst of tells loses messages to dead letters."""
    system = ActorSystem('bp')
    bounded = MemoryMailbox(1, backpressure_policy=BACKPRESSURE_DROP_NEW)
    slow_ref = await system.spawn(SlowActor, 'slow', mailbox=bounded)

    # Send far more than the mailbox holds, faster than the actor handles them.
    sent = 0
    while sent < 20:
        await slow_ref.tell('inc')
        sent += 1

    await asyncio.sleep(0.4)
    handled = await slow_ref.ask('get', timeout=2.0)
    await system.shutdown()

    # Some messages should be dropped under drop_new.
    assert handled < 20
    assert len(system.dead_letters) > 0
@pytest.mark.anyio
async def test_memory_mailbox_fail_policy_rejects_ask_when_full():
    """With the fail policy, an ask against a full mailbox raises MailboxFullError."""
    system = ActorSystem('bp')
    bounded = MemoryMailbox(1, backpressure_policy=BACKPRESSURE_FAIL)
    slow_ref = await system.spawn(SlowActor, 'slow', mailbox=bounded)

    # Occupy the queue with a tell first.
    await slow_ref.tell('inc')

    # Keep asking (up to 30 times) until one attempt is rejected; timeouts are tolerated.
    rejected = False
    attempts = 0
    while attempts < 30 and not rejected:
        attempts += 1
        try:
            await slow_ref.ask('inc', timeout=0.02)
        except MailboxFullError:
            rejected = True
        except asyncio.TimeoutError:
            continue

    await system.shutdown()
    assert rejected
@pytest.mark.anyio
async def test_memory_mailbox_block_policy_eventually_accepts():
    """With the block policy, every tell is eventually delivered despite capacity 1."""
    system = ActorSystem('bp')
    bounded = MemoryMailbox(1, backpressure_policy=BACKPRESSURE_BLOCK)
    slow_ref = await system.spawn(SlowActor, 'slow', mailbox=bounded)

    expected = 10
    sent = 0
    while sent < expected:
        await slow_ref.tell('inc')
        sent += 1

    await asyncio.sleep(0.25)
    handled = await slow_ref.ask('get', timeout=2.0)
    await system.shutdown()

    # Block policy should avoid dropping on the tell path.
    assert handled == 10
+62
View File
@@ -0,0 +1,62 @@
import asyncio
import pytest
from deerflow.actor import Actor, ActorSystem, IdempotentActorMixin, RetryEnvelope, ask_with_retry
class FlakyIdempotentActor(IdempotentActorMixin, Actor):
    """Actor that routes messages through handle_idempotent and counts handler runs.

    The very first handler invocation is slow (20 ms) when the payload is
    'flaky', which lets the tests force an ask() timeout.
    """

    async def on_started(self):
        self.calls = 0

    async def on_receive(self, message):
        outcome = await self.handle_idempotent(message, self._handle)
        return outcome

    async def _handle(self, payload):
        self.calls += 1
        slow_first_call = payload == 'flaky' and self.calls == 1
        if not slow_first_call:
            return f"ok:{payload}"
        await asyncio.sleep(0.02)
        return 'late'
@pytest.mark.anyio
async def test_ask_with_retry_timeout_raises():
    """ask_with_retry raises TimeoutError when the attempts time out, and the actor stays alive."""
    system = ActorSystem('retry')
    actor_ref = await system.spawn(FlakyIdempotentActor, 'a')

    retry_options = dict(
        timeout=0.005,
        max_attempts=3,
        base_backoff_s=0.001,
        max_backoff_s=0.005,
        jitter_ratio=0.0,
        idempotency_key='k1',
    )
    with pytest.raises(asyncio.TimeoutError):
        await ask_with_retry(actor_ref, 'flaky', **retry_options)

    # The helper retries on timeout, but if each attempt times out it should raise.
    assert actor_ref.is_alive
    await system.shutdown()
@pytest.mark.anyio
async def test_idempotent_envelope_returns_cached_result():
    """Two envelopes with the same idempotency key get one handler run and equal replies."""
    system = ActorSystem('retry')
    actor_ref = await system.spawn(FlakyIdempotentActor, 'a')

    first = RetryEnvelope.wrap('x', idempotency_key='same-key')
    second = RetryEnvelope.wrap('x', idempotency_key='same-key', attempt=2, max_attempts=3)

    first_reply = await actor_ref.ask(first, timeout=1.0)
    second_reply = await actor_ref.ask(second, timeout=1.0)

    assert first_reply == 'ok:x'
    assert second_reply == 'ok:x'

    # The handler should run once due to the idempotency cache; the test
    # reaches into the cell to read the actor's call counter.
    assert actor_ref._cell.actor.calls == 1
    await system.shutdown()
+83
View File
@@ -0,0 +1,83 @@
import asyncio
import pytest
redis = pytest.importorskip("redis.asyncio")
from deerflow.actor.mailbox_redis import RedisMailbox
from deerflow.actor.ref import _Envelope, _Stop
pytestmark = pytest.mark.anyio
async def _make_mailbox(queue_name: str, *, maxlen: int = 0) -> RedisMailbox:
    """Build a RedisMailbox on an empty queue, skipping the test if Redis is down.

    Args:
        queue_name: Redis key used as the mailbox queue; it is deleted first
            so each test starts from an empty queue.
        maxlen: Passed through to RedisMailbox as its ``maxlen``.

    Returns:
        A RedisMailbox bound to the local Redis at 127.0.0.1:6379.

    The module already treats Redis as optional via ``importorskip``; an
    unreachable server is handled the same way (skip, not error), and the
    client is closed before skipping so no connection pool is left open.
    """
    client = redis.Redis(host="127.0.0.1", port=6379, decode_responses=False)
    try:
        await client.ping()
    except (redis.ConnectionError, redis.TimeoutError) as exc:
        await client.aclose()
        pytest.skip(f"Redis server not reachable at 127.0.0.1:6379: {exc}")
    await client.delete(queue_name)
    mailbox = RedisMailbox(client.connection_pool, queue_name, maxlen=maxlen, brpop_timeout=0.2)
    return mailbox
async def test_roundtrip_envelope_and_stop():
    """An _Envelope and a _Stop both survive a put()/get() round-trip through Redis."""
    mailbox = await _make_mailbox("deerflow:test:redis-mailbox:roundtrip")
    try:
        outgoing = _Envelope(payload={"k": "v"}, correlation_id="c1", reply_to="sysA")
        accepted = await mailbox.put(outgoing)
        assert accepted is True

        received = await mailbox.get()
        assert isinstance(received, _Envelope)
        # Every field set on the envelope must come back unchanged.
        assert received.payload == {"k": "v"}
        assert received.correlation_id == "c1"
        assert received.reply_to == "sysA"

        accepted = await mailbox.put(_Stop())
        assert accepted is True

        received_stop = await mailbox.get()
        assert isinstance(received_stop, _Stop)
    finally:
        await mailbox.close()
async def test_bounded_queue_rejects_when_full():
    """With maxlen=1 the first put() is accepted and the second returns False."""
    mailbox = await _make_mailbox("deerflow:test:redis-mailbox:bounded", maxlen=1)
    try:
        first_accepted = await mailbox.put(_Envelope("m1"))
        assert first_accepted is True

        second_accepted = await mailbox.put(_Envelope("m2"))
        assert second_accepted is False
    finally:
        await mailbox.close()
async def test_put_nowait_and_get_nowait_contract():
    """put_nowait() reports False and get_nowait() raises on the Redis mailbox."""
    mailbox = await _make_mailbox("deerflow:test:redis-mailbox:nowait")
    try:
        accepted = mailbox.put_nowait(_Envelope("x"))
        assert accepted is False

        expected_message = "does not support synchronous get_nowait"
        with pytest.raises(Exception, match=expected_message):
            mailbox.get_nowait()
    finally:
        await mailbox.close()
async def test_system_enqueue_fallback_with_async_mailbox():
    """An actor backed by the Redis mailbox answers ask() end to end."""
    from deerflow.actor import Actor, ActorSystem

    class EchoActor(Actor):
        """Replies with whatever it receives."""

        async def on_receive(self, message):
            return message

    mailbox = await _make_mailbox("deerflow:test:redis-mailbox:system-fallback")
    system = ActorSystem("redis-test")
    echo_ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
    try:
        # This exercises the _ActorCell.enqueue fallback path:
        # put_nowait() -> False, then await put() -> True
        reply = await echo_ref.ask("hello", timeout=3.0)
        assert reply == "hello"
    finally:
        await system.shutdown()