refactor(runtime): restructure runs module with new execution architecture

Major refactoring of deerflow/runtime/:
- runs/callbacks/ - new callback system (builder, events, title, tokens)
- runs/internal/ - execution internals (executor, supervisor, stream_logic, registry)
- runs/internal/execution/ - execution artifacts and events handling
- runs/facade.py - high-level run facade
- runs/observer.py - run observation protocol
- runs/types.py - type definitions
- runs/store/ - simplified store interfaces (create, delete, query, event)

Refactor stream_bridge/:
- Replace old providers with contract.py and exceptions.py
- Remove async_provider.py, base.py, memory.py

Add documentation:
- README.md and README_zh.md for runtime module

Remove deprecated:
- manager.py moved to internal/
- worker.py, schemas.py
- user_context.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
rayhpeng
2026-04-22 11:28:01 +08:00
parent 39a575617b
commit 9d0a42c1fb
43 changed files with 3928 additions and 1192 deletions
@@ -1,21 +1,47 @@
"""Stream bridge — decouples agent workers from SSE endpoints.
"""Stream bridge public surface.
A ``StreamBridge`` sits between the background task that runs an agent
(producer) and the HTTP endpoint that pushes Server-Sent Events to
the client (consumer). This package provides an abstract protocol
(:class:`StreamBridge`) plus a default in-memory implementation backed
by :mod:`asyncio.Queue`.
The harness package owns the stream abstraction and event semantics.
Concrete backends are intentionally not part of the public API here so
applications can inject infra-specific implementations.
"""
from .async_provider import make_stream_bridge
from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent
from .memory import MemoryStreamBridge
from .contract import (
CANCELLED_SENTINEL,
END_SENTINEL,
HEARTBEAT_SENTINEL,
JSONScalar,
JSONValue,
TERMINAL_STATES,
ResumeResult,
StreamBridge,
StreamEvent,
StreamStatus,
)
from .exceptions import (
BridgeClosedError,
StreamBridgeError,
StreamCapacityExceededError,
StreamNotFoundError,
StreamTerminatedError,
)
__all__ = [
# Sentinels
"CANCELLED_SENTINEL",
"END_SENTINEL",
"HEARTBEAT_SENTINEL",
"MemoryStreamBridge",
# Types
"JSONScalar",
"JSONValue",
"ResumeResult",
"StreamBridge",
"StreamEvent",
"make_stream_bridge",
"StreamStatus",
"TERMINAL_STATES",
# Exceptions
"BridgeClosedError",
"StreamBridgeError",
"StreamCapacityExceededError",
"StreamNotFoundError",
"StreamTerminatedError",
]
@@ -1,52 +0,0 @@
"""Async stream bridge factory.
Provides an **async context manager** aligned with
:func:`deerflow.runtime.checkpointer.async_provider.make_checkpointer`.
Usage (e.g. FastAPI lifespan)::
from deerflow.runtime.stream_bridge import make_stream_bridge
async with make_stream_bridge() as bridge:
app.state.stream_bridge = bridge
"""
from __future__ import annotations
import contextlib
import logging
from collections.abc import AsyncIterator
from deerflow.config.stream_bridge_config import get_stream_bridge_config
from .base import StreamBridge
logger = logging.getLogger(__name__)
@contextlib.asynccontextmanager
async def make_stream_bridge(config=None) -> AsyncIterator[StreamBridge]:
    """Yield a :class:`StreamBridge` for the lifetime of the ``async with`` block.

    When *config* is omitted, the globally registered stream bridge
    configuration is looked up. If that is missing as well, an in-memory
    bridge with ``queue_maxsize=256`` is created.

    Args:
        config: Optional configuration object exposing ``type`` and, for the
            memory backend, ``queue_maxsize``.

    Yields:
        The constructed bridge. It is closed when the context exits, even if
        the body raised.

    Raises:
        NotImplementedError: If ``config.type`` is ``"redis"``.
        ValueError: If ``config.type`` names any other unknown backend.
    """
    settings = get_stream_bridge_config() if config is None else config
    backend = "memory" if settings is None else settings.type

    # Reject unsupported backends up front so the happy path stays flat.
    if backend == "redis":
        raise NotImplementedError("Redis stream bridge planned for Phase 2")
    if backend != "memory":
        raise ValueError(f"Unknown stream bridge type: {backend!r}")

    # Function-scope import: the memory backend is only imported on this path.
    from deerflow.runtime.stream_bridge.memory import MemoryStreamBridge

    capacity = 256 if settings is None else settings.queue_maxsize
    bridge = MemoryStreamBridge(queue_maxsize=capacity)
    logger.info("Stream bridge initialised: memory (queue_maxsize=%d)", capacity)
    try:
        yield bridge
    finally:
        await bridge.close()
@@ -1,72 +0,0 @@
"""Abstract stream bridge protocol.
StreamBridge decouples agent workers (producers) from SSE endpoints
(consumers), aligning with LangGraph Platform's Queue + StreamManager
architecture.
"""
from __future__ import annotations
import abc
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Any
@dataclass(frozen=True)
class StreamEvent:
    """Immutable record describing one event carried by a stream bridge.

    Attributes:
        id: Event identifier that increases monotonically. It is emitted as
            the SSE ``id:`` field so a client can reconnect using
            ``Last-Event-ID``.
        event: Name of the SSE event, for example ``"metadata"``,
            ``"updates"``, ``"events"``, ``"error"`` or ``"end"``.
        data: Payload of the event; must be JSON-serialisable.
    """

    id: str
    event: str
    data: Any
# Marker events with an empty ``id`` and reserved ``__...__`` event names.
# HEARTBEAT_SENTINEL: yielded by ``subscribe`` when no event arrives within
# the heartbeat interval.
HEARTBEAT_SENTINEL = StreamEvent(id="", event="__heartbeat__", data=None)
# END_SENTINEL: yielded by ``subscribe`` once the producer has called
# ``publish_end``.
END_SENTINEL = StreamEvent(id="", event="__end__", data=None)
class StreamBridge(abc.ABC):
    """Interface every stream bridge backend must implement.

    Producers push events for a run through :meth:`publish` and finish with
    :meth:`publish_end`; consumers read them back through :meth:`subscribe`.
    """

    @abc.abstractmethod
    async def publish(self, run_id: str, event: str, data: Any) -> None:
        """Producer side: enqueue one *event* carrying *data* for *run_id*."""

    @abc.abstractmethod
    async def publish_end(self, run_id: str) -> None:
        """Producer side: declare that *run_id* will emit no further events."""

    @abc.abstractmethod
    def subscribe(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
        heartbeat_interval: float = 15.0,
    ) -> AsyncIterator[StreamEvent]:
        """Consumer side: return an async iterator over the events of *run_id*.

        The iterator yields :data:`HEARTBEAT_SENTINEL` whenever
        *heartbeat_interval* seconds pass without a new event, and yields
        :data:`END_SENTINEL` after the producer has called
        :meth:`publish_end`.
        """

    @abc.abstractmethod
    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
        """Free whatever the bridge holds for *run_id*.

        A positive *delay* asks the implementation to wait that long before
        releasing, so late subscribers can still drain remaining events.
        """

    async def close(self) -> None:
        """Release backend resources; the base implementation does nothing."""
@@ -0,0 +1,112 @@
"""Stream bridge contract and public types."""
from __future__ import annotations
import abc
from collections.abc import AsyncIterator
from dataclasses import dataclass
from enum import Enum
from typing import Literal
# A JSON leaf value: null, boolean, number or string.
type JSONScalar = None | bool | int | float | str
# Any JSON-representable value: a scalar, or a list / string-keyed dict whose
# members are themselves JSON values (the alias is recursive).
type JSONValue = JSONScalar | list["JSONValue"] | dict[str, "JSONValue"]
class StreamStatus(str, Enum):
    """Lifecycle state of a stream.

    Members are also ``str`` instances, so each compares equal to its
    lowercase value (``StreamStatus.ENDED == "ended"``).
    """

    ACTIVE = "active"
    # ENDED, CANCELLED and ERRORED are the members collected in this
    # module's ``TERMINAL_STATES``.
    ENDED = "ended"
    CANCELLED = "cancelled"
    ERRORED = "errored"
    # Not part of ``TERMINAL_STATES``.
    CLOSED = "closed"
# Statuses treated as terminal for a stream; ACTIVE and CLOSED are not included.
TERMINAL_STATES = frozenset(
    (
        StreamStatus.ENDED,
        StreamStatus.CANCELLED,
        StreamStatus.ERRORED,
    )
)
@dataclass(frozen=True, slots=True)
class StreamEvent:
"""Single stream event."""
id: str
event: str
data: JSONValue
@dataclass(frozen=True, slots=True)
class ResumeResult:
"""Result of resolving Last-Event-ID."""
next_offset: int
status: Literal["fresh", "resumed", "evicted", "invalid", "unknown"]
gap_count: int = 0
# Marker events: all carry an empty ``id``, a reserved ``__...__`` event name
# and no payload.
# NOTE(review): how each marker is delivered is backend-defined; presumably
# heartbeat = keep-alive while idle, end = normal completion, cancelled =
# run cancellation. Confirm against the concrete bridge.
HEARTBEAT_SENTINEL = StreamEvent(id="", event="__heartbeat__", data=None)
END_SENTINEL = StreamEvent(id="", event="__end__", data=None)
CANCELLED_SENTINEL = StreamEvent(id="", event="__cancelled__", data=None)
class StreamBridge(abc.ABC):
    """Abstract base for stream bridges.

    ``StreamBridge`` defines runtime stream semantics, not storage semantics.
    Concrete backends may live outside the harness package and be injected by
    the application composition root.

    Important boundary rules:

    - Terminal run events (``end``/``cancel``/``error``) are real replayable
      events and belong to run-level semantics.
    - ``close()`` is bridge-level shutdown and must not be treated as a run
      cancellation signal.
    """

    @abc.abstractmethod
    async def publish(self, run_id: str, event: str, data: JSONValue) -> str:
        """Enqueue a single event for *run_id* and return its event ID."""

    @abc.abstractmethod
    async def publish_end(self, run_id: str) -> str:
        """Signal that no more events will be produced for *run_id*.

        Returns:
            A backend-defined string identifier for the end-of-stream signal.
        """

    async def publish_terminal(
        self,
        run_id: str,
        kind: StreamStatus,
        data: JSONValue = None,
    ) -> str:
        """Publish a terminal event (end/cancel/error) for *run_id*.

        The base implementation has no way to encode *kind* or *data*: it
        falls back to :meth:`publish_end` and ignores both. Backends that
        distinguish terminal kinds should override this method.

        Args:
            run_id: Run whose stream is being terminated.
            kind: Terminal status to publish (unused by the default).
            data: Optional payload for the terminal event (unused by the
                default).

        Returns:
            The identifier returned by :meth:`publish_end`.
        """
        # Forward the ID instead of discarding it, so callers of the default
        # implementation get the same identifier publish_end() produced.
        return await self.publish_end(run_id)

    @abc.abstractmethod
    def subscribe(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
        heartbeat_interval: float = 15.0,
    ) -> AsyncIterator[StreamEvent]:
        """Yield replayable stream events for *run_id*.

        Args:
            run_id: Run whose events are consumed.
            last_event_id: Optional ``Last-Event-ID`` supplied by a
                reconnecting client.
            heartbeat_interval: Interval in seconds used for heartbeats
                (default ``15.0``).
        """

    @abc.abstractmethod
    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
        """Release resources associated with *run_id*."""

    async def cancel(self, run_id: str) -> None:
        """Cancel a run's stream.

        The default publishes a ``StreamStatus.CANCELLED`` terminal event via
        :meth:`publish_terminal` and discards the returned identifier.
        """
        await self.publish_terminal(run_id, StreamStatus.CANCELLED)

    async def mark_awaiting_input(self, run_id: str) -> None:
        """Mark stream as awaiting human input. Default is a no-op."""

    async def start(self) -> None:
        """Start background tasks, if needed. Default is a no-op."""

    async def close(self) -> None:
        """Release bridge-level backend resources. Default is a no-op."""
@@ -0,0 +1,23 @@
"""Stream bridge exceptions."""
from __future__ import annotations
class StreamBridgeError(Exception):
    """Common ancestor of every stream bridge exception."""


class BridgeClosedError(StreamBridgeError):
    """An operation was attempted on a bridge that is already closed."""


class StreamCapacityExceededError(StreamBridgeError):
    """``max_active_streams`` has been reached and nothing can be evicted."""


class StreamTerminatedError(StreamBridgeError):
    """An event was published to a stream that is already terminal."""


class StreamNotFoundError(StreamBridgeError):
    """The referenced stream does not exist."""
@@ -1,133 +0,0 @@
"""In-memory stream bridge backed by an in-process event log."""
from __future__ import annotations
import asyncio
import logging
import time
from collections.abc import AsyncIterator
from dataclasses import dataclass, field
from typing import Any
from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent
logger = logging.getLogger(__name__)
@dataclass
class _RunStream:
    """Mutable per-run state kept by the in-memory bridge."""

    # Retained events, oldest first.
    events: list[StreamEvent] = field(default_factory=list)
    # Guards this state; waiters are notified on publish and on end.
    condition: asyncio.Condition = field(default_factory=asyncio.Condition)
    # Set once the producer has signalled the end of the run.
    ended: bool = False
    # Absolute offset of ``events[0]``; grows as old events are trimmed.
    start_offset: int = 0
class MemoryStreamBridge(StreamBridge):
    """Per-run in-memory event log implementation.

    Each run retains at most ``queue_maxsize`` of its most recent events, so
    late subscribers and reconnecting clients can replay the retained events
    from ``Last-Event-ID``. Retention is bounded by event count, not by time.
    """

    def __init__(self, *, queue_maxsize: int = 256) -> None:
        """Create an empty bridge.

        Args:
            queue_maxsize: Maximum number of events retained per run; the
                oldest events are dropped once this is exceeded.
        """
        self._maxsize = queue_maxsize
        # run_id -> retained event log plus its condition variable.
        self._streams: dict[str, _RunStream] = {}
        # run_id -> number of event IDs issued so far.
        self._counters: dict[str, int] = {}

    # -- helpers ---------------------------------------------------------------

    def _get_or_create_stream(self, run_id: str) -> _RunStream:
        """Return the stream for *run_id*, creating it and its counter on first use."""
        if run_id not in self._streams:
            self._streams[run_id] = _RunStream()
            self._counters[run_id] = 0
        return self._streams[run_id]

    def _next_id(self, run_id: str) -> str:
        """Return the next event ID for *run_id* as ``"<epoch-ms>-<seq>"``.

        ``seq`` is zero-based: it is the number of IDs issued for the run
        before this call.
        """
        self._counters[run_id] = self._counters.get(run_id, 0) + 1
        ts = int(time.time() * 1000)
        seq = self._counters[run_id] - 1
        return f"{ts}-{seq}"

    def _resolve_start_offset(self, stream: _RunStream, last_event_id: str | None) -> int:
        """Translate *last_event_id* into the absolute offset to resume from.

        Returns the offset just past the matching retained event. When
        *last_event_id* is ``None`` or is not found among the retained
        events, returns the offset of the earliest retained event.
        """
        if last_event_id is None:
            return stream.start_offset

        for index, entry in enumerate(stream.events):
            if entry.id == last_event_id:
                return stream.start_offset + index + 1

        # Unknown ID: only worth a warning when there was something to search.
        if stream.events:
            logger.warning(
                "last_event_id=%s not found in retained buffer; replaying from earliest retained event",
                last_event_id,
            )
        return stream.start_offset

    # -- StreamBridge API ------------------------------------------------------

    async def publish(self, run_id: str, event: str, data: Any) -> None:
        """Append an event to the log of *run_id* and wake all waiting subscribers."""
        stream = self._get_or_create_stream(run_id)
        entry = StreamEvent(id=self._next_id(run_id), event=event, data=data)
        async with stream.condition:
            stream.events.append(entry)
            if len(stream.events) > self._maxsize:
                # Drop the oldest events and advance start_offset by the same
                # amount, so absolute offsets held by subscribers stay valid.
                overflow = len(stream.events) - self._maxsize
                del stream.events[:overflow]
                stream.start_offset += overflow
            stream.condition.notify_all()

    async def publish_end(self, run_id: str) -> None:
        """Mark the stream of *run_id* as ended and wake all waiting subscribers."""
        stream = self._get_or_create_stream(run_id)
        async with stream.condition:
            stream.ended = True
            stream.condition.notify_all()

    async def subscribe(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
        heartbeat_interval: float = 15.0,
    ) -> AsyncIterator[StreamEvent]:
        """Yield events for *run_id*, replaying retained ones first.

        Starts after *last_event_id* when it is found in the retained buffer,
        otherwise from the earliest retained event. Yields
        ``HEARTBEAT_SENTINEL`` when no notification arrives within
        *heartbeat_interval* seconds, and yields ``END_SENTINEL`` then stops
        once the stream has ended and all retained events were delivered.
        """
        stream = self._get_or_create_stream(run_id)
        async with stream.condition:
            next_offset = self._resolve_start_offset(stream, last_event_id)

        while True:
            async with stream.condition:
                # Events this subscriber had not read yet were trimmed:
                # skip forward to the earliest one still retained.
                if next_offset < stream.start_offset:
                    logger.warning(
                        "subscriber for run %s fell behind retained buffer; resuming from offset %s",
                        run_id,
                        stream.start_offset,
                    )
                    next_offset = stream.start_offset

                # Convert the absolute offset into an index into the list.
                local_index = next_offset - stream.start_offset
                if 0 <= local_index < len(stream.events):
                    entry = stream.events[local_index]
                    next_offset += 1
                elif stream.ended:
                    entry = END_SENTINEL
                else:
                    try:
                        await asyncio.wait_for(stream.condition.wait(), timeout=heartbeat_interval)
                    except TimeoutError:
                        # Nothing was published in time: emit a heartbeat.
                        entry = HEARTBEAT_SENTINEL
                    else:
                        # Notified: loop again to re-read the stream state.
                        continue

            # Yield after leaving the ``async with`` block, i.e. without
            # holding the condition's lock.
            if entry is END_SENTINEL:
                yield END_SENTINEL
                return
            yield entry

    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
        """Forget the stream and ID counter of *run_id*, after sleeping *delay* seconds if positive."""
        if delay > 0:
            await asyncio.sleep(delay)
        self._streams.pop(run_id, None)
        self._counters.pop(run_id, None)

    async def close(self) -> None:
        """Drop every stream and ID counter held by this bridge."""
        self._streams.clear()
        self._counters.clear()