mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-23 08:25:57 +00:00
feat(auth): authentication module with multi-tenant isolation (RFC-001)
Introduce an always-on auth layer with auto-created admin on first boot, multi-tenant isolation for threads/stores, and a full setup/login flow. Backend - JWT access tokens with `ver` field for stale-token rejection; bump on password/email change - Password hashing, HttpOnly+Secure cookies (Secure derived from request scheme at runtime) - CSRF middleware covering both REST and LangGraph routes - IP-based login rate limiting (5 attempts / 5-min lockout) with bounded dict growth and X-Forwarded-For bypass fix - Multi-worker-safe admin auto-creation (single DB write, WAL once) - needs_setup + token_version on User model; SQLite schema migration - Thread/store isolation by owner; orphan thread migration on first admin registration - thread_id validated as UUID to prevent log injection - CLI tool to reset admin password - Decorator-based authz module extracted from auth core Frontend - Login and setup pages with SSR guard for needs_setup flow - Account settings page (change password / email) - AuthProvider + route guards; skips redirect when no users registered - i18n (en-US / zh-CN) for auth surfaces - Typed auth API client; parseAuthError unwraps FastAPI detail envelope Infra & tooling - Unified `serve.sh` with gateway mode + auto dep install - Public PyPI uv.toml pin for CI compatibility - Regenerated uv.lock with public index Tests - HTTP vs HTTPS cookie security tests - Auth middleware, rate limiter, CSRF, setup flow coverage
This commit is contained in:
@@ -8,6 +8,14 @@ from deerflow.subagents import get_available_subagent_names
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_enabled_skills():
    """Load the enabled skills as a list, degrading to an empty list on error.

    Calls ``load_skills(enabled_only=True)`` and materialises the result.
    Any exception raised while loading is logged with its traceback and
    swallowed, so callers always receive a list.

    Returns:
        A list of whatever ``load_skills`` yields, or ``[]`` if loading failed.
    """
    enabled = []
    try:
        enabled = list(load_skills(enabled_only=True))
    except Exception:
        # Best effort: a broken skills directory must not break prompt building.
        logger.exception("Failed to load enabled skills for prompt injection")
    return enabled
|
||||
|
||||
|
||||
def _build_subagent_section(max_concurrent: int) -> str:
|
||||
"""Build the subagent system prompt section with dynamic concurrency limit.
|
||||
|
||||
@@ -386,7 +394,7 @@ def get_skills_prompt_section(available_skills: set[str] | None = None) -> str:
|
||||
Returns the <skill_system>...</skill_system> block listing all enabled skills,
|
||||
suitable for injection into any agent's system prompt.
|
||||
"""
|
||||
skills = load_skills(enabled_only=True)
|
||||
skills = _get_enabled_skills()
|
||||
|
||||
try:
|
||||
from deerflow.config import get_app_config
|
||||
@@ -450,7 +458,7 @@ def get_deferred_tools_prompt_section() -> str:
|
||||
|
||||
if not get_app_config().tool_search.enabled:
|
||||
return ""
|
||||
except FileNotFoundError:
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
registry = get_deferred_registry()
|
||||
|
||||
@@ -246,6 +246,10 @@ def format_memory_for_injection(memory_data: dict[str, Any], max_tokens: int = 2
|
||||
if earlier.get("summary"):
|
||||
history_sections.append(f"Earlier: {earlier['summary']}")
|
||||
|
||||
background = history_data.get("longTermBackground", {})
|
||||
if background.get("summary"):
|
||||
history_sections.append(f"Background: {background['summary']}")
|
||||
|
||||
if history_sections:
|
||||
sections.append("History:\n" + "\n".join(f"- {s}" for s in history_sections))
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ class ConversationContext:
|
||||
timestamp: datetime = field(default_factory=datetime.utcnow)
|
||||
agent_name: str | None = None
|
||||
correction_detected: bool = False
|
||||
reinforcement_detected: bool = False
|
||||
|
||||
|
||||
class MemoryUpdateQueue:
|
||||
@@ -44,6 +45,7 @@ class MemoryUpdateQueue:
|
||||
messages: list[Any],
|
||||
agent_name: str | None = None,
|
||||
correction_detected: bool = False,
|
||||
reinforcement_detected: bool = False,
|
||||
) -> None:
|
||||
"""Add a conversation to the update queue.
|
||||
|
||||
@@ -52,6 +54,7 @@ class MemoryUpdateQueue:
|
||||
messages: The conversation messages.
|
||||
agent_name: If provided, memory is stored per-agent. If None, uses global memory.
|
||||
correction_detected: Whether recent turns include an explicit correction signal.
|
||||
reinforcement_detected: Whether recent turns include a positive reinforcement signal.
|
||||
"""
|
||||
config = get_memory_config()
|
||||
if not config.enabled:
|
||||
@@ -63,11 +66,13 @@ class MemoryUpdateQueue:
|
||||
None,
|
||||
)
|
||||
merged_correction_detected = correction_detected or (existing_context.correction_detected if existing_context is not None else False)
|
||||
merged_reinforcement_detected = reinforcement_detected or (existing_context.reinforcement_detected if existing_context is not None else False)
|
||||
context = ConversationContext(
|
||||
thread_id=thread_id,
|
||||
messages=messages,
|
||||
agent_name=agent_name,
|
||||
correction_detected=merged_correction_detected,
|
||||
reinforcement_detected=merged_reinforcement_detected,
|
||||
)
|
||||
|
||||
# Check if this thread already has a pending update
|
||||
@@ -130,6 +135,7 @@ class MemoryUpdateQueue:
|
||||
thread_id=context.thread_id,
|
||||
agent_name=context.agent_name,
|
||||
correction_detected=context.correction_detected,
|
||||
reinforcement_detected=context.reinforcement_detected,
|
||||
)
|
||||
if success:
|
||||
logger.info("Memory updated successfully for thread %s", context.thread_id)
|
||||
|
||||
@@ -246,7 +246,7 @@ def _fact_content_key(content: Any) -> str | None:
|
||||
stripped = content.strip()
|
||||
if not stripped:
|
||||
return None
|
||||
return stripped
|
||||
return stripped.casefold()
|
||||
|
||||
|
||||
class MemoryUpdater:
|
||||
@@ -272,6 +272,7 @@ class MemoryUpdater:
|
||||
thread_id: str | None = None,
|
||||
agent_name: str | None = None,
|
||||
correction_detected: bool = False,
|
||||
reinforcement_detected: bool = False,
|
||||
) -> bool:
|
||||
"""Update memory based on conversation messages.
|
||||
|
||||
@@ -280,6 +281,7 @@ class MemoryUpdater:
|
||||
thread_id: Optional thread ID for tracking source.
|
||||
agent_name: If provided, updates per-agent memory. If None, updates global memory.
|
||||
correction_detected: Whether recent turns include an explicit correction signal.
|
||||
reinforcement_detected: Whether recent turns include a positive reinforcement signal.
|
||||
|
||||
Returns:
|
||||
True if update was successful, False otherwise.
|
||||
@@ -310,6 +312,14 @@ class MemoryUpdater:
|
||||
"and record the correct approach as a fact with category "
|
||||
'"correction" and confidence >= 0.95 when appropriate.'
|
||||
)
|
||||
if reinforcement_detected:
|
||||
reinforcement_hint = (
|
||||
"IMPORTANT: Positive reinforcement signals were detected in this conversation. "
|
||||
"The user explicitly confirmed the agent's approach was correct or helpful. "
|
||||
"Record the confirmed approach, style, or preference as a fact with category "
|
||||
'"preference" or "behavior" and confidence >= 0.9 when appropriate.'
|
||||
)
|
||||
correction_hint = (correction_hint + "\n" + reinforcement_hint).strip() if correction_hint else reinforcement_hint
|
||||
|
||||
prompt = MEMORY_UPDATE_PROMPT.format(
|
||||
current_memory=json.dumps(current_memory, indent=2),
|
||||
@@ -441,6 +451,7 @@ def update_memory_from_conversation(
|
||||
thread_id: str | None = None,
|
||||
agent_name: str | None = None,
|
||||
correction_detected: bool = False,
|
||||
reinforcement_detected: bool = False,
|
||||
) -> bool:
|
||||
"""Convenience function to update memory from a conversation.
|
||||
|
||||
@@ -449,9 +460,10 @@ def update_memory_from_conversation(
|
||||
thread_id: Optional thread ID.
|
||||
agent_name: If provided, updates per-agent memory. If None, updates global memory.
|
||||
correction_detected: Whether recent turns include an explicit correction signal.
|
||||
reinforcement_detected: Whether recent turns include a positive reinforcement signal.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise.
|
||||
"""
|
||||
updater = MemoryUpdater()
|
||||
return updater.update_memory(messages, thread_id, agent_name, correction_detected)
|
||||
return updater.update_memory(messages, thread_id, agent_name, correction_detected, reinforcement_detected)
|
||||
|
||||
@@ -182,6 +182,23 @@ class LoopDetectionMiddleware(AgentMiddleware[AgentState]):
|
||||
|
||||
return None, False
|
||||
|
||||
@staticmethod
|
||||
def _append_text(content: str | list | None, text: str) -> str | list:
|
||||
"""Append *text* to AIMessage content, handling str, list, and None.
|
||||
|
||||
When content is a list of content blocks (e.g. Anthropic thinking mode),
|
||||
we append a new ``{"type": "text", ...}`` block instead of concatenating
|
||||
a string to a list, which would raise ``TypeError``.
|
||||
"""
|
||||
if content is None:
|
||||
return text
|
||||
if isinstance(content, list):
|
||||
return [*content, {"type": "text", "text": f"\n\n{text}"}]
|
||||
if isinstance(content, str):
|
||||
return content + f"\n\n{text}"
|
||||
# Fallback: coerce unexpected types to str to avoid TypeError
|
||||
return str(content) + f"\n\n{text}"
|
||||
|
||||
def _apply(self, state: AgentState, runtime: Runtime) -> dict | None:
|
||||
warning, hard_stop = self._track_and_check(state, runtime)
|
||||
|
||||
@@ -192,7 +209,7 @@ class LoopDetectionMiddleware(AgentMiddleware[AgentState]):
|
||||
stripped_msg = last_msg.model_copy(
|
||||
update={
|
||||
"tool_calls": [],
|
||||
"content": (last_msg.content or "") + f"\n\n{_HARD_STOP_MSG}",
|
||||
"content": self._append_text(last_msg.content, _HARD_STOP_MSG),
|
||||
}
|
||||
)
|
||||
return {"messages": [stripped_msg]}
|
||||
|
||||
@@ -29,6 +29,22 @@ _CORRECTION_PATTERNS = (
|
||||
re.compile(r"改用"),
|
||||
)
|
||||
|
||||
_REINFORCEMENT_PATTERNS = (
|
||||
re.compile(r"\byes[,.]?\s+(?:exactly|perfect|that(?:'s| is) (?:right|correct|it))\b", re.IGNORECASE),
|
||||
re.compile(r"\bperfect(?:[.!?]|$)", re.IGNORECASE),
|
||||
re.compile(r"\bexactly\s+(?:right|correct)\b", re.IGNORECASE),
|
||||
re.compile(r"\bthat(?:'s| is)\s+(?:exactly\s+)?(?:right|correct|what i (?:wanted|needed|meant))\b", re.IGNORECASE),
|
||||
re.compile(r"\bkeep\s+(?:doing\s+)?that\b", re.IGNORECASE),
|
||||
re.compile(r"\bjust\s+(?:like\s+)?(?:that|this)\b", re.IGNORECASE),
|
||||
re.compile(r"\bthis is (?:great|helpful)\b(?:[.!?]|$)", re.IGNORECASE),
|
||||
re.compile(r"\bthis is what i wanted\b(?:[.!?]|$)", re.IGNORECASE),
|
||||
re.compile(r"对[,,]?\s*就是这样(?:[。!?!?.]|$)"),
|
||||
re.compile(r"完全正确(?:[。!?!?.]|$)"),
|
||||
re.compile(r"(?:对[,,]?\s*)?就是这个意思(?:[。!?!?.]|$)"),
|
||||
re.compile(r"正是我想要的(?:[。!?!?.]|$)"),
|
||||
re.compile(r"继续保持(?:[。!?!?.]|$)"),
|
||||
)
|
||||
|
||||
|
||||
class MemoryMiddlewareState(AgentState):
|
||||
"""Compatible with the `ThreadState` schema."""
|
||||
@@ -132,6 +148,29 @@ def detect_correction(messages: list[Any]) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def detect_reinforcement(messages: list[Any]) -> bool:
    """Report whether a recent user turn contains a positive-reinforcement phrase.

    Only the last six entries of *messages* are inspected, and of those only
    the ones whose ``type`` attribute equals ``"human"``. Each such message is
    reduced to text with ``_extract_message_text`` and searched against every
    pattern in ``_REINFORCEMENT_PATTERNS``.

    Limiting the scan to the tail of the conversation keeps detection
    conservative and avoids picking up stale signals from long histories.

    Args:
        messages: Conversation messages, oldest first.

    Returns:
        ``True`` as soon as one recent user message matches any pattern,
        ``False`` otherwise (including for an empty list).
    """
    for candidate in messages[-6:]:
        if getattr(candidate, "type", None) != "human":
            continue
        text = _extract_message_text(candidate).strip()
        # Blank messages cannot carry a signal; skip the regex pass entirely.
        if text and any(rx.search(text) for rx in _REINFORCEMENT_PATTERNS):
            return True
    return False
|
||||
|
||||
|
||||
class MemoryMiddleware(AgentMiddleware[MemoryMiddlewareState]):
|
||||
"""Middleware that queues conversation for memory update after agent execution.
|
||||
|
||||
@@ -196,12 +235,14 @@ class MemoryMiddleware(AgentMiddleware[MemoryMiddlewareState]):
|
||||
|
||||
# Queue the filtered conversation for memory update
|
||||
correction_detected = detect_correction(filtered_messages)
|
||||
reinforcement_detected = not correction_detected and detect_reinforcement(filtered_messages)
|
||||
queue = get_memory_queue()
|
||||
queue.add(
|
||||
thread_id=thread_id,
|
||||
messages=filtered_messages,
|
||||
agent_name=self._agent_name,
|
||||
correction_detected=correction_detected,
|
||||
reinforcement_detected=reinforcement_detected,
|
||||
)
|
||||
|
||||
return None
|
||||
|
||||
@@ -101,44 +101,33 @@ class TitleMiddleware(AgentMiddleware[TitleMiddlewareState]):
|
||||
return user_msg if user_msg else "New Conversation"
|
||||
|
||||
def _generate_title_result(self, state: TitleMiddlewareState) -> dict | None:
|
||||
"""Synchronously generate a title. Returns state update or None."""
|
||||
"""Generate a local fallback title without blocking on an LLM call."""
|
||||
if not self._should_generate_title(state):
|
||||
return None
|
||||
|
||||
prompt, user_msg = self._build_title_prompt(state)
|
||||
config = get_title_config()
|
||||
model = create_chat_model(name=config.model_name, thinking_enabled=False)
|
||||
|
||||
try:
|
||||
response = model.invoke(prompt)
|
||||
title = self._parse_title(response.content)
|
||||
if not title:
|
||||
title = self._fallback_title(user_msg)
|
||||
except Exception:
|
||||
logger.exception("Failed to generate title (sync)")
|
||||
title = self._fallback_title(user_msg)
|
||||
|
||||
return {"title": title}
|
||||
_, user_msg = self._build_title_prompt(state)
|
||||
return {"title": self._fallback_title(user_msg)}
|
||||
|
||||
async def _agenerate_title_result(self, state: TitleMiddlewareState) -> dict | None:
|
||||
"""Asynchronously generate a title. Returns state update or None."""
|
||||
"""Generate a title asynchronously and fall back locally on failure."""
|
||||
if not self._should_generate_title(state):
|
||||
return None
|
||||
|
||||
prompt, user_msg = self._build_title_prompt(state)
|
||||
config = get_title_config()
|
||||
model = create_chat_model(name=config.model_name, thinking_enabled=False)
|
||||
prompt, user_msg = self._build_title_prompt(state)
|
||||
|
||||
try:
|
||||
if config.model_name:
|
||||
model = create_chat_model(name=config.model_name, thinking_enabled=False)
|
||||
else:
|
||||
model = create_chat_model(thinking_enabled=False)
|
||||
response = await model.ainvoke(prompt)
|
||||
title = self._parse_title(response.content)
|
||||
if not title:
|
||||
title = self._fallback_title(user_msg)
|
||||
if title:
|
||||
return {"title": title}
|
||||
except Exception:
|
||||
logger.exception("Failed to generate title (async)")
|
||||
title = self._fallback_title(user_msg)
|
||||
|
||||
return {"title": title}
|
||||
logger.debug("Failed to generate async title; falling back to local title", exc_info=True)
|
||||
return {"title": self._fallback_title(user_msg)}
|
||||
|
||||
@override
|
||||
def after_model(self, state: TitleMiddlewareState, runtime: Runtime) -> dict | None:
|
||||
|
||||
+1
-1
@@ -138,6 +138,6 @@ def build_subagent_runtime_middlewares(*, lazy_init: bool = True) -> list[AgentM
|
||||
"""Middlewares shared by subagent runtime before subagent-only middlewares."""
|
||||
return _build_runtime_middlewares(
|
||||
include_uploads=False,
|
||||
include_dangling_tool_call_patch=False,
|
||||
include_dangling_tool_call_patch=True,
|
||||
lazy_init=lazy_init,
|
||||
)
|
||||
|
||||
@@ -10,10 +10,52 @@ from langchain_core.messages import HumanMessage
|
||||
from langgraph.runtime import Runtime
|
||||
|
||||
from deerflow.config.paths import Paths, get_paths
|
||||
from deerflow.utils.file_conversion import extract_outline
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_OUTLINE_PREVIEW_LINES = 5
|
||||
|
||||
|
||||
def _extract_outline_for_file(file_path: Path) -> tuple[list[dict], list[str]]:
|
||||
"""Return the document outline and fallback preview for *file_path*.
|
||||
|
||||
Looks for a sibling ``<stem>.md`` file produced by the upload conversion
|
||||
pipeline.
|
||||
|
||||
Returns:
|
||||
(outline, preview) where:
|
||||
- outline: list of ``{title, line}`` dicts (plus optional sentinel).
|
||||
Empty when no headings are found or no .md exists.
|
||||
- preview: first few non-empty lines of the .md, used as a content
|
||||
anchor when outline is empty so the agent has some context.
|
||||
Empty when outline is non-empty (no fallback needed).
|
||||
"""
|
||||
md_path = file_path.with_suffix(".md")
|
||||
if not md_path.is_file():
|
||||
return [], []
|
||||
|
||||
outline = extract_outline(md_path)
|
||||
if outline:
|
||||
logger.debug("Extracted %d outline entries from %s", len(outline), file_path.name)
|
||||
return outline, []
|
||||
|
||||
# outline is empty — read the first few non-empty lines as a content preview
|
||||
preview: list[str] = []
|
||||
try:
|
||||
with md_path.open(encoding="utf-8") as f:
|
||||
for line in f:
|
||||
stripped = line.strip()
|
||||
if stripped:
|
||||
preview.append(stripped)
|
||||
if len(preview) >= _OUTLINE_PREVIEW_LINES:
|
||||
break
|
||||
except Exception:
|
||||
logger.debug("Failed to read preview lines from %s", md_path, exc_info=True)
|
||||
return [], preview
|
||||
|
||||
|
||||
class UploadsMiddlewareState(AgentState):
|
||||
"""State schema for uploads middleware."""
|
||||
|
||||
@@ -39,12 +81,38 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
||||
super().__init__()
|
||||
self._paths = Paths(base_dir) if base_dir else get_paths()
|
||||
|
||||
def _format_file_entry(self, file: dict, lines: list[str]) -> None:
|
||||
"""Append a single file entry (name, size, path, optional outline) to lines."""
|
||||
size_kb = file["size"] / 1024
|
||||
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
|
||||
lines.append(f"- {file['filename']} ({size_str})")
|
||||
lines.append(f" Path: {file['path']}")
|
||||
outline = file.get("outline") or []
|
||||
if outline:
|
||||
truncated = outline[-1].get("truncated", False)
|
||||
visible = [e for e in outline if not e.get("truncated")]
|
||||
lines.append(" Document outline (use `read_file` with line ranges to read sections):")
|
||||
for entry in visible:
|
||||
lines.append(f" L{entry['line']}: {entry['title']}")
|
||||
if truncated:
|
||||
lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)")
|
||||
else:
|
||||
preview = file.get("outline_preview") or []
|
||||
if preview:
|
||||
lines.append(" No structural headings detected. Document begins with:")
|
||||
for text in preview:
|
||||
lines.append(f" > {text}")
|
||||
lines.append(" Use `grep` to search for keywords (e.g. `grep(pattern='keyword', path='/mnt/user-data/uploads/')`).")
|
||||
lines.append("")
|
||||
|
||||
def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
|
||||
"""Create a formatted message listing uploaded files.
|
||||
|
||||
Args:
|
||||
new_files: Files uploaded in the current message.
|
||||
historical_files: Files uploaded in previous messages.
|
||||
Each file dict may contain an optional ``outline`` key — a list of
|
||||
``{title, line}`` dicts extracted from the converted Markdown file.
|
||||
|
||||
Returns:
|
||||
Formatted string inside <uploaded_files> tags.
|
||||
@@ -55,25 +123,24 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
||||
lines.append("")
|
||||
if new_files:
|
||||
for file in new_files:
|
||||
size_kb = file["size"] / 1024
|
||||
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
|
||||
lines.append(f"- {file['filename']} ({size_str})")
|
||||
lines.append(f" Path: {file['path']}")
|
||||
lines.append("")
|
||||
self._format_file_entry(file, lines)
|
||||
else:
|
||||
lines.append("(empty)")
|
||||
lines.append("")
|
||||
|
||||
if historical_files:
|
||||
lines.append("The following files were uploaded in previous messages and are still available:")
|
||||
lines.append("")
|
||||
for file in historical_files:
|
||||
size_kb = file["size"] / 1024
|
||||
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
|
||||
lines.append(f"- {file['filename']} ({size_str})")
|
||||
lines.append(f" Path: {file['path']}")
|
||||
lines.append("")
|
||||
self._format_file_entry(file, lines)
|
||||
|
||||
lines.append("You can read these files using the `read_file` tool with the paths shown above.")
|
||||
lines.append("To work with these files:")
|
||||
lines.append("- Read from the file first — use the outline line numbers and `read_file` to locate relevant sections.")
|
||||
lines.append("- Use `grep` to search for keywords when you are not sure which section to look at")
|
||||
lines.append(" (e.g. `grep(pattern='revenue', path='/mnt/user-data/uploads/')`).")
|
||||
lines.append("- Use `glob` to find files by name pattern")
|
||||
lines.append(" (e.g. `glob(pattern='**/*.md', path='/mnt/user-data/uploads/')`).")
|
||||
lines.append("- Only fall back to web search if the file content is clearly insufficient to answer the question.")
|
||||
lines.append("</uploaded_files>")
|
||||
|
||||
return "\n".join(lines)
|
||||
@@ -147,6 +214,13 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
||||
|
||||
# Resolve uploads directory for existence checks
|
||||
thread_id = (runtime.context or {}).get("thread_id")
|
||||
if thread_id is None:
|
||||
try:
|
||||
from langgraph.config import get_config
|
||||
|
||||
thread_id = get_config().get("configurable", {}).get("thread_id")
|
||||
except RuntimeError:
|
||||
pass # get_config() raises outside a runnable context (e.g. unit tests)
|
||||
uploads_dir = self._paths.sandbox_uploads_dir(thread_id) if thread_id else None
|
||||
|
||||
# Get newly uploaded files from the current message's additional_kwargs.files
|
||||
@@ -159,15 +233,26 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
||||
for file_path in sorted(uploads_dir.iterdir()):
|
||||
if file_path.is_file() and file_path.name not in new_filenames:
|
||||
stat = file_path.stat()
|
||||
outline, preview = _extract_outline_for_file(file_path)
|
||||
historical_files.append(
|
||||
{
|
||||
"filename": file_path.name,
|
||||
"size": stat.st_size,
|
||||
"path": f"/mnt/user-data/uploads/{file_path.name}",
|
||||
"extension": file_path.suffix,
|
||||
"outline": outline,
|
||||
"outline_preview": preview,
|
||||
}
|
||||
)
|
||||
|
||||
# Attach outlines to new files as well
|
||||
if uploads_dir:
|
||||
for file in new_files:
|
||||
phys_path = uploads_dir / file["filename"]
|
||||
outline, preview = _extract_outline_for_file(phys_path)
|
||||
file["outline"] = outline
|
||||
file["outline_preview"] = preview
|
||||
|
||||
if not new_files and not historical_files:
|
||||
return None
|
||||
|
||||
|
||||
@@ -117,6 +117,7 @@ class DeerFlowClient:
|
||||
subagent_enabled: bool = False,
|
||||
plan_mode: bool = False,
|
||||
agent_name: str | None = None,
|
||||
available_skills: set[str] | None = None,
|
||||
middlewares: Sequence[AgentMiddleware] | None = None,
|
||||
):
|
||||
"""Initialize the client.
|
||||
@@ -133,6 +134,7 @@ class DeerFlowClient:
|
||||
subagent_enabled: Enable subagent delegation.
|
||||
plan_mode: Enable TodoList middleware for plan mode.
|
||||
agent_name: Name of the agent to use.
|
||||
available_skills: Optional set of skill names to make available. If None (default), all scanned skills are available.
|
||||
middlewares: Optional list of custom middlewares to inject into the agent.
|
||||
"""
|
||||
if config_path is not None:
|
||||
@@ -148,6 +150,7 @@ class DeerFlowClient:
|
||||
self._subagent_enabled = subagent_enabled
|
||||
self._plan_mode = plan_mode
|
||||
self._agent_name = agent_name
|
||||
self._available_skills = set(available_skills) if available_skills is not None else None
|
||||
self._middlewares = list(middlewares) if middlewares else []
|
||||
|
||||
# Lazy agent — created on first call, recreated when config changes.
|
||||
@@ -208,6 +211,8 @@ class DeerFlowClient:
|
||||
cfg.get("thinking_enabled"),
|
||||
cfg.get("is_plan_mode"),
|
||||
cfg.get("subagent_enabled"),
|
||||
self._agent_name,
|
||||
frozenset(self._available_skills) if self._available_skills is not None else None,
|
||||
)
|
||||
|
||||
if self._agent is not None and self._agent_config_key == key:
|
||||
@@ -226,6 +231,7 @@ class DeerFlowClient:
|
||||
subagent_enabled=subagent_enabled,
|
||||
max_concurrent_subagents=max_concurrent_subagents,
|
||||
agent_name=self._agent_name,
|
||||
available_skills=self._available_skills,
|
||||
),
|
||||
"state_schema": ThreadState,
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import uuid
|
||||
from agent_sandbox import Sandbox as AioSandboxClient
|
||||
|
||||
from deerflow.sandbox.sandbox import Sandbox
|
||||
from deerflow.sandbox.search import GrepMatch, path_matches, should_ignore_path, truncate_line
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -135,6 +136,86 @@ class AioSandbox(Sandbox):
|
||||
logger.error(f"Failed to write file in sandbox: {e}")
|
||||
raise
|
||||
|
||||
def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
    """Find paths under *path* that match the glob *pattern*.

    Args:
        path: Root to search from.
        pattern: Glob pattern. With ``include_dirs=False`` it is passed to the
            sandbox's ``find_files`` call; otherwise it is matched locally
            against each entry's path relative to *path*.
        include_dirs: When true, use a recursive listing (hidden entries
            excluded) as the candidate set instead of ``find_files``.
        max_results: Upper bound on the number of returned paths.

    Returns:
        ``(paths, truncated)``. In the ``find_files`` branch ``truncated`` is
        true only when more than *max_results* paths survived filtering. In
        the listing branch it is true as soon as *max_results* matches have
        been collected.
    """
    if include_dirs:
        listing = self._client.file.list_path(path=path, recursive=True, show_hidden=False)
        entries = listing.data.files if listing.data and listing.data.files else []

        # Normalise the root so "/a/b/" and "/a/b" behave the same, and so a
        # sibling such as "/a/bc" is not mistaken for a child of "/a/b".
        base = path.rstrip("/") or "/"
        child_prefix = base if base == "/" else f"{base}/"

        found: list[str] = []
        for entry in entries:
            inside_root = entry.path == base or entry.path.startswith(child_prefix)
            if not inside_root or should_ignore_path(entry.path):
                continue
            relative = entry.path[len(base) :].lstrip("/")
            if not path_matches(pattern, relative):
                continue
            found.append(entry.path)
            if len(found) >= max_results:
                return found, True
        return found, False

    response = self._client.file.find_files(path=path, glob=pattern)
    reported = response.data.files if response.data and response.data.files else []
    kept = [candidate for candidate in reported if not should_ignore_path(candidate)]
    return kept[:max_results], len(kept) > max_results
|
||||
|
||||
def grep(
    self,
    path: str,
    pattern: str,
    *,
    glob: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = 100,
) -> tuple[list[GrepMatch], bool]:
    """Search files under *path* for lines matching *pattern*.

    Args:
        path: Root to search from.
        pattern: Regular expression, or literal text when *literal* is true.
        glob: Optional glob restricting candidate files. When given,
            candidates come from ``find_files``; otherwise from a recursive
            listing (hidden entries excluded) with directories dropped.
        literal: Escape *pattern* so it is matched verbatim.
        case_sensitive: When false, the expression sent to the sandbox is
            prefixed with ``(?i)``.
        max_results: Stop after collecting this many matches.

    Returns:
        ``(matches, truncated)``; ``truncated`` is true when the search
        stopped because *max_results* was reached.

    Raises:
        re.error: If the (possibly escaped) pattern is not a valid regex.
    """
    import re as _re

    source = _re.escape(pattern) if literal else pattern
    # Compile locally purely for validation, so a bad pattern surfaces as
    # re.error rather than as a generic remote API failure. The original
    # comment says grep_tool catches re.error.
    _re.compile(source, 0 if case_sensitive else _re.IGNORECASE)
    remote_regex = source if case_sensitive else f"(?i){source}"

    if glob is None:
        listing = self._client.file.list_path(path=path, recursive=True, show_hidden=False)
        listed = listing.data.files if listing.data and listing.data.files else []
        candidates = [item.path for item in listed if not item.is_directory]
    else:
        located = self._client.file.find_files(path=path, glob=glob)
        candidates = located.data.files if located.data and located.data.files else []

    hits: list[GrepMatch] = []
    for candidate in candidates:
        if should_ignore_path(candidate):
            continue

        # One remote call per candidate file.
        payload = self._client.file.search_in_file(file=candidate, regex=remote_regex).data
        if payload is None:
            continue

        numbered = zip(payload.line_numbers or [], payload.matches or [])
        for number, text in numbered:
            hit = GrepMatch(
                path=candidate,
                # A non-integer line number from the API is reported as 0.
                line_number=number if isinstance(number, int) else 0,
                line=truncate_line(text),
            )
            hits.append(hit)
            if len(hits) >= max_results:
                return hits, True

    return hits, False
|
||||
|
||||
def update_file(self, path: str, content: bytes) -> None:
|
||||
"""Update a file with binary content in the sandbox.
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
from contextvars import ContextVar
|
||||
from pathlib import Path
|
||||
from typing import Any, Self
|
||||
|
||||
@@ -10,15 +11,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
||||
from deerflow.config.acp_config import load_acp_config_from_dict
|
||||
from deerflow.config.checkpointer_config import CheckpointerConfig, load_checkpointer_config_from_dict
|
||||
from deerflow.config.extensions_config import ExtensionsConfig
|
||||
from deerflow.config.guardrails_config import load_guardrails_config_from_dict
|
||||
from deerflow.config.memory_config import load_memory_config_from_dict
|
||||
from deerflow.config.guardrails_config import GuardrailsConfig, load_guardrails_config_from_dict
|
||||
from deerflow.config.memory_config import MemoryConfig, load_memory_config_from_dict
|
||||
from deerflow.config.model_config import ModelConfig
|
||||
from deerflow.config.sandbox_config import SandboxConfig
|
||||
from deerflow.config.skills_config import SkillsConfig
|
||||
from deerflow.config.stream_bridge_config import StreamBridgeConfig, load_stream_bridge_config_from_dict
|
||||
from deerflow.config.subagents_config import load_subagents_config_from_dict
|
||||
from deerflow.config.summarization_config import load_summarization_config_from_dict
|
||||
from deerflow.config.title_config import load_title_config_from_dict
|
||||
from deerflow.config.subagents_config import SubagentsAppConfig, load_subagents_config_from_dict
|
||||
from deerflow.config.summarization_config import SummarizationConfig, load_summarization_config_from_dict
|
||||
from deerflow.config.title_config import TitleConfig, load_title_config_from_dict
|
||||
from deerflow.config.token_usage_config import TokenUsageConfig
|
||||
from deerflow.config.tool_config import ToolConfig, ToolGroupConfig
|
||||
from deerflow.config.tool_search_config import ToolSearchConfig, load_tool_search_config_from_dict
|
||||
@@ -28,6 +29,13 @@ load_dotenv()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _default_config_candidates() -> tuple[Path, ...]:
|
||||
"""Return deterministic config.yaml locations without relying on cwd."""
|
||||
backend_dir = Path(__file__).resolve().parents[4]
|
||||
repo_root = backend_dir.parent
|
||||
return (backend_dir / "config.yaml", repo_root / "config.yaml")
|
||||
|
||||
|
||||
class AppConfig(BaseModel):
|
||||
"""Config for the DeerFlow application"""
|
||||
|
||||
@@ -40,6 +48,11 @@ class AppConfig(BaseModel):
|
||||
skills: SkillsConfig = Field(default_factory=SkillsConfig, description="Skills configuration")
|
||||
extensions: ExtensionsConfig = Field(default_factory=ExtensionsConfig, description="Extensions configuration (MCP servers and skills state)")
|
||||
tool_search: ToolSearchConfig = Field(default_factory=ToolSearchConfig, description="Tool search / deferred loading configuration")
|
||||
title: TitleConfig = Field(default_factory=TitleConfig, description="Automatic title generation configuration")
|
||||
summarization: SummarizationConfig = Field(default_factory=SummarizationConfig, description="Conversation summarization configuration")
|
||||
memory: MemoryConfig = Field(default_factory=MemoryConfig, description="Memory subsystem configuration")
|
||||
subagents: SubagentsAppConfig = Field(default_factory=SubagentsAppConfig, description="Subagent runtime configuration")
|
||||
guardrails: GuardrailsConfig = Field(default_factory=GuardrailsConfig, description="Guardrail middleware configuration")
|
||||
model_config = ConfigDict(extra="allow", frozen=False)
|
||||
checkpointer: CheckpointerConfig | None = Field(default=None, description="Checkpointer configuration")
|
||||
stream_bridge: StreamBridgeConfig | None = Field(default=None, description="Stream bridge configuration")
|
||||
@@ -51,7 +64,7 @@ class AppConfig(BaseModel):
|
||||
Priority:
|
||||
1. If provided `config_path` argument, use it.
|
||||
2. If provided `DEER_FLOW_CONFIG_PATH` environment variable, use it.
|
||||
3. Otherwise, first check the `config.yaml` in the current directory, then fallback to `config.yaml` in the parent directory.
|
||||
3. Otherwise, search deterministic backend/repository-root defaults from `_default_config_candidates()`.
|
||||
"""
|
||||
if config_path:
|
||||
path = Path(config_path)
|
||||
@@ -64,14 +77,10 @@ class AppConfig(BaseModel):
|
||||
raise FileNotFoundError(f"Config file specified by environment variable `DEER_FLOW_CONFIG_PATH` not found at {path}")
|
||||
return path
|
||||
else:
|
||||
# Check if the config.yaml is in the current directory
|
||||
path = Path(os.getcwd()) / "config.yaml"
|
||||
if not path.exists():
|
||||
# Check if the config.yaml is in the parent directory of CWD
|
||||
path = Path(os.getcwd()).parent / "config.yaml"
|
||||
if not path.exists():
|
||||
raise FileNotFoundError("`config.yaml` file not found at the current directory nor its parent directory")
|
||||
return path
|
||||
for path in _default_config_candidates():
|
||||
if path.exists():
|
||||
return path
|
||||
raise FileNotFoundError("`config.yaml` file not found at the default backend or repository root locations")
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, config_path: str | None = None) -> Self:
|
||||
@@ -244,6 +253,8 @@ _app_config: AppConfig | None = None
|
||||
_app_config_path: Path | None = None
|
||||
_app_config_mtime: float | None = None
|
||||
_app_config_is_custom = False
|
||||
_current_app_config: ContextVar[AppConfig | None] = ContextVar("deerflow_current_app_config", default=None)
|
||||
_current_app_config_stack: ContextVar[tuple[AppConfig | None, ...]] = ContextVar("deerflow_current_app_config_stack", default=())
|
||||
|
||||
|
||||
def _get_config_mtime(config_path: Path) -> float | None:
|
||||
@@ -276,6 +287,10 @@ def get_app_config() -> AppConfig:
|
||||
"""
|
||||
global _app_config, _app_config_path, _app_config_mtime
|
||||
|
||||
runtime_override = _current_app_config.get()
|
||||
if runtime_override is not None:
|
||||
return runtime_override
|
||||
|
||||
if _app_config is not None and _app_config_is_custom:
|
||||
return _app_config
|
||||
|
||||
@@ -337,3 +352,26 @@ def set_app_config(config: AppConfig) -> None:
|
||||
_app_config_path = None
|
||||
_app_config_mtime = None
|
||||
_app_config_is_custom = True
|
||||
|
||||
|
||||
def peek_current_app_config() -> AppConfig | None:
    """Look up the runtime-scoped AppConfig override without changing it.

    Returns:
        The value currently held by the `_current_app_config` context
        variable, or None when no override is active.
    """
    active_override = _current_app_config.get()
    return active_override
|
||||
|
||||
|
||||
def push_current_app_config(config: AppConfig) -> None:
    """Install *config* as the runtime-scoped override for this context.

    The override that was active before the call (possibly None) is appended
    to the `_current_app_config_stack` tuple so that
    `pop_current_app_config()` can restore it later.

    Args:
        config: The AppConfig to make current.
    """
    outer_override = _current_app_config.get()
    saved = _current_app_config_stack.get()
    # The stack is an immutable tuple: build a new one instead of mutating.
    _current_app_config_stack.set((*saved, outer_override))
    _current_app_config.set(config)
|
||||
|
||||
|
||||
def pop_current_app_config() -> None:
    """Undo the most recent `push_current_app_config()` in this context.

    With a non-empty stack, the last saved override is removed from the stack
    and becomes the current override again. With an empty stack, the current
    override is simply cleared to None.
    """
    saved = _current_app_config_stack.get()
    if saved:
        *remaining, previous = saved
        _current_app_config_stack.set(tuple(remaining))
        _current_app_config.set(previous)
    else:
        # Nothing was pushed: fall back to "no override".
        _current_app_config.set(None)
|
||||
|
||||
@@ -80,6 +80,12 @@ class ExtensionsConfig(BaseModel):
|
||||
Args:
|
||||
config_path: Optional path to extensions config file.
|
||||
|
||||
Resolution order:
|
||||
1. If provided `config_path` argument, use it.
|
||||
2. If provided `DEER_FLOW_EXTENSIONS_CONFIG_PATH` environment variable, use it.
|
||||
3. Otherwise, search backend/repository-root defaults for
|
||||
`extensions_config.json`, then legacy `mcp_config.json`.
|
||||
|
||||
Returns:
|
||||
Path to the extensions config file if found, otherwise None.
|
||||
"""
|
||||
@@ -94,24 +100,16 @@ class ExtensionsConfig(BaseModel):
|
||||
raise FileNotFoundError(f"Extensions config file specified by environment variable `DEER_FLOW_EXTENSIONS_CONFIG_PATH` not found at {path}")
|
||||
return path
|
||||
else:
|
||||
# Check if the extensions_config.json is in the current directory
|
||||
path = Path(os.getcwd()) / "extensions_config.json"
|
||||
if path.exists():
|
||||
return path
|
||||
|
||||
# Check if the extensions_config.json is in the parent directory of CWD
|
||||
path = Path(os.getcwd()).parent / "extensions_config.json"
|
||||
if path.exists():
|
||||
return path
|
||||
|
||||
# Backward compatibility: check for mcp_config.json
|
||||
path = Path(os.getcwd()) / "mcp_config.json"
|
||||
if path.exists():
|
||||
return path
|
||||
|
||||
path = Path(os.getcwd()).parent / "mcp_config.json"
|
||||
if path.exists():
|
||||
return path
|
||||
backend_dir = Path(__file__).resolve().parents[4]
|
||||
repo_root = backend_dir.parent
|
||||
for path in (
|
||||
backend_dir / "extensions_config.json",
|
||||
repo_root / "extensions_config.json",
|
||||
backend_dir / "mcp_config.json",
|
||||
repo_root / "mcp_config.json",
|
||||
):
|
||||
if path.exists():
|
||||
return path
|
||||
|
||||
# Extensions are optional, so return None if not found
|
||||
return None
|
||||
|
||||
@@ -9,6 +9,12 @@ VIRTUAL_PATH_PREFIX = "/mnt/user-data"
|
||||
_SAFE_THREAD_ID_RE = re.compile(r"^[A-Za-z0-9_\-]+$")
|
||||
|
||||
|
||||
def _default_local_base_dir() -> Path:
    """Locate the repo-local DeerFlow state directory.

    The location is computed from this module's file path, so it does not
    depend on the process working directory.

    Returns:
        The `.deer-flow` directory inside the fifth ancestor directory of this
        file (treated as the backend directory).
    """
    module_file = Path(__file__).resolve()
    return module_file.parents[4] / ".deer-flow"
|
||||
|
||||
|
||||
def _validate_thread_id(thread_id: str) -> str:
|
||||
"""Validate a thread ID before using it in filesystem paths."""
|
||||
if not _SAFE_THREAD_ID_RE.match(thread_id):
|
||||
@@ -67,8 +73,7 @@ class Paths:
|
||||
BaseDir resolution (in priority order):
|
||||
1. Constructor argument `base_dir`
|
||||
2. DEER_FLOW_HOME environment variable
|
||||
3. Local dev fallback: cwd/.deer-flow (when cwd is the backend/ dir)
|
||||
4. Default: $HOME/.deer-flow
|
||||
3. Repo-local fallback derived from this module path: `{backend_dir}/.deer-flow`
|
||||
"""
|
||||
|
||||
def __init__(self, base_dir: str | Path | None = None) -> None:
|
||||
@@ -104,11 +109,7 @@ class Paths:
|
||||
if env_home := os.getenv("DEER_FLOW_HOME"):
|
||||
return Path(env_home).resolve()
|
||||
|
||||
cwd = Path.cwd()
|
||||
if cwd.name == "backend" or (cwd / "pyproject.toml").exists():
|
||||
return cwd / ".deer-flow"
|
||||
|
||||
return Path.home() / ".deer-flow"
|
||||
return _default_local_base_dir()
|
||||
|
||||
@property
|
||||
def memory_file(self) -> Path:
|
||||
|
||||
@@ -3,6 +3,11 @@ from pathlib import Path
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
def _default_repo_root() -> Path:
    """Compute the repository root from this module's file location.

    Returns:
        The sixth ancestor directory of this file (`parents[5]` of the
        resolved path); the current working directory is never consulted.
    """
    module_file = Path(__file__).resolve()
    ancestors = module_file.parents
    return ancestors[5]
|
||||
|
||||
|
||||
class SkillsConfig(BaseModel):
|
||||
"""Configuration for skills system"""
|
||||
|
||||
@@ -26,8 +31,8 @@ class SkillsConfig(BaseModel):
|
||||
# Use configured path (can be absolute or relative)
|
||||
path = Path(self.path)
|
||||
if not path.is_absolute():
|
||||
# If relative, resolve from current working directory
|
||||
path = Path.cwd() / path
|
||||
# If relative, resolve from the repo root for deterministic behavior.
|
||||
path = _default_repo_root() / path
|
||||
return path.resolve()
|
||||
else:
|
||||
# Default: ../skills relative to backend directory
|
||||
|
||||
@@ -15,6 +15,11 @@ class SubagentOverrideConfig(BaseModel):
|
||||
ge=1,
|
||||
description="Timeout in seconds for this subagent (None = use global default)",
|
||||
)
|
||||
max_turns: int | None = Field(
|
||||
default=None,
|
||||
ge=1,
|
||||
description="Maximum turns for this subagent (None = use global or builtin default)",
|
||||
)
|
||||
|
||||
|
||||
class SubagentsAppConfig(BaseModel):
|
||||
@@ -25,6 +30,11 @@ class SubagentsAppConfig(BaseModel):
|
||||
ge=1,
|
||||
description="Default timeout in seconds for all subagents (default: 900 = 15 minutes)",
|
||||
)
|
||||
max_turns: int | None = Field(
|
||||
default=None,
|
||||
ge=1,
|
||||
description="Optional default max-turn override for all subagents (None = keep builtin defaults)",
|
||||
)
|
||||
agents: dict[str, SubagentOverrideConfig] = Field(
|
||||
default_factory=dict,
|
||||
description="Per-agent configuration overrides keyed by agent name",
|
||||
@@ -44,6 +54,15 @@ class SubagentsAppConfig(BaseModel):
|
||||
return override.timeout_seconds
|
||||
return self.timeout_seconds
|
||||
|
||||
def get_max_turns_for(self, agent_name: str, builtin_default: int) -> int:
    """Resolve the max-turn limit that applies to one agent.

    Precedence, highest first: the agent's own `max_turns` override, this
    config's `max_turns`, then *builtin_default*. A value of None at any
    level means "not set" and defers to the next level.

    Args:
        agent_name: Key to look up in `self.agents`.
        builtin_default: Value returned when no override is configured.

    Returns:
        The first non-None limit in precedence order.
    """
    agent_override = self.agents.get(agent_name)
    candidates = (
        agent_override.max_turns if agent_override is not None else None,
        self.max_turns,
    )
    for candidate in candidates:
        if candidate is not None:
            return candidate
    return builtin_default
|
||||
|
||||
|
||||
_subagents_config: SubagentsAppConfig = SubagentsAppConfig()
|
||||
|
||||
@@ -58,8 +77,26 @@ def load_subagents_config_from_dict(config_dict: dict) -> None:
|
||||
global _subagents_config
|
||||
_subagents_config = SubagentsAppConfig(**config_dict)
|
||||
|
||||
overrides_summary = {name: f"{override.timeout_seconds}s" for name, override in _subagents_config.agents.items() if override.timeout_seconds is not None}
|
||||
overrides_summary = {}
|
||||
for name, override in _subagents_config.agents.items():
|
||||
parts = []
|
||||
if override.timeout_seconds is not None:
|
||||
parts.append(f"timeout={override.timeout_seconds}s")
|
||||
if override.max_turns is not None:
|
||||
parts.append(f"max_turns={override.max_turns}")
|
||||
if parts:
|
||||
overrides_summary[name] = ", ".join(parts)
|
||||
|
||||
if overrides_summary:
|
||||
logger.info(f"Subagents config loaded: default timeout={_subagents_config.timeout_seconds}s, per-agent overrides={overrides_summary}")
|
||||
logger.info(
|
||||
"Subagents config loaded: default timeout=%ss, default max_turns=%s, per-agent overrides=%s",
|
||||
_subagents_config.timeout_seconds,
|
||||
_subagents_config.max_turns,
|
||||
overrides_summary,
|
||||
)
|
||||
else:
|
||||
logger.info(f"Subagents config loaded: default timeout={_subagents_config.timeout_seconds}s, no per-agent overrides")
|
||||
logger.info(
|
||||
"Subagents config loaded: default timeout=%ss, default max_turns=%s, no per-agent overrides",
|
||||
_subagents_config.timeout_seconds,
|
||||
_subagents_config.max_turns,
|
||||
)
|
||||
|
||||
@@ -25,6 +25,7 @@ class MemoryStreamBridge(StreamBridge):
|
||||
self._maxsize = queue_maxsize
|
||||
self._queues: dict[str, asyncio.Queue[StreamEvent]] = {}
|
||||
self._counters: dict[str, int] = {}
|
||||
self._dropped_counts: dict[str, int] = {}
|
||||
|
||||
# -- helpers ---------------------------------------------------------------
|
||||
|
||||
@@ -32,6 +33,7 @@ class MemoryStreamBridge(StreamBridge):
|
||||
if run_id not in self._queues:
|
||||
self._queues[run_id] = asyncio.Queue(maxsize=self._maxsize)
|
||||
self._counters[run_id] = 0
|
||||
self._dropped_counts[run_id] = 0
|
||||
return self._queues[run_id]
|
||||
|
||||
def _next_id(self, run_id: str) -> str:
|
||||
@@ -48,14 +50,41 @@ class MemoryStreamBridge(StreamBridge):
|
||||
try:
|
||||
await asyncio.wait_for(queue.put(entry), timeout=_PUBLISH_TIMEOUT)
|
||||
except TimeoutError:
|
||||
logger.warning("Stream bridge queue full for run %s — dropping event %s", run_id, event)
|
||||
self._dropped_counts[run_id] = self._dropped_counts.get(run_id, 0) + 1
|
||||
logger.warning(
|
||||
"Stream bridge queue full for run %s — dropping event %s (total dropped: %d)",
|
||||
run_id,
|
||||
event,
|
||||
self._dropped_counts[run_id],
|
||||
)
|
||||
|
||||
async def publish_end(self, run_id: str) -> None:
|
||||
queue = self._get_or_create_queue(run_id)
|
||||
try:
|
||||
await asyncio.wait_for(queue.put(END_SENTINEL), timeout=_PUBLISH_TIMEOUT)
|
||||
except TimeoutError:
|
||||
logger.warning("Stream bridge queue full for run %s — dropping END sentinel", run_id)
|
||||
|
||||
# END sentinel is critical — it is the only signal that allows
|
||||
# subscribers to terminate. If the queue is full we evict the
|
||||
# oldest *regular* events to make room rather than dropping END,
|
||||
# which would cause the SSE connection to hang forever and leak
|
||||
# the queue/counter resources for this run_id.
|
||||
if queue.full():
|
||||
evicted = 0
|
||||
while queue.full():
|
||||
try:
|
||||
queue.get_nowait()
|
||||
evicted += 1
|
||||
except asyncio.QueueEmpty:
|
||||
break # pragma: no cover – defensive
|
||||
if evicted:
|
||||
logger.warning(
|
||||
"Stream bridge queue full for run %s — evicted %d event(s) to guarantee END sentinel delivery",
|
||||
run_id,
|
||||
evicted,
|
||||
)
|
||||
|
||||
# After eviction the queue is guaranteed to have space, so a
|
||||
# simple non-blocking put is safe. We still use put() (which
|
||||
# blocks until space is available) as a defensive measure.
|
||||
await queue.put(END_SENTINEL)
|
||||
|
||||
async def subscribe(
|
||||
self,
|
||||
@@ -84,7 +113,18 @@ class MemoryStreamBridge(StreamBridge):
|
||||
await asyncio.sleep(delay)
|
||||
self._queues.pop(run_id, None)
|
||||
self._counters.pop(run_id, None)
|
||||
self._dropped_counts.pop(run_id, None)
|
||||
|
||||
async def close(self) -> None:
    """Drop all per-run state held by this bridge.

    Empties the queue, event-counter, and dropped-event-counter mappings.
    """
    for registry in (self._queues, self._counters, self._dropped_counts):
        registry.clear()
|
||||
|
||||
def dropped_count(self, run_id: str) -> int:
    """Report how many events have been dropped for *run_id*.

    Args:
        run_id: Key into the per-run dropped-event counters.

    Returns:
        The recorded count, or 0 when no counter exists for *run_id*.
    """
    counts = self._dropped_counts
    return counts[run_id] if run_id in counts else 0
|
||||
|
||||
@property
def dropped_total(self) -> int:
    """Sum of the dropped-event counters of every run currently tracked."""
    total = 0
    for count in self._dropped_counts.values():
        total += count
    return total
|
||||
|
||||
@@ -1,72 +1,6 @@
|
||||
import fnmatch
|
||||
from pathlib import Path
|
||||
|
||||
IGNORE_PATTERNS = [
|
||||
# Version Control
|
||||
".git",
|
||||
".svn",
|
||||
".hg",
|
||||
".bzr",
|
||||
# Dependencies
|
||||
"node_modules",
|
||||
"__pycache__",
|
||||
".venv",
|
||||
"venv",
|
||||
".env",
|
||||
"env",
|
||||
".tox",
|
||||
".nox",
|
||||
".eggs",
|
||||
"*.egg-info",
|
||||
"site-packages",
|
||||
# Build outputs
|
||||
"dist",
|
||||
"build",
|
||||
".next",
|
||||
".nuxt",
|
||||
".output",
|
||||
".turbo",
|
||||
"target",
|
||||
"out",
|
||||
# IDE & Editor
|
||||
".idea",
|
||||
".vscode",
|
||||
"*.swp",
|
||||
"*.swo",
|
||||
"*~",
|
||||
".project",
|
||||
".classpath",
|
||||
".settings",
|
||||
# OS generated
|
||||
".DS_Store",
|
||||
"Thumbs.db",
|
||||
"desktop.ini",
|
||||
"*.lnk",
|
||||
# Logs & temp files
|
||||
"*.log",
|
||||
"*.tmp",
|
||||
"*.temp",
|
||||
"*.bak",
|
||||
"*.cache",
|
||||
".cache",
|
||||
"logs",
|
||||
# Coverage & test artifacts
|
||||
".coverage",
|
||||
"coverage",
|
||||
".nyc_output",
|
||||
"htmlcov",
|
||||
".pytest_cache",
|
||||
".mypy_cache",
|
||||
".ruff_cache",
|
||||
]
|
||||
|
||||
|
||||
def _should_ignore(name: str) -> bool:
|
||||
"""Check if a file/directory name matches any ignore pattern."""
|
||||
for pattern in IGNORE_PATTERNS:
|
||||
if fnmatch.fnmatch(name, pattern):
|
||||
return True
|
||||
return False
|
||||
from deerflow.sandbox.search import should_ignore_name
|
||||
|
||||
|
||||
def list_dir(path: str, max_depth: int = 2) -> list[str]:
|
||||
@@ -95,7 +29,7 @@ def list_dir(path: str, max_depth: int = 2) -> list[str]:
|
||||
|
||||
try:
|
||||
for item in current_path.iterdir():
|
||||
if _should_ignore(item.name):
|
||||
if should_ignore_name(item.name):
|
||||
continue
|
||||
|
||||
post_fix = "/" if item.is_dir() else ""
|
||||
|
||||
@@ -1,11 +1,23 @@
|
||||
import errno
|
||||
import ntpath
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from deerflow.sandbox.local.list_dir import list_dir
|
||||
from deerflow.sandbox.sandbox import Sandbox
|
||||
from deerflow.sandbox.search import GrepMatch, find_glob_matches, find_grep_matches
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PathMapping:
    """Immutable association between a container path and a local path.

    Instances are frozen, so a mapping cannot be altered after creation.
    """

    # Path as it is addressed from inside the sandbox.
    container_path: str
    # Path on the local filesystem that backs `container_path`.
    local_path: str
    # When True the mapping is treated as read-only; defaults to writable.
    read_only: bool = False
|
||||
|
||||
|
||||
class LocalSandbox(Sandbox):
|
||||
@@ -39,17 +51,42 @@ class LocalSandbox(Sandbox):
|
||||
|
||||
return None
|
||||
|
||||
def __init__(self, id: str, path_mappings: dict[str, str] | None = None):
|
||||
def __init__(self, id: str, path_mappings: list[PathMapping] | None = None):
|
||||
"""
|
||||
Initialize local sandbox with optional path mappings.
|
||||
|
||||
Args:
|
||||
id: Sandbox identifier
|
||||
path_mappings: Dictionary mapping container paths to local paths
|
||||
Example: {"/mnt/skills": "/absolute/path/to/skills"}
|
||||
path_mappings: List of path mappings with optional read-only flag.
|
||||
Skills directory is read-only by default.
|
||||
"""
|
||||
super().__init__(id)
|
||||
self.path_mappings = path_mappings or {}
|
||||
self.path_mappings = path_mappings or []
|
||||
|
||||
def _is_read_only_path(self, resolved_path: str) -> bool:
    """Check whether a resolved local path lies under a read-only mapping.

    Every entry of `self.path_mappings` whose resolved `local_path` equals
    *resolved_path* or is one of its ancestors is a candidate. When several
    candidates match (nested mounts), the one with the longest resolved
    local path wins, so a more specific mount overrides its parent.

    Containment is tested component-wise with `Path.is_relative_to` rather
    than with a string prefix plus separator. The string form never matched
    descendants of a mapping whose local path resolves to the filesystem
    root, because appending a separator to "/" yields "//".

    Args:
        resolved_path: A local filesystem path; it is resolved again here
            before comparison.

    Returns:
        The `read_only` flag of the most specific matching mapping, or
        False when no mapping contains the path.
    """
    target = Path(resolved_path).resolve()

    best_mapping = None
    best_prefix_len = -1

    for mapping in self.path_mappings:
        local_root = Path(mapping.local_path).resolve()
        if not target.is_relative_to(local_root):
            continue
        # Longest resolved local path == most specific mount.
        prefix_len = len(str(local_root))
        if prefix_len > best_prefix_len:
            best_prefix_len = prefix_len
            best_mapping = mapping

    return best_mapping is not None and best_mapping.read_only
|
||||
|
||||
def _resolve_path(self, path: str) -> str:
|
||||
"""
|
||||
@@ -64,7 +101,9 @@ class LocalSandbox(Sandbox):
|
||||
path_str = str(path)
|
||||
|
||||
# Try each mapping (longest prefix first for more specific matches)
|
||||
for container_path, local_path in sorted(self.path_mappings.items(), key=lambda x: len(x[0]), reverse=True):
|
||||
for mapping in sorted(self.path_mappings, key=lambda m: len(m.container_path), reverse=True):
|
||||
container_path = mapping.container_path
|
||||
local_path = mapping.local_path
|
||||
if path_str == container_path or path_str.startswith(container_path + "/"):
|
||||
# Replace the container path prefix with local path
|
||||
relative = path_str[len(container_path) :].lstrip("/")
|
||||
@@ -84,15 +123,16 @@ class LocalSandbox(Sandbox):
|
||||
Returns:
|
||||
Container path if mapping exists, otherwise original path
|
||||
"""
|
||||
path_str = str(Path(path).resolve())
|
||||
normalized_path = path.replace("\\", "/")
|
||||
path_str = str(Path(normalized_path).resolve())
|
||||
|
||||
# Try each mapping (longest local path first for more specific matches)
|
||||
for container_path, local_path in sorted(self.path_mappings.items(), key=lambda x: len(x[1]), reverse=True):
|
||||
local_path_resolved = str(Path(local_path).resolve())
|
||||
if path_str.startswith(local_path_resolved):
|
||||
for mapping in sorted(self.path_mappings, key=lambda m: len(m.local_path), reverse=True):
|
||||
local_path_resolved = str(Path(mapping.local_path).resolve())
|
||||
if path_str == local_path_resolved or path_str.startswith(local_path_resolved + "/"):
|
||||
# Replace the local path prefix with container path
|
||||
relative = path_str[len(local_path_resolved) :].lstrip("/")
|
||||
resolved = f"{container_path}/{relative}" if relative else container_path
|
||||
resolved = f"{mapping.container_path}/{relative}" if relative else mapping.container_path
|
||||
return resolved
|
||||
|
||||
# No mapping found, return original path
|
||||
@@ -111,7 +151,7 @@ class LocalSandbox(Sandbox):
|
||||
import re
|
||||
|
||||
# Sort mappings by local path length (longest first) for correct prefix matching
|
||||
sorted_mappings = sorted(self.path_mappings.items(), key=lambda x: len(x[1]), reverse=True)
|
||||
sorted_mappings = sorted(self.path_mappings, key=lambda m: len(m.local_path), reverse=True)
|
||||
|
||||
if not sorted_mappings:
|
||||
return output
|
||||
@@ -119,12 +159,11 @@ class LocalSandbox(Sandbox):
|
||||
# Create pattern that matches absolute paths
|
||||
# Match paths like /Users/... or other absolute paths
|
||||
result = output
|
||||
for container_path, local_path in sorted_mappings:
|
||||
local_path_resolved = str(Path(local_path).resolve())
|
||||
for mapping in sorted_mappings:
|
||||
# Escape the local path for use in regex
|
||||
escaped_local = re.escape(local_path_resolved)
|
||||
# Match the local path followed by optional path components
|
||||
pattern = re.compile(escaped_local + r"(?:/[^\s\"';&|<>()]*)?")
|
||||
escaped_local = re.escape(str(Path(mapping.local_path).resolve()))
|
||||
# Match the local path followed by optional path components with either separator
|
||||
pattern = re.compile(escaped_local + r"(?:[/\\][^\s\"';&|<>()]*)?")
|
||||
|
||||
def replace_match(match: re.Match) -> str:
|
||||
matched_path = match.group(0)
|
||||
@@ -147,7 +186,7 @@ class LocalSandbox(Sandbox):
|
||||
import re
|
||||
|
||||
# Sort mappings by length (longest first) for correct prefix matching
|
||||
sorted_mappings = sorted(self.path_mappings.items(), key=lambda x: len(x[0]), reverse=True)
|
||||
sorted_mappings = sorted(self.path_mappings, key=lambda m: len(m.container_path), reverse=True)
|
||||
|
||||
# Build regex pattern to match all container paths
|
||||
# Match container path followed by optional path components
|
||||
@@ -157,7 +196,7 @@ class LocalSandbox(Sandbox):
|
||||
# Create pattern that matches any of the container paths.
|
||||
# The lookahead (?=/|$|...) ensures we only match at a path-segment boundary,
|
||||
# preventing /mnt/skills from matching inside /mnt/skills-extra.
|
||||
patterns = [re.escape(container_path) + r"(?=/|$|[\s\"';&|<>()])(?:/[^\s\"';&|<>()]*)?" for container_path, _ in sorted_mappings]
|
||||
patterns = [re.escape(m.container_path) + r"(?=/|$|[\s\"';&|<>()])(?:/[^\s\"';&|<>()]*)?" for m in sorted_mappings]
|
||||
pattern = re.compile("|".join(f"({p})" for p in patterns))
|
||||
|
||||
def replace_match(match: re.Match) -> str:
|
||||
@@ -248,6 +287,8 @@ class LocalSandbox(Sandbox):
|
||||
|
||||
def write_file(self, path: str, content: str, append: bool = False) -> None:
|
||||
resolved_path = self._resolve_path(path)
|
||||
if self._is_read_only_path(resolved_path):
|
||||
raise OSError(errno.EROFS, "Read-only file system", path)
|
||||
try:
|
||||
dir_path = os.path.dirname(resolved_path)
|
||||
if dir_path:
|
||||
@@ -259,8 +300,43 @@ class LocalSandbox(Sandbox):
|
||||
# Re-raise with the original path for clearer error messages, hiding internal resolved paths
|
||||
raise type(e)(e.errno, e.strerror, path) from None
|
||||
|
||||
def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
    """Find paths matching a glob pattern under *path*.

    *path* is translated with `_resolve_path` before the search, and every
    hit returned by `find_glob_matches` is translated back with
    `_reverse_resolve_path`.

    Args:
        path: Root directory to search, as given by the caller.
        pattern: Glob pattern forwarded to `find_glob_matches`.
        include_dirs: Forwarded to `find_glob_matches`.
        max_results: Forwarded to `find_glob_matches`.

    Returns:
        A `(matches, truncated)` pair; `truncated` is passed through
        unchanged from `find_glob_matches`.
    """
    search_root = Path(self._resolve_path(path))
    local_hits, was_truncated = find_glob_matches(
        search_root,
        pattern,
        include_dirs=include_dirs,
        max_results=max_results,
    )
    container_hits = [self._reverse_resolve_path(hit) for hit in local_hits]
    return container_hits, was_truncated
|
||||
|
||||
def grep(
    self,
    path: str,
    pattern: str,
    *,
    glob: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = 100,
) -> tuple[list[GrepMatch], bool]:
    """Search file contents under *path* and report matching lines.

    *path* is translated with `_resolve_path`, the search itself is
    delegated to `find_grep_matches`, and the `path` of every returned match
    is translated back with `_reverse_resolve_path`. Line numbers and line
    text are copied over unchanged.

    Args:
        path: Root directory to search, as given by the caller.
        pattern: Search pattern forwarded to `find_grep_matches`.
        glob: Optional file filter, forwarded as `glob_pattern`.
        literal: Forwarded to `find_grep_matches`.
        case_sensitive: Forwarded to `find_grep_matches`.
        max_results: Forwarded to `find_grep_matches`.

    Returns:
        A `(matches, truncated)` pair; `truncated` is passed through
        unchanged from `find_grep_matches`.
    """
    search_root = Path(self._resolve_path(path))
    local_hits, was_truncated = find_grep_matches(
        search_root,
        pattern,
        glob_pattern=glob,
        literal=literal,
        case_sensitive=case_sensitive,
        max_results=max_results,
    )

    container_hits = []
    for hit in local_hits:
        container_hits.append(
            GrepMatch(
                path=self._reverse_resolve_path(hit.path),
                line_number=hit.line_number,
                line=hit.line,
            )
        )
    return container_hits, was_truncated
|
||||
|
||||
def update_file(self, path: str, content: bytes) -> None:
|
||||
resolved_path = self._resolve_path(path)
|
||||
if self._is_read_only_path(resolved_path):
|
||||
raise OSError(errno.EROFS, "Read-only file system", path)
|
||||
try:
|
||||
dir_path = os.path.dirname(resolved_path)
|
||||
if dir_path:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from deerflow.sandbox.local.local_sandbox import LocalSandbox
|
||||
from deerflow.sandbox.local.local_sandbox import LocalSandbox, PathMapping
|
||||
from deerflow.sandbox.sandbox import Sandbox
|
||||
from deerflow.sandbox.sandbox_provider import SandboxProvider
|
||||
|
||||
@@ -14,16 +15,17 @@ class LocalSandboxProvider(SandboxProvider):
|
||||
"""Initialize the local sandbox provider with path mappings."""
|
||||
self._path_mappings = self._setup_path_mappings()
|
||||
|
||||
def _setup_path_mappings(self) -> dict[str, str]:
|
||||
def _setup_path_mappings(self) -> list[PathMapping]:
|
||||
"""
|
||||
Setup path mappings for local sandbox.
|
||||
|
||||
Maps container paths to actual local paths, including skills directory.
|
||||
Maps container paths to actual local paths, including skills directory
|
||||
and any custom mounts configured in config.yaml.
|
||||
|
||||
Returns:
|
||||
Dictionary of path mappings
|
||||
List of path mappings
|
||||
"""
|
||||
mappings = {}
|
||||
mappings: list[PathMapping] = []
|
||||
|
||||
# Map skills container path to local skills directory
|
||||
try:
|
||||
@@ -35,10 +37,63 @@ class LocalSandboxProvider(SandboxProvider):
|
||||
|
||||
# Only add mapping if skills directory exists
|
||||
if skills_path.exists():
|
||||
mappings[container_path] = str(skills_path)
|
||||
mappings.append(
|
||||
PathMapping(
|
||||
container_path=container_path,
|
||||
local_path=str(skills_path),
|
||||
read_only=True, # Skills directory is always read-only
|
||||
)
|
||||
)
|
||||
|
||||
# Map custom mounts from sandbox config
|
||||
_RESERVED_CONTAINER_PREFIXES = [container_path, "/mnt/acp-workspace", "/mnt/user-data"]
|
||||
sandbox_config = config.sandbox
|
||||
if sandbox_config and sandbox_config.mounts:
|
||||
for mount in sandbox_config.mounts:
|
||||
host_path = Path(mount.host_path)
|
||||
container_path = mount.container_path.rstrip("/") or "/"
|
||||
|
||||
if not host_path.is_absolute():
|
||||
logger.warning(
|
||||
"Mount host_path must be absolute, skipping: %s -> %s",
|
||||
mount.host_path,
|
||||
mount.container_path,
|
||||
)
|
||||
continue
|
||||
|
||||
if not container_path.startswith("/"):
|
||||
logger.warning(
|
||||
"Mount container_path must be absolute, skipping: %s -> %s",
|
||||
mount.host_path,
|
||||
mount.container_path,
|
||||
)
|
||||
continue
|
||||
|
||||
# Reject mounts that conflict with reserved container paths
|
||||
if any(container_path == p or container_path.startswith(p + "/") for p in _RESERVED_CONTAINER_PREFIXES):
|
||||
logger.warning(
|
||||
"Mount container_path conflicts with reserved prefix, skipping: %s",
|
||||
mount.container_path,
|
||||
)
|
||||
continue
|
||||
# Ensure the host path exists before adding mapping
|
||||
if host_path.exists():
|
||||
mappings.append(
|
||||
PathMapping(
|
||||
container_path=container_path,
|
||||
local_path=str(host_path.resolve()),
|
||||
read_only=mount.read_only,
|
||||
)
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"Mount host_path does not exist, skipping: %s -> %s",
|
||||
mount.host_path,
|
||||
mount.container_path,
|
||||
)
|
||||
except Exception as e:
|
||||
# Log but don't fail if config loading fails
|
||||
logger.warning("Could not setup skills path mapping: %s", e, exc_info=True)
|
||||
logger.warning("Could not setup path mappings: %s", e, exc_info=True)
|
||||
|
||||
return mappings
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from deerflow.sandbox.search import GrepMatch
|
||||
|
||||
|
||||
class Sandbox(ABC):
|
||||
"""Abstract base class for sandbox environments"""
|
||||
@@ -61,6 +63,25 @@ class Sandbox(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
def glob(self, path: str, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
    """Locate paths under a root directory that match a glob pattern.

    Abstract: this base declaration has no behavior of its own. The
    signature declares a `(matches, truncated)` tuple as the return type.
    """
    ...
|
||||
|
||||
@abstractmethod
def grep(
    self,
    path: str,
    pattern: str,
    *,
    glob: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = 100,
) -> tuple[list[GrepMatch], bool]:
    """Look for matches inside text files below a directory.

    Abstract: this base declaration has no behavior of its own. The
    signature declares a `(matches, truncated)` tuple as the return type.
    """
    ...
|
||||
|
||||
@abstractmethod
|
||||
def update_file(self, path: str, content: bytes) -> None:
|
||||
"""Update a file with binary content.
|
||||
|
||||
@@ -0,0 +1,210 @@
|
||||
import fnmatch
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path, PurePosixPath
|
||||
|
||||
IGNORE_PATTERNS = [
|
||||
".git",
|
||||
".svn",
|
||||
".hg",
|
||||
".bzr",
|
||||
"node_modules",
|
||||
"__pycache__",
|
||||
".venv",
|
||||
"venv",
|
||||
".env",
|
||||
"env",
|
||||
".tox",
|
||||
".nox",
|
||||
".eggs",
|
||||
"*.egg-info",
|
||||
"site-packages",
|
||||
"dist",
|
||||
"build",
|
||||
".next",
|
||||
".nuxt",
|
||||
".output",
|
||||
".turbo",
|
||||
"target",
|
||||
"out",
|
||||
".idea",
|
||||
".vscode",
|
||||
"*.swp",
|
||||
"*.swo",
|
||||
"*~",
|
||||
".project",
|
||||
".classpath",
|
||||
".settings",
|
||||
".DS_Store",
|
||||
"Thumbs.db",
|
||||
"desktop.ini",
|
||||
"*.lnk",
|
||||
"*.log",
|
||||
"*.tmp",
|
||||
"*.temp",
|
||||
"*.bak",
|
||||
"*.cache",
|
||||
".cache",
|
||||
"logs",
|
||||
".coverage",
|
||||
"coverage",
|
||||
".nyc_output",
|
||||
"htmlcov",
|
||||
".pytest_cache",
|
||||
".mypy_cache",
|
||||
".ruff_cache",
|
||||
]
|
||||
|
||||
DEFAULT_MAX_FILE_SIZE_BYTES = 1_000_000
|
||||
DEFAULT_LINE_SUMMARY_LENGTH = 200
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class GrepMatch:
    """Immutable record describing one matching line found by a search."""

    # Path of the file that contains the match.
    path: str
    # Line number of the match within that file.
    line_number: int
    # Text of the matching line.
    line: str
|
||||
|
||||
|
||||
def should_ignore_name(name: str) -> bool:
    """Tell whether a single file or directory name should be skipped.

    Args:
        name: One path component (not a full path).

    Returns:
        True when *name* matches at least one `fnmatch` pattern listed in
        `IGNORE_PATTERNS`, otherwise False.
    """
    return any(fnmatch.fnmatch(name, candidate) for candidate in IGNORE_PATTERNS)
|
||||
|
||||
|
||||
def should_ignore_path(path: str) -> bool:
    """Tell whether any component of *path* is an ignored name.

    Backslashes are treated as separators too, so both `a/b` and `a\\b`
    style paths are split into components. Empty components (from leading,
    trailing, or doubled separators) are skipped.

    Args:
        path: A path string using `/` and/or `\\` as separators.

    Returns:
        True as soon as one component satisfies `should_ignore_name`.
    """
    segments = path.replace("\\", "/").split("/")
    for segment in segments:
        if segment and should_ignore_name(segment):
            return True
    return False
|
||||
|
||||
|
||||
def path_matches(pattern: str, rel_path: str) -> bool:
    """Test a POSIX-style relative path against a glob pattern.

    The path is matched with `PurePosixPath.match`. If that fails and the
    pattern begins with `**/`, the pattern is retried with that prefix
    removed, so that `**/*.py` also accepts a file sitting directly at the
    top level.

    Args:
        pattern: Glob pattern.
        rel_path: Path to test, using `/` separators.

    Returns:
        True when either attempt matches.
    """
    candidate = PurePosixPath(rel_path)
    alternatives = [pattern]
    if pattern.startswith("**/"):
        alternatives.append(pattern[3:])
    # any() stops at the first hit, so the fallback is only tried on a miss.
    return any(candidate.match(alternative) for alternative in alternatives)
|
||||
|
||||
|
||||
def truncate_line(line: str, max_chars: int = DEFAULT_LINE_SUMMARY_LENGTH) -> str:
    """Shorten a line for display, marking the cut with an ellipsis.

    Trailing newline and carriage-return characters are always stripped
    first. A line that then fits within *max_chars* is returned as is;
    otherwise it is cut so that the result, including a trailing `...`, is
    exactly *max_chars* long.

    When *max_chars* is below 3 the ellipsis itself does not fit, so the line
    is hard-cut to *max_chars* characters (empty for zero or negative
    limits). The slice bound `max_chars - 3` would otherwise be negative and
    give a result longer than the limit.

    Args:
        line: Text to shorten.
        max_chars: Maximum length of the returned string.

    Returns:
        The stripped line, shortened if necessary; never longer than
        `max(max_chars, 0)` characters.
    """
    line = line.rstrip("\n\r")
    if len(line) <= max_chars:
        return line
    if max_chars < 3:
        # Not enough room for "...": fall back to a plain cut.
        return line[: max(max_chars, 0)]
    return line[: max_chars - 3] + "..."
|
||||
|
||||
|
||||
def is_binary_file(path: Path, sample_size: int = 8192) -> bool:
    """Guess whether a file is binary by looking for a NUL byte.

    Only the first *sample_size* bytes are inspected.

    Args:
        path: File to inspect.
        sample_size: Number of leading bytes to read.

    Returns:
        True when the sample contains a NUL byte, or when the file cannot be
        opened or read (`OSError`), so unreadable files are treated as
        binary. False otherwise.
    """
    try:
        with path.open("rb") as stream:
            sample = stream.read(sample_size)
    except OSError:
        return True
    return b"\0" in sample
|
||||
|
||||
|
||||
def find_glob_matches(root: Path, pattern: str, *, include_dirs: bool = False, max_results: int = 200) -> tuple[list[str], bool]:
    """Walk ``root`` and collect paths whose root-relative form matches ``pattern``.

    Ignored directories (per ``should_ignore_name``) are pruned from the walk
    and ignored file names are skipped. Within each visited directory,
    sub-directories are considered before files.

    Args:
        root: Directory to search; it is resolved before walking.
        pattern: Glob pattern tested with ``path_matches`` against each
            entry's POSIX-style path relative to ``root``.
        include_dirs: When True, matching directories are returned as well.
        max_results: The walk stops as soon as this many matches are collected.

    Returns:
        ``(matches, truncated)`` where ``matches`` are path strings built
        under the resolved root and ``truncated`` is True when the walk
        stopped because ``max_results`` was reached.

    Raises:
        FileNotFoundError: ``root`` does not exist.
        NotADirectoryError: ``root`` is not a directory.
    """
    base = root.resolve()
    if not base.exists():
        raise FileNotFoundError(base)
    if not base.is_dir():
        raise NotADirectoryError(base)

    found: list[str] = []
    for walk_dir, subdirs, filenames in os.walk(base):
        # Prune in place so os.walk never descends into ignored directories.
        subdirs[:] = [entry for entry in subdirs if not should_ignore_name(entry)]
        here = Path(walk_dir)
        # base is already resolved and os.walk builds walk_dir by joining under
        # it, so relative_to() works without an extra resolve() per directory.
        rel_here = here.relative_to(base)

        candidates = list(subdirs) if include_dirs else []
        candidates.extend(entry for entry in filenames if not should_ignore_name(entry))

        for entry in candidates:
            if not path_matches(pattern, (rel_here / entry).as_posix()):
                continue
            found.append(str(here / entry))
            if len(found) >= max_results:
                return found, True

    return found, False
|
||||
|
||||
|
||||
def find_grep_matches(
    root: Path,
    pattern: str,
    *,
    glob_pattern: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = 100,
    max_file_size: int = DEFAULT_MAX_FILE_SIZE_BYTES,
    line_summary_length: int = DEFAULT_LINE_SUMMARY_LENGTH,
) -> tuple[list[GrepMatch], bool]:
    """Search text files under ``root`` for lines matching ``pattern``.

    Ignored directories and file names are skipped, as are symlinks, files
    that resolve outside ``root``, files larger than ``max_file_size`` bytes,
    files that ``is_binary_file`` reports as binary, and files that raise
    ``OSError`` while being inspected or read.

    Args:
        root: Directory to search; it is resolved before walking.
        pattern: Regular expression, or plain text when ``literal`` is True.
        glob_pattern: Optional glob that a file's root-relative POSIX path
            must satisfy (via ``path_matches``) to be searched.
        literal: Escape ``pattern`` so it is matched verbatim.
        case_sensitive: Match case-sensitively; the default ignores case.
        max_results: The search stops once this many matches are collected.
        max_file_size: Size limit in bytes above which a file is skipped.
        line_summary_length: Maximum length of each reported line; lines
            longer than ten times this value are not searched at all.

    Returns:
        ``(matches, truncated)`` where ``truncated`` is True when the search
        stopped because ``max_results`` was reached.

    Raises:
        FileNotFoundError: ``root`` does not exist.
        NotADirectoryError: ``root`` is not a directory.
        re.error: ``pattern`` is not a valid regular expression.
    """
    base = root.resolve()
    if not base.exists():
        raise FileNotFoundError(base)
    if not base.is_dir():
        raise NotADirectoryError(base)

    compiled = re.compile(
        re.escape(pattern) if literal else pattern,
        0 if case_sensitive else re.IGNORECASE,
    )
    # Skip lines longer than this to prevent ReDoS on minified / no-newline files.
    line_cap = line_summary_length * 10
    hits: list[GrepMatch] = []

    for walk_dir, subdirs, filenames in os.walk(base):
        # Prune in place so os.walk never descends into ignored directories.
        subdirs[:] = [entry for entry in subdirs if not should_ignore_name(entry)]
        here = Path(walk_dir)
        rel_here = here.relative_to(base)

        for filename in filenames:
            if should_ignore_name(filename):
                continue
            if glob_pattern is not None and not path_matches(glob_pattern, (rel_here / filename).as_posix()):
                continue

            candidate = here / filename
            try:
                if candidate.is_symlink():
                    continue
                resolved = candidate.resolve()
                searchable = (
                    resolved.is_relative_to(base)
                    and resolved.stat().st_size <= max_file_size
                    and not is_binary_file(resolved)
                )
                if not searchable:
                    continue
                with resolved.open(encoding="utf-8", errors="replace") as stream:
                    for number, text in enumerate(stream, start=1):
                        if len(text) > line_cap or not compiled.search(text):
                            continue
                        hits.append(
                            GrepMatch(
                                path=str(resolved),
                                line_number=number,
                                line=truncate_line(text, line_summary_length),
                            )
                        )
                        if len(hits) >= max_results:
                            return hits, True
            except OSError:
                # Files that cannot be inspected or read are skipped.
                continue

    return hits, False
|
||||
@@ -7,6 +7,7 @@ from langchain.tools import ToolRuntime, tool
|
||||
from langgraph.typing import ContextT
|
||||
|
||||
from deerflow.agents.thread_state import ThreadDataState, ThreadState
|
||||
from deerflow.config import get_app_config
|
||||
from deerflow.config.paths import VIRTUAL_PATH_PREFIX
|
||||
from deerflow.sandbox.exceptions import (
|
||||
SandboxError,
|
||||
@@ -16,6 +17,7 @@ from deerflow.sandbox.exceptions import (
|
||||
from deerflow.sandbox.file_operation_lock import get_file_operation_lock
|
||||
from deerflow.sandbox.sandbox import Sandbox
|
||||
from deerflow.sandbox.sandbox_provider import get_sandbox_provider
|
||||
from deerflow.sandbox.search import GrepMatch
|
||||
from deerflow.sandbox.security import LOCAL_HOST_BASH_DISABLED_MESSAGE, is_host_bash_allowed
|
||||
|
||||
_ABSOLUTE_PATH_PATTERN = re.compile(r"(?<![:\w])(?<!:/)/(?:[^\s\"'`;&|<>()]+)")
|
||||
@@ -31,6 +33,10 @@ _LOCAL_BASH_SYSTEM_PATH_PREFIXES = (
|
||||
|
||||
_DEFAULT_SKILLS_CONTAINER_PATH = "/mnt/skills"
|
||||
_ACP_WORKSPACE_VIRTUAL_PATH = "/mnt/acp-workspace"
|
||||
# Default and upper bound for the ``max_results`` argument of the glob tool.
_DEFAULT_GLOB_MAX_RESULTS = 200
_MAX_GLOB_MAX_RESULTS = 1000
# Default and upper bound for the ``max_results`` argument of the grep tool.
_DEFAULT_GREP_MAX_RESULTS = 100
_MAX_GREP_MAX_RESULTS = 500
|
||||
|
||||
|
||||
def _get_skills_container_path() -> str:
|
||||
@@ -113,6 +119,54 @@ def _is_acp_workspace_path(path: str) -> bool:
|
||||
return path == _ACP_WORKSPACE_VIRTUAL_PATH or path.startswith(f"{_ACP_WORKSPACE_VIRTUAL_PATH}/")
|
||||
|
||||
|
||||
def _get_custom_mounts():
    """Return the sandbox's custom volume mounts whose host path exists.

    The list is memoised on the function object after the first successful
    config load. If loading the config (or inspecting the mounts) raises,
    an empty list is returned and nothing is memoised, so a later call can
    retry once the config is available.
    """
    memo = getattr(_get_custom_mounts, "_cached", None)
    if memo is not None:
        return memo

    try:
        from pathlib import Path

        from deerflow.config import get_app_config

        config = get_app_config()
        declared = config.sandbox.mounts if config.sandbox and config.sandbox.mounts else []
        # Only keep mounts whose host_path exists, consistent with
        # LocalSandboxProvider._setup_path_mappings() which also filters
        # by host_path.exists().
        usable = [mount for mount in declared if Path(mount.host_path).exists()]
    except Exception:
        # Do not memoise failures; the next call retries the config load.
        return []

    _get_custom_mounts._cached = usable  # type: ignore[attr-defined]
    return usable
|
||||
|
||||
|
||||
def _is_custom_mount_path(path: str) -> bool:
    """Return True if ``path`` equals, or lies below, a custom mount's container_path."""
    return any(
        path == mount.container_path or path.startswith(f"{mount.container_path}/")
        for mount in _get_custom_mounts()
    )
|
||||
|
||||
|
||||
def _get_custom_mount_for_path(path: str):
    """Return the custom mount with the longest container_path covering ``path``.

    Returns None when no configured mount covers the path. When several
    mounts tie on length, the first one in configuration order wins.
    """
    covering = [
        mount
        for mount in _get_custom_mounts()
        if path == mount.container_path or path.startswith(f"{mount.container_path}/")
    ]
    # max() keeps the first maximal element, matching a strict ">" scan.
    return max(covering, key=lambda mount: len(mount.container_path), default=None)
|
||||
|
||||
|
||||
def _extract_thread_id_from_thread_data(thread_data: "ThreadDataState | None") -> str | None:
|
||||
"""Extract thread_id from thread_data by inspecting workspace_path.
|
||||
|
||||
@@ -245,16 +299,84 @@ def _get_mcp_allowed_paths() -> list[str]:
|
||||
return allowed_paths
|
||||
|
||||
|
||||
def _get_tool_config_int(name: str, key: str, default: int) -> int:
    """Read an integer setting from a tool's extra configuration.

    Args:
        name: Tool name passed to ``get_app_config().get_tool_config``.
        key: Key to look up in the tool config's ``model_extra`` mapping.
        default: Value returned when the setting is missing, is not an
            integer, or the configuration cannot be read.

    Returns:
        The configured integer, or ``default``.
    """
    try:
        tool_config = get_app_config().get_tool_config(name)
        if tool_config is not None and key in tool_config.model_extra:
            value = tool_config.model_extra.get(key)
            # bool is a subclass of int; without the explicit exclusion a
            # setting such as ``max_results: true`` would be accepted as 1.
            if isinstance(value, int) and not isinstance(value, bool):
                return value
    except Exception:
        # Deliberately best-effort: any failure while reading the
        # configuration falls back to the default.
        pass
    return default
|
||||
|
||||
|
||||
def _clamp_max_results(value: int, *, default: int, upper_bound: int) -> int:
    """Normalise a result limit.

    Args:
        value: The limit to normalise.
        default: Returned when ``value`` is zero or negative.
        upper_bound: Largest value that may be returned for a positive ``value``.

    Returns:
        ``default`` for non-positive input, otherwise ``value`` capped at
        ``upper_bound``.
    """
    return min(value, upper_bound) if value > 0 else default
|
||||
|
||||
|
||||
def _resolve_max_results(name: str, requested: int, *, default: int, upper_bound: int) -> int:
    """Combine the caller's requested limit with the tool's configured limit.

    Both the requested value and the ``max_results`` setting from the tool's
    configuration are normalised with ``_clamp_max_results``; the smaller of
    the two is returned.

    Args:
        name: Tool name used to look up the configured ``max_results``.
        requested: Limit supplied by the caller.
        default: Fallback for non-positive or missing values.
        upper_bound: Hard cap applied to both values.

    Returns:
        The effective result limit.
    """
    bounds = {"default": default, "upper_bound": upper_bound}
    from_caller = _clamp_max_results(requested, **bounds)
    from_config = _clamp_max_results(_get_tool_config_int(name, "max_results", default), **bounds)
    return min(from_caller, from_config)
|
||||
|
||||
|
||||
def _resolve_local_read_path(path: str, thread_data: ThreadDataState) -> str:
    """Validate a virtual path for read access and map it for the local sandbox.

    The path is first checked with ``validate_local_tool_path`` in read-only
    mode, then resolved according to the area it belongs to: skills,
    ACP workspace, custom mount, or user data.

    Args:
        path: Virtual path supplied by the tool caller.
        thread_data: Thread data used to resolve user-data and ACP paths.

    Returns:
        The resolved path; custom mount paths are returned unchanged.

    Raises:
        PermissionError: Propagated from validation when the path is not allowed.
    """
    validate_local_tool_path(path, thread_data, read_only=True)
    if _is_skills_path(path):
        return _resolve_skills_path(path)
    if _is_acp_workspace_path(path):
        return _resolve_acp_workspace_path(path, _extract_thread_id_from_thread_data(thread_data))
    if _is_custom_mount_path(path):
        # Same handling as ls/read_file: custom mount paths are not user-data
        # paths and are left for LocalSandbox to resolve.
        # NOTE(review): assumes sandbox.glob()/grep() resolve custom mount
        # container paths the same way list_dir()/read_file() do — confirm.
        return path
    return _resolve_and_validate_user_data_path(path, thread_data)
|
||||
|
||||
|
||||
def _format_glob_results(root_path: str, matches: list[str], truncated: bool) -> str:
    """Render glob matches as a numbered, newline-separated report.

    Args:
        root_path: The root path to mention in the report.
        matches: Matched paths, listed in the given order and numbered from 1.
        truncated: Whether the result list was cut short; adds a note to the
            header and a trailing hint line.

    Returns:
        The formatted report, or a "no files matched" message for no matches.
    """
    if not matches:
        return f"No files matched under {root_path}"

    count = len(matches)
    header = f"Found {count} paths under {root_path}"
    if truncated:
        header += f" (showing first {count})"

    numbered = [f"{position}. {entry}" for position, entry in enumerate(matches, start=1)]
    footer = ["Results truncated. Narrow the path or pattern to see fewer matches."] if truncated else []
    return "\n".join([header, *numbered, *footer])
|
||||
|
||||
|
||||
def _format_grep_results(root_path: str, matches: list[GrepMatch], truncated: bool) -> str:
    """Render grep matches as ``path:line_number: line`` rows under a header.

    Args:
        root_path: The root path to mention in the report.
        matches: Matches to list, in the given order.
        truncated: Whether the result list was cut short; adds a note to the
            header and a trailing hint line.

    Returns:
        The formatted report, or a "no matches" message for no matches.
    """
    if not matches:
        return f"No matches found under {root_path}"

    count = len(matches)
    header = f"Found {count} matches under {root_path}"
    if truncated:
        header += f" (showing first {count})"

    rows = [f"{hit.path}:{hit.line_number}: {hit.line}" for hit in matches]
    footer = ["Results truncated. Narrow the path or add a glob filter."] if truncated else []
    return "\n".join([header, *rows, *footer])
|
||||
|
||||
|
||||
def _path_variants(path: str) -> set[str]:
    """Return ``path`` together with its all-forward-slash and all-backslash spellings."""
    forward = path.replace("\\", "/")
    backward = path.replace("/", "\\")
    return {path, forward, backward}
|
||||
|
||||
|
||||
def _path_separator_for_style(path: str) -> str:
    """Pick the separator matching the style of ``path``.

    A backslash is returned only when the path contains backslashes and no
    forward slashes; every other case (including no separators) yields ``/``.
    """
    if "\\" in path and "/" not in path:
        return "\\"
    return "/"
|
||||
|
||||
|
||||
def _join_path_preserving_style(base: str, relative: str) -> str:
|
||||
if not relative:
|
||||
return base
|
||||
if "/" in base and "\\" not in base:
|
||||
return f"{base.rstrip('/')}/{relative}"
|
||||
return str(Path(base) / relative)
|
||||
separator = _path_separator_for_style(base)
|
||||
normalized_relative = relative.replace("\\" if separator == "/" else "/", separator).lstrip("/\\")
|
||||
stripped_base = base.rstrip("/\\")
|
||||
return f"{stripped_base}{separator}{normalized_relative}"
|
||||
|
||||
|
||||
def _sanitize_error(error: Exception, runtime: "ToolRuntime[ContextT, ThreadState] | None" = None) -> str:
|
||||
@@ -299,7 +421,10 @@ def replace_virtual_path(path: str, thread_data: ThreadDataState | None) -> str:
|
||||
return actual_base
|
||||
if path.startswith(f"{virtual_base}/"):
|
||||
rest = path[len(virtual_base) :].lstrip("/")
|
||||
return _join_path_preserving_style(actual_base, rest)
|
||||
result = _join_path_preserving_style(actual_base, rest)
|
||||
if path.endswith("/") and not result.endswith(("/", "\\")):
|
||||
result += _path_separator_for_style(actual_base)
|
||||
return result
|
||||
|
||||
return path
|
||||
|
||||
@@ -379,6 +504,8 @@ def mask_local_paths_in_output(output: str, thread_data: ThreadDataState | None)
|
||||
|
||||
result = pattern.sub(replace_acp, result)
|
||||
|
||||
# Custom mount host paths are masked by LocalSandbox._reverse_resolve_paths_in_output()
|
||||
|
||||
# Mask user-data host paths
|
||||
if thread_data is None:
|
||||
return result
|
||||
@@ -427,6 +554,7 @@ def validate_local_tool_path(path: str, thread_data: ThreadDataState | None, *,
|
||||
- ``/mnt/user-data/*`` — always allowed (read + write)
|
||||
- ``/mnt/skills/*`` — allowed only when *read_only* is True
|
||||
- ``/mnt/acp-workspace/*`` — allowed only when *read_only* is True
|
||||
- Custom mount paths (from config.yaml) — respects per-mount ``read_only`` flag
|
||||
|
||||
Args:
|
||||
path: The virtual path to validate.
|
||||
@@ -458,7 +586,14 @@ def validate_local_tool_path(path: str, thread_data: ThreadDataState | None, *,
|
||||
if path.startswith(f"{VIRTUAL_PATH_PREFIX}/"):
|
||||
return
|
||||
|
||||
raise PermissionError(f"Only paths under {VIRTUAL_PATH_PREFIX}/, {_get_skills_container_path()}/, or {_ACP_WORKSPACE_VIRTUAL_PATH}/ are allowed")
|
||||
# Custom mount paths — respect read_only config
|
||||
if _is_custom_mount_path(path):
|
||||
mount = _get_custom_mount_for_path(path)
|
||||
if mount and mount.read_only and not read_only:
|
||||
raise PermissionError(f"Write access to read-only mount is not allowed: {path}")
|
||||
return
|
||||
|
||||
raise PermissionError(f"Only paths under {VIRTUAL_PATH_PREFIX}/, {_get_skills_container_path()}/, {_ACP_WORKSPACE_VIRTUAL_PATH}/, or configured mount paths are allowed")
|
||||
|
||||
|
||||
def _validate_resolved_user_data_path(resolved: Path, thread_data: ThreadDataState) -> None:
|
||||
@@ -508,9 +643,10 @@ def validate_local_bash_command_paths(command: str, thread_data: ThreadDataState
|
||||
boundary and must not be treated as isolation from the host filesystem.
|
||||
|
||||
In local mode, commands must use virtual paths under /mnt/user-data for
|
||||
user data access. Skills paths under /mnt/skills and ACP workspace paths
|
||||
under /mnt/acp-workspace are allowed (path-traversal checks only; write
|
||||
prevention for bash commands is not enforced here).
|
||||
user data access. Skills paths under /mnt/skills, ACP workspace paths
|
||||
under /mnt/acp-workspace, and custom mount container paths (configured in
|
||||
config.yaml) are allowed (path-traversal checks only; write prevention
|
||||
for bash commands is not enforced here).
|
||||
A small allowlist of common system path prefixes is kept for executable
|
||||
and device references (e.g. /bin/sh, /dev/null).
|
||||
"""
|
||||
@@ -545,6 +681,11 @@ def validate_local_bash_command_paths(command: str, thread_data: ThreadDataState
|
||||
_reject_path_traversal(absolute_path)
|
||||
continue
|
||||
|
||||
# Allow custom mount container paths
|
||||
if _is_custom_mount_path(absolute_path):
|
||||
_reject_path_traversal(absolute_path)
|
||||
continue
|
||||
|
||||
if any(absolute_path == prefix.rstrip("/") or absolute_path.startswith(prefix) for prefix in _LOCAL_BASH_SYSTEM_PATH_PREFIXES):
|
||||
continue
|
||||
|
||||
@@ -589,6 +730,8 @@ def replace_virtual_paths_in_command(command: str, thread_data: ThreadDataState
|
||||
|
||||
result = acp_pattern.sub(replace_acp_match, result)
|
||||
|
||||
# Custom mount paths are resolved by LocalSandbox._resolve_paths_in_command()
|
||||
|
||||
# Replace user-data paths
|
||||
if VIRTUAL_PATH_PREFIX in result and thread_data is not None:
|
||||
pattern = re.compile(rf"{re.escape(VIRTUAL_PATH_PREFIX)}(/[^\s\"';&|<>()]*)?")
|
||||
@@ -666,7 +809,8 @@ def sandbox_from_runtime(runtime: ToolRuntime[ContextT, ThreadState] | None = No
|
||||
if sandbox is None:
|
||||
raise SandboxNotFoundError(f"Sandbox with ID '{sandbox_id}' not found", sandbox_id=sandbox_id)
|
||||
|
||||
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for downstream use
|
||||
if runtime.context is not None:
|
||||
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for downstream use
|
||||
return sandbox
|
||||
|
||||
|
||||
@@ -701,7 +845,8 @@ def ensure_sandbox_initialized(runtime: ToolRuntime[ContextT, ThreadState] | Non
|
||||
if sandbox_id is not None:
|
||||
sandbox = get_sandbox_provider().get(sandbox_id)
|
||||
if sandbox is not None:
|
||||
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
|
||||
if runtime.context is not None:
|
||||
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
|
||||
return sandbox
|
||||
# Sandbox was released, fall through to acquire new one
|
||||
|
||||
@@ -723,7 +868,8 @@ def ensure_sandbox_initialized(runtime: ToolRuntime[ContextT, ThreadState] | Non
|
||||
if sandbox is None:
|
||||
raise SandboxNotFoundError("Sandbox not found after acquisition", sandbox_id=sandbox_id)
|
||||
|
||||
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
|
||||
if runtime.context is not None:
|
||||
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
|
||||
return sandbox
|
||||
|
||||
|
||||
@@ -885,8 +1031,9 @@ def ls_tool(runtime: ToolRuntime[ContextT, ThreadState], description: str, path:
|
||||
path = _resolve_skills_path(path)
|
||||
elif _is_acp_workspace_path(path):
|
||||
path = _resolve_acp_workspace_path(path, _extract_thread_id_from_thread_data(thread_data))
|
||||
else:
|
||||
elif not _is_custom_mount_path(path):
|
||||
path = _resolve_and_validate_user_data_path(path, thread_data)
|
||||
# Custom mount paths are resolved by LocalSandbox._resolve_path()
|
||||
children = sandbox.list_dir(path)
|
||||
if not children:
|
||||
return "(empty)"
|
||||
@@ -901,6 +1048,126 @@ def ls_tool(runtime: ToolRuntime[ContextT, ThreadState], description: str, path:
|
||||
return f"Error: Unexpected error listing directory: {_sanitize_error(e, runtime)}"
|
||||
|
||||
|
||||
@tool("glob", parse_docstring=True)
def glob_tool(
    runtime: ToolRuntime[ContextT, ThreadState],
    description: str,
    pattern: str,
    path: str,
    include_dirs: bool = False,
    max_results: int = _DEFAULT_GLOB_MAX_RESULTS,
) -> str:
    """Find files or directories that match a glob pattern under a root directory.

    Args:
        description: Explain why you are searching for these paths in short words. ALWAYS PROVIDE THIS PARAMETER FIRST.
        pattern: The glob pattern to match relative to the root path, for example `**/*.py`.
        path: The **absolute** root directory to search under.
        include_dirs: Whether matching directories should also be returned. Default is False.
        max_results: Maximum number of paths to return. Default is 200.
    """
    # Remember the caller-supplied path before anything can raise: the error
    # handlers below report it, and `path` itself is rebound to the resolved
    # location for local sandboxes. Binding it inside the try block (after the
    # setup calls) would leave it undefined if one of those calls raised.
    requested_path = path
    try:
        sandbox = ensure_sandbox_initialized(runtime)
        ensure_thread_directories_exist(runtime)
        # Combine the requested limit with the configured one, within bounds.
        effective_max_results = _resolve_max_results(
            "glob",
            max_results,
            default=_DEFAULT_GLOB_MAX_RESULTS,
            upper_bound=_MAX_GLOB_MAX_RESULTS,
        )
        thread_data = None
        if is_local_sandbox(runtime):
            thread_data = get_thread_data(runtime)
            if thread_data is None:
                raise SandboxRuntimeError("Thread data not available for local sandbox")
            path = _resolve_local_read_path(path, thread_data)
        matches, truncated = sandbox.glob(path, pattern, include_dirs=include_dirs, max_results=effective_max_results)
        if thread_data is not None:
            # Local sandbox results pass through the path-masking helper
            # before being shown to the caller.
            matches = [mask_local_paths_in_output(match, thread_data) for match in matches]
        return _format_glob_results(requested_path, matches, truncated)
    except SandboxError as e:
        return f"Error: {e}"
    except FileNotFoundError:
        return f"Error: Directory not found: {requested_path}"
    except NotADirectoryError:
        return f"Error: Path is not a directory: {requested_path}"
    except PermissionError:
        return f"Error: Permission denied: {requested_path}"
    except Exception as e:
        return f"Error: Unexpected error searching paths: {_sanitize_error(e, runtime)}"
|
||||
|
||||
|
||||
@tool("grep", parse_docstring=True)
def grep_tool(
    runtime: ToolRuntime[ContextT, ThreadState],
    description: str,
    pattern: str,
    path: str,
    glob: str | None = None,
    literal: bool = False,
    case_sensitive: bool = False,
    max_results: int = _DEFAULT_GREP_MAX_RESULTS,
) -> str:
    """Search for matching lines inside text files under a root directory.

    Args:
        description: Explain why you are searching file contents in short words. ALWAYS PROVIDE THIS PARAMETER FIRST.
        pattern: The string or regex pattern to search for.
        path: The **absolute** root directory to search under.
        glob: Optional glob filter for candidate files, for example `**/*.py`.
        literal: Whether to treat `pattern` as a plain string. Default is False.
        case_sensitive: Whether matching is case-sensitive. Default is False.
        max_results: Maximum number of matching lines to return. Default is 100.
    """
    # Remember the caller-supplied path before anything can raise: the error
    # handlers below report it, and `path` itself is rebound to the resolved
    # location for local sandboxes. Binding it inside the try block (after the
    # setup calls) would leave it undefined if one of those calls raised.
    requested_path = path
    try:
        sandbox = ensure_sandbox_initialized(runtime)
        ensure_thread_directories_exist(runtime)
        # Combine the requested limit with the configured one, within bounds.
        effective_max_results = _resolve_max_results(
            "grep",
            max_results,
            default=_DEFAULT_GREP_MAX_RESULTS,
            upper_bound=_MAX_GREP_MAX_RESULTS,
        )
        thread_data = None
        if is_local_sandbox(runtime):
            thread_data = get_thread_data(runtime)
            if thread_data is None:
                raise SandboxRuntimeError("Thread data not available for local sandbox")
            path = _resolve_local_read_path(path, thread_data)
        matches, truncated = sandbox.grep(
            path,
            pattern,
            glob=glob,
            literal=literal,
            case_sensitive=case_sensitive,
            max_results=effective_max_results,
        )
        if thread_data is not None:
            # Only the path of each match goes through the path-masking
            # helper; the matched line text is passed on unchanged.
            matches = [
                GrepMatch(
                    path=mask_local_paths_in_output(match.path, thread_data),
                    line_number=match.line_number,
                    line=match.line,
                )
                for match in matches
            ]
        return _format_grep_results(requested_path, matches, truncated)
    except SandboxError as e:
        return f"Error: {e}"
    except FileNotFoundError:
        return f"Error: Directory not found: {requested_path}"
    except NotADirectoryError:
        return f"Error: Path is not a directory: {requested_path}"
    except re.error as e:
        return f"Error: Invalid regex pattern: {e}"
    except PermissionError:
        return f"Error: Permission denied: {requested_path}"
    except Exception as e:
        return f"Error: Unexpected error searching file contents: {_sanitize_error(e, runtime)}"
|
||||
|
||||
|
||||
@tool("read_file", parse_docstring=True)
|
||||
def read_file_tool(
|
||||
runtime: ToolRuntime[ContextT, ThreadState],
|
||||
@@ -928,8 +1195,9 @@ def read_file_tool(
|
||||
path = _resolve_skills_path(path)
|
||||
elif _is_acp_workspace_path(path):
|
||||
path = _resolve_acp_workspace_path(path, _extract_thread_id_from_thread_data(thread_data))
|
||||
else:
|
||||
elif not _is_custom_mount_path(path):
|
||||
path = _resolve_and_validate_user_data_path(path, thread_data)
|
||||
# Custom mount paths are resolved by LocalSandbox._resolve_path()
|
||||
content = sandbox.read_file(path)
|
||||
if not content:
|
||||
return "(empty)"
|
||||
@@ -977,7 +1245,9 @@ def write_file_tool(
|
||||
if is_local_sandbox(runtime):
|
||||
thread_data = get_thread_data(runtime)
|
||||
validate_local_tool_path(path, thread_data)
|
||||
path = _resolve_and_validate_user_data_path(path, thread_data)
|
||||
if not _is_custom_mount_path(path):
|
||||
path = _resolve_and_validate_user_data_path(path, thread_data)
|
||||
# Custom mount paths are resolved by LocalSandbox._resolve_path()
|
||||
with get_file_operation_lock(sandbox, path):
|
||||
sandbox.write_file(path, content, append)
|
||||
return "OK"
|
||||
@@ -1019,7 +1289,9 @@ def str_replace_tool(
|
||||
if is_local_sandbox(runtime):
|
||||
thread_data = get_thread_data(runtime)
|
||||
validate_local_tool_path(path, thread_data)
|
||||
path = _resolve_and_validate_user_data_path(path, thread_data)
|
||||
if not _is_custom_mount_path(path):
|
||||
path = _resolve_and_validate_user_data_path(path, thread_data)
|
||||
# Custom mount paths are resolved by LocalSandbox._resolve_path()
|
||||
with get_file_operation_lock(sandbox, path):
|
||||
content = sandbox.read_file(path)
|
||||
if not content:
|
||||
|
||||
@@ -43,5 +43,5 @@ You have access to the sandbox environment:
|
||||
tools=["bash", "ls", "read_file", "write_file", "str_replace"], # Sandbox tools only
|
||||
disallowed_tools=["task", "ask_clarification", "present_files"],
|
||||
model="inherit",
|
||||
max_turns=30,
|
||||
max_turns=60,
|
||||
)
|
||||
|
||||
@@ -44,5 +44,5 @@ You have access to the same sandbox environment as the parent agent:
|
||||
tools=None, # Inherit all tools from parent
|
||||
disallowed_tools=["task", "ask_clarification", "present_files"], # Prevent nesting and clarification
|
||||
model="inherit",
|
||||
max_turns=50,
|
||||
max_turns=100,
|
||||
)
|
||||
|
||||
@@ -28,9 +28,27 @@ def get_subagent_config(name: str) -> SubagentConfig | None:
|
||||
|
||||
app_config = get_subagents_app_config()
|
||||
effective_timeout = app_config.get_timeout_for(name)
|
||||
effective_max_turns = app_config.get_max_turns_for(name, config.max_turns)
|
||||
|
||||
overrides = {}
|
||||
if effective_timeout != config.timeout_seconds:
|
||||
logger.debug(f"Subagent '{name}': timeout overridden by config.yaml ({config.timeout_seconds}s -> {effective_timeout}s)")
|
||||
config = replace(config, timeout_seconds=effective_timeout)
|
||||
logger.debug(
|
||||
"Subagent '%s': timeout overridden by config.yaml (%ss -> %ss)",
|
||||
name,
|
||||
config.timeout_seconds,
|
||||
effective_timeout,
|
||||
)
|
||||
overrides["timeout_seconds"] = effective_timeout
|
||||
if effective_max_turns != config.max_turns:
|
||||
logger.debug(
|
||||
"Subagent '%s': max_turns overridden by config.yaml (%s -> %s)",
|
||||
name,
|
||||
config.max_turns,
|
||||
effective_max_turns,
|
||||
)
|
||||
overrides["max_turns"] = effective_max_turns
|
||||
if overrides:
|
||||
config = replace(config, **overrides)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
@@ -57,6 +57,42 @@ def _build_mcp_servers() -> dict[str, dict[str, Any]]:
|
||||
return build_servers_config(ExtensionsConfig.from_file())
|
||||
|
||||
|
||||
def _build_acp_mcp_servers() -> list[dict[str, Any]]:
    """Convert the enabled MCP servers into the ACP ``mcpServers`` list format.

    DeerFlow's MCP helper produces a name -> config mapping for the LangChain
    MCP adapter, whereas the ACP client's ``new_session`` expects a list of
    server objects. Each enabled server becomes one dict carrying its name,
    transport type and the transport-specific fields.

    Returns:
        One payload dict per enabled MCP server.

    Raises:
        ValueError: A stdio server has no ``command``, an http/sse server has
            no ``url``, or the transport type is not supported.
    """
    from deerflow.config.extensions_config import ExtensionsConfig

    def _as_name_value_list(mapping) -> list[dict[str, Any]]:
        # ACP represents env vars and headers as lists of name/value objects.
        return [{"name": key, "value": value} for key, value in mapping.items()]

    servers = ExtensionsConfig.from_file().get_enabled_mcp_servers()

    payloads: list[dict[str, Any]] = []
    for name, server in servers.items():
        transport = server.type or "stdio"

        if transport == "stdio":
            if not server.command:
                raise ValueError(f"MCP server '{name}' with stdio transport requires 'command' field")
            specifics = {
                "command": server.command,
                "args": server.args,
                "env": _as_name_value_list(server.env),
            }
        elif transport in ("http", "sse"):
            if not server.url:
                raise ValueError(f"MCP server '{name}' with {transport} transport requires 'url' field")
            specifics = {
                "url": server.url,
                "headers": _as_name_value_list(server.headers),
            }
        else:
            raise ValueError(f"MCP server '{name}' has unsupported transport type: {transport}")

        payloads.append({"name": name, "type": transport, **specifics})

    return payloads
|
||||
|
||||
|
||||
def _build_permission_response(options: list[Any], *, auto_approve: bool) -> Any:
|
||||
"""Build an ACP permission response.
|
||||
|
||||
@@ -173,7 +209,15 @@ def build_invoke_acp_agent_tool(agents: dict) -> BaseTool:
|
||||
cmd = agent_config.command
|
||||
args = agent_config.args or []
|
||||
physical_cwd = _get_work_dir(thread_id)
|
||||
mcp_servers = _build_mcp_servers()
|
||||
try:
|
||||
mcp_servers = _build_acp_mcp_servers()
|
||||
except ValueError as exc:
|
||||
logger.warning(
|
||||
"Invalid MCP server configuration for ACP agent '%s'; continuing without MCP servers: %s",
|
||||
agent,
|
||||
exc,
|
||||
)
|
||||
mcp_servers = []
|
||||
agent_env: dict[str, str] | None = None
|
||||
if agent_config.env:
|
||||
agent_env = {k: (os.environ.get(v[1:], "") if v.startswith("$") else v) for k, v in agent_config.env.items()}
|
||||
|
||||
@@ -1,10 +1,22 @@
|
||||
"""File conversion utilities.
|
||||
|
||||
Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown.
|
||||
Converts document files (PDF, PPT, Excel, Word) to Markdown.
|
||||
|
||||
PDF conversion strategy (auto mode):
|
||||
1. Try pymupdf4llm if installed — better heading detection, faster on most files.
|
||||
2. If output is suspiciously short (< _MIN_CHARS_PER_PAGE chars/page, or < 200 chars
|
||||
total when page count is unavailable), treat as image-based and fall back to MarkItDown.
|
||||
3. If pymupdf4llm is not installed, use MarkItDown directly (existing behaviour).
|
||||
|
||||
Large files (> ASYNC_THRESHOLD_BYTES) are converted in a thread pool via
|
||||
asyncio.to_thread() to avoid blocking the event loop (fixes #1569).
|
||||
|
||||
No FastAPI or HTTP dependencies — pure utility functions.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -20,28 +32,278 @@ CONVERTIBLE_EXTENSIONS = {
|
||||
".docx",
|
||||
}
|
||||
|
||||
# Files larger than this threshold are converted in a background thread.
|
||||
# Small files complete in < 1s synchronously; spawning a thread adds unnecessary
|
||||
# scheduling overhead for them.
|
||||
_ASYNC_THRESHOLD_BYTES = 1 * 1024 * 1024 # 1 MB
|
||||
|
||||
# If pymupdf4llm produces fewer characters *per page* than this threshold,
|
||||
# the PDF is likely image-based or encrypted — fall back to MarkItDown.
|
||||
# Rationale: normal text PDFs yield 200-2000 chars/page; image-based PDFs
|
||||
# yield close to 0. 50 chars/page gives a wide safety margin.
|
||||
# Falls back to absolute 200-char check when page count is unavailable.
|
||||
_MIN_CHARS_PER_PAGE = 50
|
||||
|
||||
|
||||
def _pymupdf_output_too_sparse(text: str, file_path: Path) -> bool:
    """Decide whether pymupdf4llm output is too short to be a real text extraction.

    The stripped text length is compared per page against
    ``_MIN_CHARS_PER_PAGE`` so that short and long documents are judged on
    the same scale. If the page count cannot be determined (pymupdf missing,
    file unreadable, or zero pages), an absolute 200-character threshold is
    used instead.

    Args:
        text: Markdown produced by pymupdf4llm.
        file_path: The PDF the text was extracted from.

    Returns:
        True if the output looks suspiciously sparse.
    """
    stripped_length = len(text.strip())
    page_count: int | None = None
    document = None
    try:
        import pymupdf

        document = pymupdf.open(str(file_path))
        page_count = len(document)
    except Exception:
        # Any failure here just means the page count stays unknown.
        page_count = None
    finally:
        if document is not None:
            try:
                document.close()
            except Exception:
                pass

    if page_count is None or page_count <= 0:
        # Fallback: absolute threshold when page count is unavailable
        return stripped_length < 200
    return (stripped_length / page_count) < _MIN_CHARS_PER_PAGE
|
||||
|
||||
|
||||
def _convert_pdf_with_pymupdf4llm(file_path: Path) -> str | None:
    """Convert a PDF to markdown with pymupdf4llm, if that is possible.

    Args:
        file_path: The PDF to convert.

    Returns:
        The markdown text, or None when pymupdf4llm is not installed or the
        conversion raises (the failure is logged with its traceback).
    """
    try:
        import pymupdf4llm
    except ImportError:
        return None

    try:
        markdown = pymupdf4llm.to_markdown(str(file_path))
    except Exception:
        logger.exception("pymupdf4llm failed to convert %s; falling back to MarkItDown", file_path.name)
        return None
    else:
        return markdown
|
||||
|
||||
|
||||
def _convert_with_markitdown(file_path: Path) -> str:
    """Run MarkItDown on a file and return the resulting Markdown text.

    Args:
        file_path: Path to the file to convert.

    Returns:
        The ``text_content`` of the MarkItDown conversion result.
    """
    from markitdown import MarkItDown

    return MarkItDown().convert(str(file_path)).text_content
|
||||
|
||||
|
||||
def _do_convert(file_path: Path, pdf_converter: str) -> str:
    """Run the blocking conversion; callable directly or via asyncio.to_thread.

    Args:
        file_path: Path to the file.
        pdf_converter: "auto" | "pymupdf4llm" | "markitdown"

    Returns:
        The converted Markdown text.
    """
    # pymupdf4llm is only attempted for PDFs, and never when MarkItDown was
    # requested explicitly.
    use_pymupdf = file_path.suffix.lower() == ".pdf" and pdf_converter != "markitdown"
    candidate = _convert_pdf_with_pymupdf4llm(file_path) if use_pymupdf else None

    if candidate is not None:
        # Explicit mode keeps the output whatever its length. Auto mode keeps
        # it only when the chars-per-page check says it is not a failed parse
        # (image-based PDFs yield close to nothing).
        if pdf_converter == "pymupdf4llm" or not _pymupdf_output_too_sparse(candidate, file_path):
            return candidate
        logger.warning(
            "pymupdf4llm produced only %d chars for %s (likely image-based PDF); falling back to MarkItDown",
            len(candidate.strip()),
            file_path.name,
        )

    # Not a PDF, pymupdf4llm unavailable/failed, or fallback triggered.
    return _convert_with_markitdown(file_path)
|
||||
|
||||
|
||||
async def convert_file_to_markdown(file_path: Path) -> Path | None:
    """Convert a supported document file to Markdown.

    PDF files are handled with a two-converter strategy (see module docstring).
    Large files (> 1 MB) are offloaded to a thread pool to avoid blocking the
    event loop.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path to the generated .md file, or None if conversion failed.
    """
    try:
        pdf_converter = _get_pdf_converter()
        file_size = file_path.stat().st_size

        # Convert exactly once: in a worker thread for large inputs, inline
        # for small ones where the thread hand-off is not worth it.
        if file_size > _ASYNC_THRESHOLD_BYTES:
            text = await asyncio.to_thread(_do_convert, file_path, pdf_converter)
        else:
            text = _do_convert(file_path, pdf_converter)

        # Save next to the source file, same name with a .md suffix.
        md_path = file_path.with_suffix(".md")
        md_path.write_text(text, encoding="utf-8")

        logger.info("Converted %s to markdown: %s (%d chars)", file_path.name, md_path.name, len(text))
        return md_path
    except Exception as e:
        # Conversion is best effort: report the failure and signal it with None.
        logger.error("Failed to convert %s to markdown: %s", file_path.name, e)
        return None
|
||||
|
||||
|
||||
# Regex for bold-only lines that look like section headings.
|
||||
# Targets SEC filing structural headings that pymupdf4llm renders as **bold**
|
||||
# rather than # Markdown headings (because they use same font size as body text,
|
||||
# distinguished only by bold+caps formatting).
|
||||
#
|
||||
# Pattern requires ALL of:
|
||||
# 1. Entire line is a single **...** block (no surrounding prose)
|
||||
# 2. Starts with a recognised structural keyword:
|
||||
# - ITEM / PART / SECTION (with optional number/letter after)
|
||||
# - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER
|
||||
# All-caps addresses, boilerplate ("CURRENT REPORT", "SIGNATURES",
|
||||
# "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded.
|
||||
#
|
||||
# Chinese headings (第三节...) are already captured as standard # headings
|
||||
# by pymupdf4llm, so they don't need this pattern.
|
||||
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")
|
||||
|
||||
# Regex for split-bold headings produced by pymupdf4llm when a heading spans
|
||||
# multiple text spans in the PDF (e.g. section number and title are separate spans).
|
||||
# Matches lines like: **1** **Introduction** or **3.2** **Multi-Head Attention**
|
||||
# Requirements:
|
||||
# 1. Entire line consists only of **...** blocks separated by whitespace (no prose)
|
||||
# 2. First block is a section number (digits and dots, e.g. "1", "3.2", "A.1")
|
||||
# 3. Second block must not be purely numeric/punctuation — excludes financial table
|
||||
# headers like **2023** **2022** **2021** while allowing non-ASCII titles such as
|
||||
# **1** **概述** or accented words (negative lookahead instead of [A-Za-z])
|
||||
# 4. At most two additional blocks (four total) with [^*]+ (no * inside) to keep
|
||||
# the regex linear and avoid ReDoS on attacker-controlled content
|
||||
_SPLIT_BOLD_HEADING_RE = re.compile(r"^\*\*[\dA-Z][\d\.]*\*\*\s+\*\*(?!\d[\d\s.,\-–—/:()%]*\*\*)[^*]+\*\*(?:\s+\*\*[^*]+\*\*){0,2}\s*$")
|
||||
|
||||
# Maximum number of outline entries injected into the agent context.
|
||||
# Keeps prompt size bounded even for very long documents.
|
||||
MAX_OUTLINE_ENTRIES = 50
|
||||
|
||||
_ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
|
||||
|
||||
|
||||
def _clean_bold_title(raw: str) -> str:
|
||||
"""Normalise a title string that may contain pymupdf4llm bold artefacts.
|
||||
|
||||
pymupdf4llm sometimes emits adjacent bold spans as ``**A** **B**`` instead
|
||||
of a single ``**A B**`` block. This helper merges those fragments and then
|
||||
strips the outermost ``**...**`` wrapper so the caller gets plain text.
|
||||
|
||||
Examples::
|
||||
|
||||
"**Overview**" → "Overview"
|
||||
"**UNITED STATES** **SECURITIES**" → "UNITED STATES SECURITIES"
|
||||
"plain text" → "plain text" (unchanged)
|
||||
"""
|
||||
# Merge adjacent bold spans: "** **" → " "
|
||||
merged = re.sub(r"\*\*\s*\*\*", " ", raw).strip()
|
||||
# Strip outermost **...** if the whole string is wrapped
|
||||
if m := re.fullmatch(r"\*\*(.+?)\*\*", merged, re.DOTALL):
|
||||
return m.group(1).strip()
|
||||
return merged
|
||||
|
||||
|
||||
def extract_outline(md_path: Path) -> list[dict]:
    """Extract document outline (headings) from a Markdown file.

    Recognises three heading styles produced by pymupdf4llm:

    1. Standard Markdown headings: lines starting with one or more '#'.
       Inline ``**...**`` wrappers and adjacent bold spans (``** **``) are
       cleaned so the title is plain text.

    2. Bold-only structural headings: ``**ITEM 1. BUSINESS**``, ``**PART II**``,
       etc. SEC filings use bold+caps for section headings with the same font
       size as body text, so pymupdf4llm cannot promote them to # headings.

    3. Split-bold headings: ``**1** **Introduction**``, ``**3.2** **Attention**``.
       pymupdf4llm emits these when the section number and title text are
       separate spans in the underlying PDF (common in academic papers).

    Args:
        md_path: Path to the .md file.

    Returns:
        List of dicts with keys: title (str), line (int, 1-based).
        When the outline is truncated at MAX_OUTLINE_ENTRIES, a sentinel entry
        ``{"truncated": True}`` is appended as the last element so callers can
        render a "showing first N headings" hint without re-scanning the file.
        The sentinel is only added when at least one further heading exists
        beyond the cap; a document with exactly MAX_OUTLINE_ENTRIES headings
        is returned in full without it.
        Returns an empty list if the file cannot be read or has no headings.
    """
    outline: list[dict] = []
    try:
        with md_path.open(encoding="utf-8") as f:
            for lineno, line in enumerate(f, 1):
                stripped = line.strip()
                if not stripped:
                    continue

                if stripped.startswith("#"):
                    # Style 1: standard Markdown heading
                    title = _clean_bold_title(stripped.lstrip("#").strip())
                elif m := _BOLD_HEADING_RE.match(stripped):
                    # Style 2: single bold block with SEC structural keyword
                    title = m.group(1).strip()
                elif _SPLIT_BOLD_HEADING_RE.match(stripped):
                    # Style 3: split-bold heading — **<num>** **<title>**
                    # Regex already enforces max 4 blocks and non-numeric second block.
                    title = " ".join(re.findall(r"\*\*([^*]+)\*\*", stripped))
                else:
                    continue

                if not title:
                    continue

                # The cap is already full and here is one more heading, so the
                # outline really is being cut short: flag it and stop reading.
                if len(outline) >= MAX_OUTLINE_ENTRIES:
                    outline.append({"truncated": True})
                    break

                outline.append({"title": title, "line": lineno})
    except Exception:
        # Best effort: an unreadable file yields an empty outline.
        return []

    return outline
|
||||
|
||||
|
||||
def _get_pdf_converter() -> str:
|
||||
"""Read pdf_converter setting from app config, defaulting to 'auto'.
|
||||
|
||||
Normalizes the value to lowercase and validates it against the allowed set
|
||||
so that values like 'AUTO' or 'MarkItDown' from config.yaml don't silently
|
||||
fall through to unexpected behaviour.
|
||||
"""
|
||||
try:
|
||||
from deerflow.config.app_config import get_app_config
|
||||
|
||||
cfg = get_app_config()
|
||||
uploads_cfg = getattr(cfg, "uploads", None)
|
||||
if uploads_cfg is not None:
|
||||
raw = str(getattr(uploads_cfg, "pdf_converter", "auto")).strip().lower()
|
||||
if raw not in _ALLOWED_PDF_CONVERTERS:
|
||||
logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", raw)
|
||||
return "auto"
|
||||
return raw
|
||||
except Exception:
|
||||
pass
|
||||
return "auto"
|
||||
|
||||
@@ -9,16 +9,17 @@ dependencies = [
|
||||
"dotenv>=0.9.9",
|
||||
"httpx>=0.28.0",
|
||||
"kubernetes>=30.0.0",
|
||||
"langchain>=1.2.3",
|
||||
"langchain>=1.2.3,<1.2.10",
|
||||
"langchain-anthropic>=1.3.4",
|
||||
"langchain-deepseek>=1.0.1",
|
||||
"langchain-mcp-adapters>=0.1.0",
|
||||
"langchain-openai>=1.1.7",
|
||||
"langfuse>=3.4.1",
|
||||
"langgraph>=1.0.6,<1.0.10",
|
||||
"langgraph-prebuilt>=1.0.6,<1.0.9",
|
||||
"langgraph-api>=0.7.0,<0.8.0",
|
||||
"langgraph-cli>=0.4.14",
|
||||
"langgraph-runtime-inmem>=0.22.1",
|
||||
"langgraph-runtime-inmem>=0.22.1,<0.27.0",
|
||||
"markdownify>=1.2.2",
|
||||
"markitdown[all,xlsx]>=0.0.1a2",
|
||||
"pydantic>=2.12.5",
|
||||
@@ -34,6 +35,9 @@ dependencies = [
|
||||
"langgraph-sdk>=0.1.51",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
pymupdf = ["pymupdf4llm>=0.0.17"]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
Reference in New Issue
Block a user