mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-24 08:55:59 +00:00
Merge branch 'main' into release/2.0-rc
This commit is contained in:
@@ -2,8 +2,14 @@ from .checkpointer import get_checkpointer, make_checkpointer, reset_checkpointe
|
||||
from .factory import create_deerflow_agent
|
||||
from .features import Next, Prev, RuntimeFeatures
|
||||
from .lead_agent import make_lead_agent
|
||||
from .lead_agent.prompt import prime_enabled_skills_cache
|
||||
from .thread_state import SandboxState, ThreadState
|
||||
|
||||
# LangGraph imports deerflow.agents when registering the graph. Prime the
|
||||
# enabled-skills cache here so the request path can usually read a warm cache
|
||||
# without forcing synchronous filesystem work during prompt module import.
|
||||
prime_enabled_skills_cache()
|
||||
|
||||
__all__ = [
|
||||
"create_deerflow_agent",
|
||||
"RuntimeFeatures",
|
||||
|
||||
@@ -17,6 +17,7 @@ For sync usage see :mod:`deerflow.agents.checkpointer.provider`.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import logging
|
||||
from collections.abc import AsyncIterator
|
||||
@@ -54,7 +55,7 @@ async def _async_checkpointer(config) -> AsyncIterator[Checkpointer]:
|
||||
raise ImportError(SQLITE_INSTALL) from exc
|
||||
|
||||
conn_str = resolve_sqlite_conn_str(config.connection_string or "store.db")
|
||||
ensure_sqlite_parent_dir(conn_str)
|
||||
await asyncio.to_thread(ensure_sqlite_parent_dir, conn_str)
|
||||
async with AsyncSqliteSaver.from_conn_string(conn_str) as saver:
|
||||
await saver.setup()
|
||||
yield saver
|
||||
|
||||
@@ -289,14 +289,14 @@ def make_lead_agent(config: RunnableConfig):
|
||||
agent_name = cfg.get("agent_name")
|
||||
|
||||
agent_config = load_agent_config(agent_name) if not is_bootstrap else None
|
||||
# Custom agent model or fallback to global/default model resolution
|
||||
agent_model_name = agent_config.model if agent_config and agent_config.model else _resolve_model_name()
|
||||
# Custom agent model from agent config (if any), or None to let _resolve_model_name pick the default
|
||||
agent_model_name = agent_config.model if agent_config and agent_config.model else None
|
||||
|
||||
# Final model name resolution with request override, then agent config, then global default
|
||||
model_name = requested_model_name or agent_model_name
|
||||
# Final model name resolution: request → agent config → global default, with fallback for unknown names
|
||||
model_name = _resolve_model_name(requested_model_name or agent_model_name)
|
||||
|
||||
app_config = get_app_config()
|
||||
model_config = app_config.get_model_config(model_name) if model_name else None
|
||||
model_config = app_config.get_model_config(model_name)
|
||||
|
||||
if model_config is None:
|
||||
raise ValueError("No chat model could be resolved. Please configure at least one model in config.yaml or provide a valid 'model_name'/'model' in the request.")
|
||||
|
||||
@@ -1,20 +1,167 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from functools import lru_cache
|
||||
|
||||
from deerflow.config.agents_config import load_agent_soul
|
||||
from deerflow.skills import load_skills
|
||||
from deerflow.skills.types import Skill
|
||||
from deerflow.subagents import get_available_subagent_names
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_ENABLED_SKILLS_REFRESH_WAIT_TIMEOUT_SECONDS = 5.0
|
||||
_enabled_skills_lock = threading.Lock()
|
||||
_enabled_skills_cache: list[Skill] | None = None
|
||||
_enabled_skills_refresh_active = False
|
||||
_enabled_skills_refresh_version = 0
|
||||
_enabled_skills_refresh_event = threading.Event()
|
||||
|
||||
|
||||
def _load_enabled_skills_sync() -> list[Skill]:
|
||||
return list(load_skills(enabled_only=True))
|
||||
|
||||
|
||||
def _start_enabled_skills_refresh_thread() -> None:
|
||||
threading.Thread(
|
||||
target=_refresh_enabled_skills_cache_worker,
|
||||
name="deerflow-enabled-skills-loader",
|
||||
daemon=True,
|
||||
).start()
|
||||
|
||||
|
||||
def _refresh_enabled_skills_cache_worker() -> None:
|
||||
global _enabled_skills_cache, _enabled_skills_refresh_active
|
||||
|
||||
while True:
|
||||
with _enabled_skills_lock:
|
||||
target_version = _enabled_skills_refresh_version
|
||||
|
||||
try:
|
||||
skills = _load_enabled_skills_sync()
|
||||
except Exception:
|
||||
logger.exception("Failed to load enabled skills for prompt injection")
|
||||
skills = []
|
||||
|
||||
with _enabled_skills_lock:
|
||||
if _enabled_skills_refresh_version == target_version:
|
||||
_enabled_skills_cache = skills
|
||||
_enabled_skills_refresh_active = False
|
||||
_enabled_skills_refresh_event.set()
|
||||
return
|
||||
|
||||
# A newer invalidation happened while loading. Keep the worker alive
|
||||
# and loop again so the cache always converges on the latest version.
|
||||
_enabled_skills_cache = None
|
||||
|
||||
|
||||
def _ensure_enabled_skills_cache() -> threading.Event:
|
||||
global _enabled_skills_refresh_active
|
||||
|
||||
with _enabled_skills_lock:
|
||||
if _enabled_skills_cache is not None:
|
||||
_enabled_skills_refresh_event.set()
|
||||
return _enabled_skills_refresh_event
|
||||
if _enabled_skills_refresh_active:
|
||||
return _enabled_skills_refresh_event
|
||||
_enabled_skills_refresh_active = True
|
||||
_enabled_skills_refresh_event.clear()
|
||||
|
||||
_start_enabled_skills_refresh_thread()
|
||||
return _enabled_skills_refresh_event
|
||||
|
||||
|
||||
def _invalidate_enabled_skills_cache() -> threading.Event:
|
||||
global _enabled_skills_cache, _enabled_skills_refresh_active, _enabled_skills_refresh_version
|
||||
|
||||
_get_cached_skills_prompt_section.cache_clear()
|
||||
with _enabled_skills_lock:
|
||||
_enabled_skills_cache = None
|
||||
_enabled_skills_refresh_version += 1
|
||||
_enabled_skills_refresh_event.clear()
|
||||
if _enabled_skills_refresh_active:
|
||||
return _enabled_skills_refresh_event
|
||||
_enabled_skills_refresh_active = True
|
||||
|
||||
_start_enabled_skills_refresh_thread()
|
||||
return _enabled_skills_refresh_event
|
||||
|
||||
|
||||
def prime_enabled_skills_cache() -> None:
|
||||
_ensure_enabled_skills_cache()
|
||||
|
||||
|
||||
def warm_enabled_skills_cache(timeout_seconds: float = _ENABLED_SKILLS_REFRESH_WAIT_TIMEOUT_SECONDS) -> bool:
|
||||
if _ensure_enabled_skills_cache().wait(timeout=timeout_seconds):
|
||||
return True
|
||||
|
||||
logger.warning("Timed out waiting %.1fs for enabled skills cache warm-up", timeout_seconds)
|
||||
return False
|
||||
|
||||
|
||||
def _get_enabled_skills():
|
||||
with _enabled_skills_lock:
|
||||
cached = _enabled_skills_cache
|
||||
|
||||
if cached is not None:
|
||||
return list(cached)
|
||||
|
||||
_ensure_enabled_skills_cache()
|
||||
return []
|
||||
|
||||
|
||||
def _skill_mutability_label(category: str) -> str:
|
||||
return "[custom, editable]" if category == "custom" else "[built-in]"
|
||||
|
||||
|
||||
def clear_skills_system_prompt_cache() -> None:
|
||||
_invalidate_enabled_skills_cache()
|
||||
|
||||
|
||||
async def refresh_skills_system_prompt_cache_async() -> None:
|
||||
await asyncio.to_thread(_invalidate_enabled_skills_cache().wait)
|
||||
|
||||
|
||||
def _reset_skills_system_prompt_cache_state() -> None:
|
||||
global _enabled_skills_cache, _enabled_skills_refresh_active, _enabled_skills_refresh_version
|
||||
|
||||
_get_cached_skills_prompt_section.cache_clear()
|
||||
with _enabled_skills_lock:
|
||||
_enabled_skills_cache = None
|
||||
_enabled_skills_refresh_active = False
|
||||
_enabled_skills_refresh_version = 0
|
||||
_enabled_skills_refresh_event.clear()
|
||||
|
||||
|
||||
def _refresh_enabled_skills_cache() -> None:
|
||||
"""Backward-compatible test helper for direct synchronous reload."""
|
||||
try:
|
||||
return list(load_skills(enabled_only=True))
|
||||
skills = _load_enabled_skills_sync()
|
||||
except Exception:
|
||||
logger.exception("Failed to load enabled skills for prompt injection")
|
||||
return []
|
||||
skills = []
|
||||
|
||||
with _enabled_skills_lock:
|
||||
_enabled_skills_cache = skills
|
||||
_enabled_skills_refresh_active = False
|
||||
_enabled_skills_refresh_event.set()
|
||||
|
||||
|
||||
def _build_skill_evolution_section(skill_evolution_enabled: bool) -> str:
|
||||
if not skill_evolution_enabled:
|
||||
return ""
|
||||
return """
|
||||
## Skill Self-Evolution
|
||||
After completing a task, consider creating or updating a skill when:
|
||||
- The task required 5+ tool calls to resolve
|
||||
- You overcame non-obvious errors or pitfalls
|
||||
- The user corrected your approach and the corrected version worked
|
||||
- You discovered a non-trivial, recurring workflow
|
||||
If you used a skill and encountered issues not covered by it, patch it immediately.
|
||||
Prefer patch over edit. Before creating a new skill, confirm with the user first.
|
||||
Skip simple one-off tasks.
|
||||
"""
|
||||
|
||||
|
||||
def _skill_mutability_label(category: str) -> str:
|
||||
@@ -294,6 +441,9 @@ You: "Deploying to staging..." [proceed]
|
||||
- Use `read_file` tool to read uploaded files using their paths from the list
|
||||
- For PDF, PPT, Excel, and Word files, converted Markdown versions (*.md) are available alongside originals
|
||||
- All temporary work happens in `/mnt/user-data/workspace`
|
||||
- Treat `/mnt/user-data/workspace` as your default current working directory for coding and file-editing tasks
|
||||
- When writing scripts or commands that create/read files from the workspace, prefer relative paths such as `hello.txt`, `../uploads/data.csv`, and `../outputs/report.md`
|
||||
- Avoid hardcoding `/mnt/user-data/...` inside generated scripts when a relative path from the workspace is enough
|
||||
- Final deliverables must be copied to `/mnt/user-data/outputs` and presented using `present_file` tool
|
||||
{acp_section}
|
||||
</working_directory>
|
||||
|
||||
@@ -4,7 +4,7 @@ import logging
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
from deerflow.config.memory_config import get_memory_config
|
||||
@@ -18,7 +18,7 @@ class ConversationContext:
|
||||
|
||||
thread_id: str
|
||||
messages: list[Any]
|
||||
timestamp: datetime = field(default_factory=datetime.utcnow)
|
||||
timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
|
||||
agent_name: str | None = None
|
||||
correction_detected: bool = False
|
||||
reinforcement_detected: bool = False
|
||||
|
||||
@@ -4,7 +4,7 @@ import abc
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
@@ -15,11 +15,16 @@ from deerflow.config.paths import get_paths
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def utc_now_iso_z() -> str:
|
||||
"""Current UTC time as ISO-8601 with ``Z`` suffix (matches prior naive-UTC output)."""
|
||||
return datetime.now(UTC).isoformat().removesuffix("+00:00") + "Z"
|
||||
|
||||
|
||||
def create_empty_memory() -> dict[str, Any]:
|
||||
"""Create an empty memory structure."""
|
||||
return {
|
||||
"version": "1.0",
|
||||
"lastUpdated": datetime.utcnow().isoformat() + "Z",
|
||||
"lastUpdated": utc_now_iso_z(),
|
||||
"user": {
|
||||
"workContext": {"summary": "", "updatedAt": ""},
|
||||
"personalContext": {"summary": "", "updatedAt": ""},
|
||||
@@ -137,7 +142,7 @@ class FileMemoryStorage(MemoryStorage):
|
||||
|
||||
try:
|
||||
file_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
memory_data["lastUpdated"] = datetime.utcnow().isoformat() + "Z"
|
||||
memory_data["lastUpdated"] = utc_now_iso_z()
|
||||
|
||||
temp_path = file_path.with_suffix(".tmp")
|
||||
with open(temp_path, "w", encoding="utf-8") as f:
|
||||
|
||||
@@ -5,14 +5,17 @@ import logging
|
||||
import math
|
||||
import re
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from deerflow.agents.memory.prompt import (
|
||||
MEMORY_UPDATE_PROMPT,
|
||||
format_conversation_for_update,
|
||||
)
|
||||
from deerflow.agents.memory.storage import create_empty_memory, get_memory_storage
|
||||
from deerflow.agents.memory.storage import (
|
||||
create_empty_memory,
|
||||
get_memory_storage,
|
||||
utc_now_iso_z,
|
||||
)
|
||||
from deerflow.config.memory_config import get_memory_config
|
||||
from deerflow.models import create_chat_model
|
||||
|
||||
@@ -86,7 +89,7 @@ def create_memory_fact(
|
||||
|
||||
normalized_category = category.strip() or "context"
|
||||
validated_confidence = _validate_confidence(confidence)
|
||||
now = datetime.utcnow().isoformat() + "Z"
|
||||
now = utc_now_iso_z()
|
||||
memory_data = get_memory_data(agent_name)
|
||||
updated_memory = dict(memory_data)
|
||||
facts = list(memory_data.get("facts", []))
|
||||
@@ -376,7 +379,7 @@ class MemoryUpdater:
|
||||
Updated memory data.
|
||||
"""
|
||||
config = get_memory_config()
|
||||
now = datetime.utcnow().isoformat() + "Z"
|
||||
now = utc_now_iso_z()
|
||||
|
||||
# Update user sections
|
||||
user_updates = update_data.get("user", {})
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""Middleware for intercepting clarification requests and presenting them to the user."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from typing import override
|
||||
@@ -60,6 +61,20 @@ class ClarificationMiddleware(AgentMiddleware[ClarificationMiddlewareState]):
|
||||
context = args.get("context")
|
||||
options = args.get("options", [])
|
||||
|
||||
# Some models (e.g. Qwen3-Max) serialize array parameters as JSON strings
|
||||
# instead of native arrays. Deserialize and normalize so `options`
|
||||
# is always a list for the rendering logic below.
|
||||
if isinstance(options, str):
|
||||
try:
|
||||
options = json.loads(options)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
options = [options]
|
||||
|
||||
if options is None:
|
||||
options = []
|
||||
elif not isinstance(options, list):
|
||||
options = [options]
|
||||
|
||||
# Type-specific icons
|
||||
type_icons = {
|
||||
"missing_info": "❓",
|
||||
|
||||
@@ -33,30 +33,92 @@ _DEFAULT_WINDOW_SIZE = 20 # track last N tool calls
|
||||
_DEFAULT_MAX_TRACKED_THREADS = 100 # LRU eviction limit
|
||||
|
||||
|
||||
def _normalize_tool_call_args(raw_args: object) -> tuple[dict, str | None]:
|
||||
"""Normalize tool call args to a dict plus an optional fallback key.
|
||||
|
||||
Some providers serialize ``args`` as a JSON string instead of a dict.
|
||||
We defensively parse those cases so loop detection does not crash while
|
||||
still preserving a stable fallback key for non-dict payloads.
|
||||
"""
|
||||
if isinstance(raw_args, dict):
|
||||
return raw_args, None
|
||||
|
||||
if isinstance(raw_args, str):
|
||||
try:
|
||||
parsed = json.loads(raw_args)
|
||||
except (TypeError, ValueError, json.JSONDecodeError):
|
||||
return {}, raw_args
|
||||
|
||||
if isinstance(parsed, dict):
|
||||
return parsed, None
|
||||
return {}, json.dumps(parsed, sort_keys=True, default=str)
|
||||
|
||||
if raw_args is None:
|
||||
return {}, None
|
||||
|
||||
return {}, json.dumps(raw_args, sort_keys=True, default=str)
|
||||
|
||||
|
||||
def _stable_tool_key(name: str, args: dict, fallback_key: str | None) -> str:
|
||||
"""Derive a stable key from salient args without overfitting to noise."""
|
||||
if name == "read_file" and fallback_key is None:
|
||||
path = args.get("path") or ""
|
||||
start_line = args.get("start_line")
|
||||
end_line = args.get("end_line")
|
||||
|
||||
bucket_size = 200
|
||||
try:
|
||||
start_line = int(start_line) if start_line is not None else 1
|
||||
except (TypeError, ValueError):
|
||||
start_line = 1
|
||||
try:
|
||||
end_line = int(end_line) if end_line is not None else start_line
|
||||
except (TypeError, ValueError):
|
||||
end_line = start_line
|
||||
|
||||
start_line, end_line = sorted((start_line, end_line))
|
||||
bucket_start = max(start_line, 1)
|
||||
bucket_end = max(end_line, 1)
|
||||
bucket_start = (bucket_start - 1) // bucket_size
|
||||
bucket_end = (bucket_end - 1) // bucket_size
|
||||
return f"{path}:{bucket_start}-{bucket_end}"
|
||||
|
||||
# write_file / str_replace are content-sensitive: same path may be updated
|
||||
# with different payloads during iteration. Using only salient fields (path)
|
||||
# can collapse distinct calls, so we hash full args to reduce false positives.
|
||||
if name in {"write_file", "str_replace"}:
|
||||
if fallback_key is not None:
|
||||
return fallback_key
|
||||
return json.dumps(args, sort_keys=True, default=str)
|
||||
|
||||
salient_fields = ("path", "url", "query", "command", "pattern", "glob", "cmd")
|
||||
stable_args = {field: args[field] for field in salient_fields if args.get(field) is not None}
|
||||
if stable_args:
|
||||
return json.dumps(stable_args, sort_keys=True, default=str)
|
||||
|
||||
if fallback_key is not None:
|
||||
return fallback_key
|
||||
|
||||
return json.dumps(args, sort_keys=True, default=str)
|
||||
|
||||
|
||||
def _hash_tool_calls(tool_calls: list[dict]) -> str:
|
||||
"""Deterministic hash of a set of tool calls (name + args).
|
||||
"""Deterministic hash of a set of tool calls (name + stable key).
|
||||
|
||||
This is intended to be order-independent: the same multiset of tool calls
|
||||
should always produce the same hash, regardless of their input order.
|
||||
"""
|
||||
# First normalize each tool call to a minimal (name, args) structure.
|
||||
normalized: list[dict] = []
|
||||
# Normalize each tool call to a stable (name, key) structure.
|
||||
normalized: list[str] = []
|
||||
for tc in tool_calls:
|
||||
normalized.append(
|
||||
{
|
||||
"name": tc.get("name", ""),
|
||||
"args": tc.get("args", {}),
|
||||
}
|
||||
)
|
||||
name = tc.get("name", "")
|
||||
args, fallback_key = _normalize_tool_call_args(tc.get("args", {}))
|
||||
key = _stable_tool_key(name, args, fallback_key)
|
||||
|
||||
# Sort by both name and a deterministic serialization of args so that
|
||||
# permutations of the same multiset of calls yield the same ordering.
|
||||
normalized.sort(
|
||||
key=lambda tc: (
|
||||
tc["name"],
|
||||
json.dumps(tc["args"], sort_keys=True, default=str),
|
||||
)
|
||||
)
|
||||
normalized.append(f"{name}:{key}")
|
||||
|
||||
# Sort so permutations of the same multiset of calls yield the same ordering.
|
||||
normalized.sort()
|
||||
blob = json.dumps(normalized, sort_keys=True, default=str)
|
||||
return hashlib.md5(blob.encode()).hexdigest()[:12]
|
||||
|
||||
|
||||
@@ -23,25 +23,119 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# Each pattern is compiled once at import time.
|
||||
_HIGH_RISK_PATTERNS: list[re.Pattern[str]] = [
|
||||
re.compile(r"rm\s+-[^\s]*r[^\s]*\s+(/\*?|~/?\*?|/home\b|/root\b)\s*$"), # rm -rf / /* ~ /home /root
|
||||
re.compile(r"(curl|wget).+\|\s*(ba)?sh"), # curl|sh, wget|sh
|
||||
# --- original rules (retained) ---
|
||||
re.compile(r"rm\s+-[^\s]*r[^\s]*\s+(/\*?|~/?\*?|/home\b|/root\b)\s*$"),
|
||||
re.compile(r"dd\s+if="),
|
||||
re.compile(r"mkfs"),
|
||||
re.compile(r"cat\s+/etc/shadow"),
|
||||
re.compile(r">\s*/etc/"), # overwrite /etc/ files
|
||||
re.compile(r">+\s*/etc/"),
|
||||
# --- pipe to sh/bash (generalised, replaces old curl|sh rule) ---
|
||||
re.compile(r"\|\s*(ba)?sh\b"),
|
||||
# --- command substitution (targeted – only dangerous executables) ---
|
||||
re.compile(r"[`$]\(?\s*(curl|wget|bash|sh|python|ruby|perl|base64)"),
|
||||
# --- base64 decode piped to execution ---
|
||||
re.compile(r"base64\s+.*-d.*\|"),
|
||||
# --- overwrite system binaries ---
|
||||
re.compile(r">+\s*(/usr/bin/|/bin/|/sbin/)"),
|
||||
# --- overwrite shell startup files ---
|
||||
re.compile(r">+\s*~/?\.(bashrc|profile|zshrc|bash_profile)"),
|
||||
# --- process environment leakage ---
|
||||
re.compile(r"/proc/[^/]+/environ"),
|
||||
# --- dynamic linker hijack (one-step escalation) ---
|
||||
re.compile(r"\b(LD_PRELOAD|LD_LIBRARY_PATH)\s*="),
|
||||
# --- bash built-in networking (bypasses tool allowlists) ---
|
||||
re.compile(r"/dev/tcp/"),
|
||||
# --- fork bomb ---
|
||||
re.compile(r"\S+\(\)\s*\{[^}]*\|\s*\S+\s*&"), # :(){ :|:& };:
|
||||
re.compile(r"while\s+true.*&\s*done"), # while true; do bash & done
|
||||
]
|
||||
|
||||
_MEDIUM_RISK_PATTERNS: list[re.Pattern[str]] = [
|
||||
re.compile(r"chmod\s+777"), # overly permissive, but reversible
|
||||
re.compile(r"pip\s+install"),
|
||||
re.compile(r"pip3\s+install"),
|
||||
re.compile(r"chmod\s+777"),
|
||||
re.compile(r"pip3?\s+install"),
|
||||
re.compile(r"apt(-get)?\s+install"),
|
||||
# sudo/su: no-op under Docker root; warn so LLM is aware
|
||||
re.compile(r"\b(sudo|su)\b"),
|
||||
# PATH modification: long attack chain, warn rather than block
|
||||
re.compile(r"\bPATH\s*="),
|
||||
]
|
||||
|
||||
|
||||
def _classify_command(command: str) -> str:
|
||||
"""Return 'block', 'warn', or 'pass'."""
|
||||
# Normalize for matching (collapse whitespace)
|
||||
def _split_compound_command(command: str) -> list[str]:
|
||||
"""Split a compound command into sub-commands (quote-aware).
|
||||
|
||||
Scans the raw command string so unquoted shell control operators are
|
||||
recognised even when they are not surrounded by whitespace
|
||||
(e.g. ``safe;rm -rf /`` or ``rm -rf /&&echo ok``). Operators inside
|
||||
quotes are ignored. If the command ends with an unclosed quote or a
|
||||
dangling escape, return the whole command unchanged (fail-closed —
|
||||
safer to classify the unsplit string than silently drop parts).
|
||||
"""
|
||||
parts: list[str] = []
|
||||
current: list[str] = []
|
||||
in_single_quote = False
|
||||
in_double_quote = False
|
||||
escaping = False
|
||||
index = 0
|
||||
|
||||
while index < len(command):
|
||||
char = command[index]
|
||||
|
||||
if escaping:
|
||||
current.append(char)
|
||||
escaping = False
|
||||
index += 1
|
||||
continue
|
||||
|
||||
if char == "\\" and not in_single_quote:
|
||||
current.append(char)
|
||||
escaping = True
|
||||
index += 1
|
||||
continue
|
||||
|
||||
if char == "'" and not in_double_quote:
|
||||
in_single_quote = not in_single_quote
|
||||
current.append(char)
|
||||
index += 1
|
||||
continue
|
||||
|
||||
if char == '"' and not in_single_quote:
|
||||
in_double_quote = not in_double_quote
|
||||
current.append(char)
|
||||
index += 1
|
||||
continue
|
||||
|
||||
if not in_single_quote and not in_double_quote:
|
||||
if command.startswith("&&", index) or command.startswith("||", index):
|
||||
part = "".join(current).strip()
|
||||
if part:
|
||||
parts.append(part)
|
||||
current = []
|
||||
index += 2
|
||||
continue
|
||||
if char == ";":
|
||||
part = "".join(current).strip()
|
||||
if part:
|
||||
parts.append(part)
|
||||
current = []
|
||||
index += 1
|
||||
continue
|
||||
|
||||
current.append(char)
|
||||
index += 1
|
||||
|
||||
# Unclosed quote or dangling escape → fail-closed, return whole command
|
||||
if in_single_quote or in_double_quote or escaping:
|
||||
return [command]
|
||||
|
||||
part = "".join(current).strip()
|
||||
if part:
|
||||
parts.append(part)
|
||||
return parts if parts else [command]
|
||||
|
||||
|
||||
def _classify_single_command(command: str) -> str:
|
||||
"""Classify a single (non-compound) command. Return 'block', 'warn', or 'pass'."""
|
||||
normalized = " ".join(command.split())
|
||||
|
||||
for pattern in _HIGH_RISK_PATTERNS:
|
||||
@@ -66,6 +160,35 @@ def _classify_command(command: str) -> str:
|
||||
return "pass"
|
||||
|
||||
|
||||
def _classify_command(command: str) -> str:
|
||||
"""Return 'block', 'warn', or 'pass'.
|
||||
|
||||
Strategy:
|
||||
1. First scan the *whole* raw command against high-risk patterns. This
|
||||
catches structural attacks like ``while true; do bash & done`` or
|
||||
``:(){ :|:& };:`` that span multiple shell statements — splitting them
|
||||
on ``;`` would destroy the pattern context.
|
||||
2. Then split compound commands (e.g. ``cmd1 && cmd2 ; cmd3``) and
|
||||
classify each sub-command independently. The most severe verdict wins.
|
||||
"""
|
||||
# Pass 1: whole-command high-risk scan (catches multi-statement patterns)
|
||||
normalized = " ".join(command.split())
|
||||
for pattern in _HIGH_RISK_PATTERNS:
|
||||
if pattern.search(normalized):
|
||||
return "block"
|
||||
|
||||
# Pass 2: per-sub-command classification
|
||||
sub_commands = _split_compound_command(command)
|
||||
worst = "pass"
|
||||
for sub in sub_commands:
|
||||
verdict = _classify_single_command(sub)
|
||||
if verdict == "block":
|
||||
return "block" # short-circuit: can't get worse
|
||||
if verdict == "warn":
|
||||
worst = "warn"
|
||||
return worst
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Middleware
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user