mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-23 00:16:48 +00:00
chore(dev): add async/thread boundary detector (#2936)
* chore(dev): add thread boundary detector * chore(dev): reduce thread boundary detector false positives
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
# DeerFlow - Unified Development Environment
|
# DeerFlow - Unified Development Environment
|
||||||
|
|
||||||
.PHONY: help config config-upgrade check install setup doctor dev dev-daemon start start-daemon stop up down clean docker-init docker-start docker-stop docker-logs docker-logs-frontend docker-logs-gateway
|
.PHONY: help config config-upgrade check install setup doctor detect-thread-boundaries dev dev-daemon start start-daemon stop up down clean docker-init docker-start docker-stop docker-logs docker-logs-frontend docker-logs-gateway
|
||||||
|
|
||||||
BASH ?= bash
|
BASH ?= bash
|
||||||
BACKEND_UV_RUN = cd backend && uv run
|
BACKEND_UV_RUN = cd backend && uv run
|
||||||
@@ -23,6 +23,7 @@ help:
|
|||||||
@echo " make config - Generate local config files (aborts if config already exists)"
|
@echo " make config - Generate local config files (aborts if config already exists)"
|
||||||
@echo " make config-upgrade - Merge new fields from config.example.yaml into config.yaml"
|
@echo " make config-upgrade - Merge new fields from config.example.yaml into config.yaml"
|
||||||
@echo " make check - Check if all required tools are installed"
|
@echo " make check - Check if all required tools are installed"
|
||||||
|
@echo " make detect-thread-boundaries - Inventory async/thread boundary points"
|
||||||
@echo " make install - Install all dependencies (frontend + backend + pre-commit hooks)"
|
@echo " make install - Install all dependencies (frontend + backend + pre-commit hooks)"
|
||||||
@echo " make setup-sandbox - Pre-pull sandbox container image (recommended)"
|
@echo " make setup-sandbox - Pre-pull sandbox container image (recommended)"
|
||||||
@echo " make dev - Start all services in development mode (with hot-reloading)"
|
@echo " make dev - Start all services in development mode (with hot-reloading)"
|
||||||
@@ -51,6 +52,9 @@ setup:
|
|||||||
doctor:
|
doctor:
|
||||||
@$(BACKEND_UV_RUN) python ../scripts/doctor.py
|
@$(BACKEND_UV_RUN) python ../scripts/doctor.py
|
||||||
|
|
||||||
|
detect-thread-boundaries:
|
||||||
|
@$(PYTHON) ./scripts/detect_thread_boundaries.py
|
||||||
|
|
||||||
config:
|
config:
|
||||||
@$(PYTHON) ./scripts/configure.py
|
@$(PYTHON) ./scripts/configure.py
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,507 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Inventory async/thread boundary points for developer review.
|
||||||
|
|
||||||
|
This detector is intentionally non-invasive: it parses Python source with AST
|
||||||
|
and reports places where code crosses sync/async/thread boundaries. Findings
|
||||||
|
are review evidence, not automatic bug decisions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import ast
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from collections.abc import Iterable, Sequence
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parents[4]
|
||||||
|
DEFAULT_SCAN_PATHS = (
|
||||||
|
REPO_ROOT / "backend" / "app",
|
||||||
|
REPO_ROOT / "backend" / "packages" / "harness" / "deerflow",
|
||||||
|
)
|
||||||
|
IGNORED_DIR_NAMES = {
|
||||||
|
".git",
|
||||||
|
".mypy_cache",
|
||||||
|
".pytest_cache",
|
||||||
|
".ruff_cache",
|
||||||
|
".venv",
|
||||||
|
"__pycache__",
|
||||||
|
"node_modules",
|
||||||
|
}
|
||||||
|
SEVERITY_ORDER = {"INFO": 0, "WARN": 1, "FAIL": 2}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class BoundaryFinding:
|
||||||
|
severity: str
|
||||||
|
category: str
|
||||||
|
path: str
|
||||||
|
line: int
|
||||||
|
column: int
|
||||||
|
function: str
|
||||||
|
async_context: bool
|
||||||
|
symbol: str
|
||||||
|
message: str
|
||||||
|
code: str
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, object]:
|
||||||
|
return asdict(self)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class _FunctionContext:
|
||||||
|
name: str
|
||||||
|
is_async: bool
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class _CallRule:
|
||||||
|
severity: str
|
||||||
|
category: str
|
||||||
|
message: str
|
||||||
|
|
||||||
|
|
||||||
|
EXACT_CALL_RULES: dict[str, _CallRule] = {
|
||||||
|
"asyncio.run": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"SYNC_ASYNC_BRIDGE",
|
||||||
|
"Runs a coroutine from synchronous code by creating an event loop boundary.",
|
||||||
|
),
|
||||||
|
"asyncio.to_thread": _CallRule(
|
||||||
|
"INFO",
|
||||||
|
"ASYNC_THREAD_OFFLOAD",
|
||||||
|
"Offloads synchronous work from an async context into a worker thread.",
|
||||||
|
),
|
||||||
|
"asyncio.new_event_loop": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"NEW_EVENT_LOOP",
|
||||||
|
"Creates a separate event loop; review resource ownership across loops.",
|
||||||
|
),
|
||||||
|
"asyncio.run_coroutine_threadsafe": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"CROSS_THREAD_COROUTINE",
|
||||||
|
"Submits a coroutine to an event loop from another thread.",
|
||||||
|
),
|
||||||
|
"concurrent.futures.ThreadPoolExecutor": _CallRule(
|
||||||
|
"INFO",
|
||||||
|
"THREAD_POOL",
|
||||||
|
"Creates a thread pool boundary.",
|
||||||
|
),
|
||||||
|
"threading.Thread": _CallRule(
|
||||||
|
"INFO",
|
||||||
|
"RAW_THREAD",
|
||||||
|
"Creates a raw thread; ContextVar values do not propagate automatically.",
|
||||||
|
),
|
||||||
|
"threading.Timer": _CallRule(
|
||||||
|
"INFO",
|
||||||
|
"RAW_TIMER_THREAD",
|
||||||
|
"Creates a timer-backed raw thread; ContextVar values do not propagate automatically.",
|
||||||
|
),
|
||||||
|
"make_sync_tool_wrapper": _CallRule(
|
||||||
|
"INFO",
|
||||||
|
"SYNC_TOOL_WRAPPER",
|
||||||
|
"Adapts an async tool coroutine for synchronous tool invocation.",
|
||||||
|
),
|
||||||
|
}
|
||||||
|
THREAD_POOL_CONSTRUCTORS = {"concurrent.futures.ThreadPoolExecutor"}
|
||||||
|
ASYNC_TOOL_FACTORY_CALLS = {
|
||||||
|
"StructuredTool.from_function",
|
||||||
|
"langchain.tools.StructuredTool.from_function",
|
||||||
|
"langchain_core.tools.StructuredTool.from_function",
|
||||||
|
}
|
||||||
|
LANGCHAIN_INVOKE_RECEIVER_NAMES = {
|
||||||
|
"agent",
|
||||||
|
"chain",
|
||||||
|
"chat_model",
|
||||||
|
"graph",
|
||||||
|
"llm",
|
||||||
|
"model",
|
||||||
|
"runnable",
|
||||||
|
}
|
||||||
|
LANGCHAIN_INVOKE_RECEIVER_SUFFIXES = (
|
||||||
|
"_agent",
|
||||||
|
"_chain",
|
||||||
|
"_graph",
|
||||||
|
"_llm",
|
||||||
|
"_model",
|
||||||
|
"_runnable",
|
||||||
|
)
|
||||||
|
|
||||||
|
ASYNC_BLOCKING_CALL_RULES: dict[str, _CallRule] = {
|
||||||
|
"time.sleep": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"BLOCKING_CALL_IN_ASYNC",
|
||||||
|
"Blocks the event loop when called directly inside async code.",
|
||||||
|
),
|
||||||
|
"subprocess.run": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"BLOCKING_SUBPROCESS_IN_ASYNC",
|
||||||
|
"Runs a blocking subprocess from async code.",
|
||||||
|
),
|
||||||
|
"subprocess.check_call": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"BLOCKING_SUBPROCESS_IN_ASYNC",
|
||||||
|
"Runs a blocking subprocess from async code.",
|
||||||
|
),
|
||||||
|
"subprocess.check_output": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"BLOCKING_SUBPROCESS_IN_ASYNC",
|
||||||
|
"Runs a blocking subprocess from async code.",
|
||||||
|
),
|
||||||
|
"subprocess.Popen": _CallRule(
|
||||||
|
"WARN",
|
||||||
|
"BLOCKING_SUBPROCESS_IN_ASYNC",
|
||||||
|
"Starts a subprocess from async code; review whether it blocks later.",
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def dotted_name(node: ast.AST | None) -> str | None:
|
||||||
|
if isinstance(node, ast.Name):
|
||||||
|
return node.id
|
||||||
|
if isinstance(node, ast.Attribute):
|
||||||
|
parent = dotted_name(node.value)
|
||||||
|
if parent:
|
||||||
|
return f"{parent}.{node.attr}"
|
||||||
|
return node.attr
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def call_receiver_name(node: ast.Call) -> str | None:
|
||||||
|
if not isinstance(node.func, ast.Attribute):
|
||||||
|
return None
|
||||||
|
return dotted_name(node.func.value)
|
||||||
|
|
||||||
|
|
||||||
|
def is_none_node(node: ast.AST | None) -> bool:
|
||||||
|
return isinstance(node, ast.Constant) and node.value is None
|
||||||
|
|
||||||
|
|
||||||
|
class BoundaryVisitor(ast.NodeVisitor):
|
||||||
|
def __init__(self, path: Path, relative_path: str, source_lines: Sequence[str]) -> None:
|
||||||
|
self.path = path
|
||||||
|
self.relative_path = relative_path
|
||||||
|
self.source_lines = source_lines
|
||||||
|
self.findings: list[BoundaryFinding] = []
|
||||||
|
self.function_stack: list[_FunctionContext] = []
|
||||||
|
self.import_aliases: dict[str, str] = {}
|
||||||
|
self.executor_names: set[str] = set()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def current_function(self) -> str:
|
||||||
|
if not self.function_stack:
|
||||||
|
return "<module>"
|
||||||
|
return ".".join(context.name for context in self.function_stack)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def in_async_context(self) -> bool:
|
||||||
|
return bool(self.function_stack and self.function_stack[-1].is_async)
|
||||||
|
|
||||||
|
def visit_Import(self, node: ast.Import) -> None:
|
||||||
|
for alias in node.names:
|
||||||
|
local_name = alias.asname or alias.name.split(".", 1)[0]
|
||||||
|
canonical_name = alias.name if alias.asname else local_name
|
||||||
|
self.import_aliases[local_name] = canonical_name
|
||||||
|
|
||||||
|
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
|
||||||
|
if node.module is None:
|
||||||
|
return
|
||||||
|
for alias in node.names:
|
||||||
|
local_name = alias.asname or alias.name
|
||||||
|
self.import_aliases[local_name] = f"{node.module}.{alias.name}"
|
||||||
|
|
||||||
|
def visit_Assign(self, node: ast.Assign) -> None:
|
||||||
|
self._record_executor_targets(node.value, node.targets)
|
||||||
|
self.generic_visit(node)
|
||||||
|
|
||||||
|
def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
|
||||||
|
if node.value is not None:
|
||||||
|
self._record_executor_targets(node.value, [node.target])
|
||||||
|
self.generic_visit(node)
|
||||||
|
|
||||||
|
def visit_With(self, node: ast.With) -> None:
|
||||||
|
for item in node.items:
|
||||||
|
if item.optional_vars is not None:
|
||||||
|
self._record_executor_targets(item.context_expr, [item.optional_vars])
|
||||||
|
self.generic_visit(node)
|
||||||
|
|
||||||
|
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
||||||
|
self.function_stack.append(_FunctionContext(node.name, is_async=False))
|
||||||
|
self.generic_visit(node)
|
||||||
|
self.function_stack.pop()
|
||||||
|
|
||||||
|
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
|
||||||
|
self.function_stack.append(_FunctionContext(node.name, is_async=True))
|
||||||
|
try:
|
||||||
|
self._check_async_tool_definition(node)
|
||||||
|
self.generic_visit(node)
|
||||||
|
finally:
|
||||||
|
self.function_stack.pop()
|
||||||
|
|
||||||
|
def visit_Call(self, node: ast.Call) -> None:
|
||||||
|
call_name = self._canonical_name(dotted_name(node.func))
|
||||||
|
if call_name:
|
||||||
|
self._check_call(node, call_name)
|
||||||
|
self.generic_visit(node)
|
||||||
|
|
||||||
|
def _check_async_tool_definition(self, node: ast.AsyncFunctionDef) -> None:
|
||||||
|
for decorator in node.decorator_list:
|
||||||
|
decorator_call = decorator.func if isinstance(decorator, ast.Call) else decorator
|
||||||
|
decorator_name = self._canonical_name(dotted_name(decorator_call))
|
||||||
|
if decorator_name in {"langchain.tools.tool", "langchain_core.tools.tool"}:
|
||||||
|
self._emit(
|
||||||
|
node,
|
||||||
|
severity="INFO",
|
||||||
|
category="ASYNC_TOOL_DEFINITION",
|
||||||
|
symbol=decorator_name,
|
||||||
|
message="Defines an async LangChain tool; sync clients need a wrapper before invoke().",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
def _check_call(self, node: ast.Call, call_name: str) -> None:
|
||||||
|
rule = EXACT_CALL_RULES.get(call_name)
|
||||||
|
if rule:
|
||||||
|
self._emit_rule(node, call_name, rule)
|
||||||
|
|
||||||
|
if call_name.endswith(".run_until_complete"):
|
||||||
|
self._emit(
|
||||||
|
node,
|
||||||
|
severity="WARN",
|
||||||
|
category="RUN_UNTIL_COMPLETE",
|
||||||
|
symbol=call_name,
|
||||||
|
message="Drives an event loop from synchronous code; review nested-loop behavior.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if self._is_executor_submit(node, call_name):
|
||||||
|
self._emit(
|
||||||
|
node,
|
||||||
|
severity="INFO",
|
||||||
|
category="EXECUTOR_SUBMIT",
|
||||||
|
symbol=call_name,
|
||||||
|
message="Submits work to an executor; review context propagation and cancellation.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if call_name in ASYNC_TOOL_FACTORY_CALLS:
|
||||||
|
if any(keyword.arg == "coroutine" and not is_none_node(keyword.value) for keyword in node.keywords):
|
||||||
|
self._emit(
|
||||||
|
node,
|
||||||
|
severity="INFO",
|
||||||
|
category="ASYNC_ONLY_TOOL_FACTORY",
|
||||||
|
symbol=call_name,
|
||||||
|
message="Creates a StructuredTool from a coroutine; sync clients need a wrapper.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.in_async_context and call_name in ASYNC_BLOCKING_CALL_RULES:
|
||||||
|
self._emit_rule(node, call_name, ASYNC_BLOCKING_CALL_RULES[call_name])
|
||||||
|
|
||||||
|
if self.in_async_context and self._is_langchain_invoke(node, call_name, method_name="invoke"):
|
||||||
|
self._emit(
|
||||||
|
node,
|
||||||
|
severity="WARN",
|
||||||
|
category="SYNC_INVOKE_IN_ASYNC",
|
||||||
|
symbol=call_name,
|
||||||
|
message="Calls a synchronous invoke() from async code; review event-loop blocking.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if not self.in_async_context and self._is_langchain_invoke(node, call_name, method_name="ainvoke"):
|
||||||
|
self._emit(
|
||||||
|
node,
|
||||||
|
severity="WARN",
|
||||||
|
category="ASYNC_INVOKE_IN_SYNC",
|
||||||
|
symbol=call_name,
|
||||||
|
message="Calls async ainvoke() from sync code; review how the coroutine is awaited.",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _canonical_name(self, name: str | None) -> str | None:
|
||||||
|
if name is None:
|
||||||
|
return None
|
||||||
|
parts = name.split(".")
|
||||||
|
if parts and parts[0] in self.import_aliases:
|
||||||
|
return ".".join((self.import_aliases[parts[0]], *parts[1:]))
|
||||||
|
return name
|
||||||
|
|
||||||
|
def _record_executor_targets(self, value: ast.AST, targets: Sequence[ast.AST]) -> None:
|
||||||
|
if not isinstance(value, ast.Call):
|
||||||
|
return
|
||||||
|
call_name = self._canonical_name(dotted_name(value.func))
|
||||||
|
if call_name not in THREAD_POOL_CONSTRUCTORS:
|
||||||
|
return
|
||||||
|
for target in targets:
|
||||||
|
for name in self._target_names(target):
|
||||||
|
self.executor_names.add(name)
|
||||||
|
|
||||||
|
def _target_names(self, target: ast.AST) -> Iterable[str]:
|
||||||
|
if isinstance(target, ast.Name):
|
||||||
|
yield target.id
|
||||||
|
elif isinstance(target, (ast.Tuple, ast.List)):
|
||||||
|
for element in target.elts:
|
||||||
|
yield from self._target_names(element)
|
||||||
|
|
||||||
|
def _is_executor_submit(self, node: ast.Call, call_name: str) -> bool:
|
||||||
|
if not call_name.endswith(".submit"):
|
||||||
|
return False
|
||||||
|
receiver_name = call_receiver_name(node)
|
||||||
|
return receiver_name in self.executor_names
|
||||||
|
|
||||||
|
def _is_langchain_invoke(self, node: ast.Call, call_name: str, *, method_name: str) -> bool:
|
||||||
|
if not call_name.endswith(f".{method_name}"):
|
||||||
|
return False
|
||||||
|
receiver_name = call_receiver_name(node)
|
||||||
|
if receiver_name is None:
|
||||||
|
return False
|
||||||
|
receiver_leaf = receiver_name.rsplit(".", 1)[-1]
|
||||||
|
return receiver_leaf in LANGCHAIN_INVOKE_RECEIVER_NAMES or receiver_leaf.endswith(LANGCHAIN_INVOKE_RECEIVER_SUFFIXES)
|
||||||
|
|
||||||
|
def _emit_rule(self, node: ast.AST, symbol: str, rule: _CallRule) -> None:
|
||||||
|
self._emit(
|
||||||
|
node,
|
||||||
|
severity=rule.severity,
|
||||||
|
category=rule.category,
|
||||||
|
symbol=symbol,
|
||||||
|
message=rule.message,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _emit(self, node: ast.AST, *, severity: str, category: str, symbol: str, message: str) -> None:
|
||||||
|
line = getattr(node, "lineno", 0)
|
||||||
|
column = getattr(node, "col_offset", 0)
|
||||||
|
code = ""
|
||||||
|
if line > 0 and line <= len(self.source_lines):
|
||||||
|
code = self.source_lines[line - 1].strip()
|
||||||
|
self.findings.append(
|
||||||
|
BoundaryFinding(
|
||||||
|
severity=severity,
|
||||||
|
category=category,
|
||||||
|
path=self.relative_path,
|
||||||
|
line=line,
|
||||||
|
column=column,
|
||||||
|
function=self.current_function,
|
||||||
|
async_context=self.in_async_context,
|
||||||
|
symbol=symbol,
|
||||||
|
message=message,
|
||||||
|
code=code,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def relative_to_repo(path: Path, repo_root: Path = REPO_ROOT) -> str:
|
||||||
|
try:
|
||||||
|
return path.resolve().relative_to(repo_root.resolve()).as_posix()
|
||||||
|
except ValueError:
|
||||||
|
return path.as_posix()
|
||||||
|
|
||||||
|
|
||||||
|
def scan_file(path: Path, *, repo_root: Path = REPO_ROOT) -> list[BoundaryFinding]:
|
||||||
|
source = path.read_text(encoding="utf-8")
|
||||||
|
source_lines = source.splitlines()
|
||||||
|
relative_path = relative_to_repo(path, repo_root)
|
||||||
|
try:
|
||||||
|
tree = ast.parse(source, filename=str(path))
|
||||||
|
except SyntaxError as exc:
|
||||||
|
line = exc.lineno or 0
|
||||||
|
code = source_lines[line - 1].strip() if line > 0 and line <= len(source_lines) else ""
|
||||||
|
return [
|
||||||
|
BoundaryFinding(
|
||||||
|
severity="WARN",
|
||||||
|
category="PARSE_ERROR",
|
||||||
|
path=relative_path,
|
||||||
|
line=line,
|
||||||
|
column=max((exc.offset or 1) - 1, 0),
|
||||||
|
function="<module>",
|
||||||
|
async_context=False,
|
||||||
|
symbol="SyntaxError",
|
||||||
|
message=str(exc),
|
||||||
|
code=code,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
visitor = BoundaryVisitor(path, relative_path, source_lines)
|
||||||
|
visitor.visit(tree)
|
||||||
|
return visitor.findings
|
||||||
|
|
||||||
|
|
||||||
|
def is_ignored_path(path: Path) -> bool:
|
||||||
|
return any(part in IGNORED_DIR_NAMES for part in path.parts)
|
||||||
|
|
||||||
|
|
||||||
|
def iter_python_files(paths: Iterable[Path]) -> Iterable[Path]:
|
||||||
|
for path in paths:
|
||||||
|
if not path.exists() or is_ignored_path(path):
|
||||||
|
continue
|
||||||
|
if path.is_file():
|
||||||
|
if path.suffix == ".py" and not is_ignored_path(path):
|
||||||
|
yield path
|
||||||
|
continue
|
||||||
|
for dirpath, dirnames, filenames in os.walk(path):
|
||||||
|
dirnames[:] = [dirname for dirname in dirnames if dirname not in IGNORED_DIR_NAMES]
|
||||||
|
for filename in filenames:
|
||||||
|
if filename.endswith(".py"):
|
||||||
|
yield Path(dirpath) / filename
|
||||||
|
|
||||||
|
|
||||||
|
def scan_paths(paths: Iterable[Path], *, repo_root: Path = REPO_ROOT) -> list[BoundaryFinding]:
|
||||||
|
findings: list[BoundaryFinding] = []
|
||||||
|
for path in sorted(iter_python_files(paths)):
|
||||||
|
findings.extend(scan_file(path, repo_root=repo_root))
|
||||||
|
return sorted(findings, key=lambda finding: (finding.path, finding.line, finding.column, finding.category))
|
||||||
|
|
||||||
|
|
||||||
|
def filter_findings(findings: Iterable[BoundaryFinding], min_severity: str) -> list[BoundaryFinding]:
|
||||||
|
threshold = SEVERITY_ORDER[min_severity]
|
||||||
|
return [finding for finding in findings if SEVERITY_ORDER[finding.severity] >= threshold]
|
||||||
|
|
||||||
|
|
||||||
|
def format_text(findings: Sequence[BoundaryFinding]) -> str:
|
||||||
|
if not findings:
|
||||||
|
return "No async/thread boundary findings."
|
||||||
|
|
||||||
|
lines: list[str] = []
|
||||||
|
for finding in findings:
|
||||||
|
lines.append(f"{finding.severity} {finding.category} {finding.path}:{finding.line}:{finding.column + 1} in {finding.function} async={str(finding.async_context).lower()}")
|
||||||
|
lines.append(f" symbol: {finding.symbol}")
|
||||||
|
lines.append(f" note: {finding.message}")
|
||||||
|
if finding.code:
|
||||||
|
lines.append(f" code: {finding.code}")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser() -> argparse.ArgumentParser:
|
||||||
|
parser = argparse.ArgumentParser(description=("Detect async/thread boundary points for developer review. Findings are an inventory, not automatic bug decisions."))
|
||||||
|
parser.add_argument(
|
||||||
|
"paths",
|
||||||
|
nargs="*",
|
||||||
|
type=Path,
|
||||||
|
help="Files or directories to scan. Defaults to backend app and harness sources.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--format",
|
||||||
|
choices=("text", "json"),
|
||||||
|
default="text",
|
||||||
|
help="Output format.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--min-severity",
|
||||||
|
choices=tuple(SEVERITY_ORDER),
|
||||||
|
default="INFO",
|
||||||
|
help="Only show findings at or above this severity.",
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: Sequence[str] | None = None) -> int:
|
||||||
|
parser = build_parser()
|
||||||
|
args = parser.parse_args(argv)
|
||||||
|
paths = args.paths or list(DEFAULT_SCAN_PATHS)
|
||||||
|
findings = filter_findings(scan_paths(paths), args.min_severity)
|
||||||
|
|
||||||
|
if args.format == "json":
|
||||||
|
print(json.dumps([finding.to_dict() for finding in findings], indent=2, sort_keys=True))
|
||||||
|
else:
|
||||||
|
print(format_text(findings))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
@@ -0,0 +1,182 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import textwrap
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from support.detectors import thread_boundaries as detector
|
||||||
|
|
||||||
|
|
||||||
|
def _write_python(path: Path, source: str) -> Path:
|
||||||
|
path.write_text(textwrap.dedent(source).strip() + "\n", encoding="utf-8")
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_file_detects_async_thread_and_tool_boundaries(tmp_path):
|
||||||
|
source_file = _write_python(
|
||||||
|
tmp_path / "sample.py",
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from langchain.tools import tool
|
||||||
|
from langchain_core.tools import StructuredTool
|
||||||
|
|
||||||
|
@tool
|
||||||
|
async def async_tool(value: int) -> str:
|
||||||
|
return str(value)
|
||||||
|
|
||||||
|
async def handler(model):
|
||||||
|
await asyncio.to_thread(str, "x")
|
||||||
|
model.invoke("blocking")
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
def sync_entry():
|
||||||
|
asyncio.run(handler(None))
|
||||||
|
pool = ThreadPoolExecutor(max_workers=1)
|
||||||
|
pool.submit(str, "x")
|
||||||
|
threading.Thread(target=sync_entry).start()
|
||||||
|
return StructuredTool.from_function(
|
||||||
|
name="factory_tool",
|
||||||
|
description="factory",
|
||||||
|
coroutine=async_tool,
|
||||||
|
)
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
findings = detector.scan_file(source_file, repo_root=tmp_path)
|
||||||
|
categories = {finding.category for finding in findings}
|
||||||
|
async_tool_finding = next(finding for finding in findings if finding.category == "ASYNC_TOOL_DEFINITION")
|
||||||
|
|
||||||
|
assert "ASYNC_TOOL_DEFINITION" in categories
|
||||||
|
assert async_tool_finding.function == "async_tool"
|
||||||
|
assert async_tool_finding.async_context is True
|
||||||
|
assert "ASYNC_THREAD_OFFLOAD" in categories
|
||||||
|
assert "SYNC_INVOKE_IN_ASYNC" in categories
|
||||||
|
assert "BLOCKING_CALL_IN_ASYNC" in categories
|
||||||
|
assert "SYNC_ASYNC_BRIDGE" in categories
|
||||||
|
assert "THREAD_POOL" in categories
|
||||||
|
assert "EXECUTOR_SUBMIT" in categories
|
||||||
|
assert "RAW_THREAD" in categories
|
||||||
|
assert "ASYNC_ONLY_TOOL_FACTORY" in categories
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_file_ignores_unqualified_threads_and_generic_method_names(tmp_path):
|
||||||
|
source_file = _write_python(
|
||||||
|
tmp_path / "sample.py",
|
||||||
|
"""
|
||||||
|
class Thread:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Timer:
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def handler(form, runner):
|
||||||
|
form.submit()
|
||||||
|
runner.invoke("not a langchain model")
|
||||||
|
|
||||||
|
def sync_entry(runner):
|
||||||
|
Thread()
|
||||||
|
Timer()
|
||||||
|
runner.ainvoke("not a langchain model")
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
findings = detector.scan_file(source_file, repo_root=tmp_path)
|
||||||
|
categories = {finding.category for finding in findings}
|
||||||
|
|
||||||
|
assert "RAW_THREAD" not in categories
|
||||||
|
assert "RAW_TIMER_THREAD" not in categories
|
||||||
|
assert "EXECUTOR_SUBMIT" not in categories
|
||||||
|
assert "SYNC_INVOKE_IN_ASYNC" not in categories
|
||||||
|
assert "ASYNC_INVOKE_IN_SYNC" not in categories
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_file_uses_import_evidence_for_thread_and_executor_aliases(tmp_path):
|
||||||
|
source_file = _write_python(
|
||||||
|
tmp_path / "sample.py",
|
||||||
|
"""
|
||||||
|
from concurrent.futures import ThreadPoolExecutor as Pool
|
||||||
|
from threading import Thread as WorkerThread, Timer
|
||||||
|
|
||||||
|
def sync_entry():
|
||||||
|
pool = Pool(max_workers=1)
|
||||||
|
pool.submit(str, "x")
|
||||||
|
WorkerThread(target=sync_entry).start()
|
||||||
|
Timer(1, sync_entry).start()
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
findings = detector.scan_file(source_file, repo_root=tmp_path)
|
||||||
|
categories = {finding.category for finding in findings}
|
||||||
|
|
||||||
|
assert "THREAD_POOL" in categories
|
||||||
|
assert "EXECUTOR_SUBMIT" in categories
|
||||||
|
assert "RAW_THREAD" in categories
|
||||||
|
assert "RAW_TIMER_THREAD" in categories
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_paths_ignores_virtualenv_like_directories(tmp_path):
|
||||||
|
scanned_file = _write_python(
|
||||||
|
tmp_path / "app.py",
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
def main():
|
||||||
|
return asyncio.run(asyncio.sleep(0))
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
ignored_dir = tmp_path / ".venv"
|
||||||
|
ignored_dir.mkdir()
|
||||||
|
_write_python(
|
||||||
|
ignored_dir / "ignored.py",
|
||||||
|
"""
|
||||||
|
import threading
|
||||||
|
|
||||||
|
thread = threading.Thread(target=lambda: None)
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
findings = detector.scan_paths([tmp_path], repo_root=tmp_path)
|
||||||
|
|
||||||
|
assert any(finding.path == scanned_file.name for finding in findings)
|
||||||
|
assert all(".venv" not in finding.path for finding in findings)
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_output_and_min_severity_filter(tmp_path, capsys):
|
||||||
|
source_file = _write_python(
|
||||||
|
tmp_path / "sample.py",
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
async def handler(model):
|
||||||
|
await asyncio.to_thread(str, "x")
|
||||||
|
model.invoke("blocking")
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
exit_code = detector.main(["--format", "json", "--min-severity", "WARN", str(source_file)])
|
||||||
|
|
||||||
|
assert exit_code == 0
|
||||||
|
payload = json.loads(capsys.readouterr().out)
|
||||||
|
categories = {finding["category"] for finding in payload}
|
||||||
|
assert categories == {"SYNC_INVOKE_IN_ASYNC"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_errors_are_reported_as_findings(tmp_path):
|
||||||
|
source_file = _write_python(
|
||||||
|
tmp_path / "broken.py",
|
||||||
|
"""
|
||||||
|
def broken(:
|
||||||
|
pass
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
findings = detector.scan_file(source_file, repo_root=tmp_path)
|
||||||
|
|
||||||
|
assert len(findings) == 1
|
||||||
|
assert findings[0].category == "PARSE_ERROR"
|
||||||
|
assert findings[0].severity == "WARN"
|
||||||
|
assert findings[0].column == 11
|
||||||
|
assert f"{source_file.name}:1:12" in detector.format_text(findings)
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""CLI wrapper for the async/thread boundary detector."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from collections.abc import Sequence
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||||
|
TEST_SUPPORT_PATH = REPO_ROOT / "backend" / "tests"
|
||||||
|
if str(TEST_SUPPORT_PATH) not in sys.path:
|
||||||
|
sys.path.insert(0, str(TEST_SUPPORT_PATH))
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: Sequence[str] | None = None) -> int:
|
||||||
|
from support.detectors.thread_boundaries import main as detector_main
|
||||||
|
|
||||||
|
return detector_main(argv)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
Reference in New Issue
Block a user