mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-26 18:06:00 +00:00
da41701f87
* feat(detectors): add static blocking IO inventory * refactor(detectors): drop superseded runtime probe; clarify static report path - Remove the #2924 custom runtime blocking IO probe entirely: backend/tests/support/detectors/blocking_io.py, backend/tests/test_blocking_io_detector.py, backend/tests/test_blocking_io_probe_integration.py, and the pytest_addoption / pytest_runtest_call / pytest_runtest_teardown / pytest_sessionfinish / pytest_terminal_summary hooks plus the blocking_io_detector fixture from backend/tests/conftest.py. Its narrow DEFAULT_BLOCKING_CALL_SPECS (time.sleep, requests, httpx, os.walk, Path.resolve, Path.read_text, Path.write_text) cannot serve as a CI gate; a Blockbuster-backed runtime detector will land in a separate follow-up PR. Leaving the half-coverage probe alongside the static inventory in this PR added a redundant detect path with no production value. - Address Copilot review comments on backend/README.md and backend/CLAUDE.md by stating explicitly that the JSON report writes to .deer-flow/blocking-io-findings.json at the repository root, whether the target is invoked from the repo root or from backend/. Verified: pytest tests/test_detect_blocking_io_static.py (18 passed), ruff check + format on touched files (passed), make detect-blocking-io from both repo root and backend/ produce the same 105-finding report at <repo-root>/.deer-flow/blocking-io-findings.json. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
422 lines
13 KiB
Python
422 lines
13 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import textwrap
|
|
from pathlib import Path
|
|
|
|
from support.detectors import blocking_io_static as detector
|
|
|
|
|
|
def _write_python(path: Path, source: str) -> Path:
    """Write a dedented source snippet to *path* and return *path*.

    The snippet is dedented, stripped of surrounding whitespace, and
    terminated with exactly one newline before being written as UTF-8.
    """
    normalized = textwrap.dedent(source).strip()
    path.write_text(f"{normalized}\n", encoding="utf-8")
    return path
|
|
|
|
|
|
def _payload(path: Path, repo_root: Path) -> list[dict[str, object]]:
    """Scan *path* with the detector and return every finding as a dict."""
    records = []
    for finding in detector.scan_file(path, repo_root=repo_root):
        records.append(finding.to_dict())
    return records
|
|
|
|
|
|
def test_scan_file_detects_direct_blocking_calls_in_async_code(tmp_path: Path) -> None:
    """Blocking calls made directly inside an async function are reported as DIRECT_ASYNC."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import subprocess
        import time
        import urllib.request
        from pathlib import Path

        async def handler(path: Path):
            time.sleep(1)
            subprocess.run(["echo", "ok"])
            path.read_text(encoding="utf-8")
            with open(path, encoding="utf-8") as handle:
                return urllib.request.urlopen(handle.read())
        """,
    )

    records = _payload(sample, tmp_path)
    calls = [record["blocking_call"] for record in records]
    expected_categories = {
        "BLOCKING_FILE_IO",
        "BLOCKING_HTTP_IO",
        "BLOCKING_SLEEP",
        "BLOCKING_SUBPROCESS",
    }
    expected_symbols = {"time.sleep", "subprocess.run", "path.read_text", "open", "urllib.request.urlopen"}

    assert {call["category"] for call in calls} == expected_categories
    # Additional symbols may be reported; only the listed ones are required.
    assert expected_symbols <= {call["symbol"] for call in calls}
    assert {record["event_loop_exposure"] for record in records} == {"DIRECT_ASYNC"}
|
|
|
|
|
|
def test_scan_file_detects_blocking_calls_in_sync_helper_reached_from_async_code(tmp_path: Path) -> None:
    """A blocking call in a sync helper called from async code is attributed to the helper."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        from pathlib import Path

        def load_payload(path: Path) -> bytes:
            return path.read_bytes()

        async def route(path: Path) -> bytes:
            return load_payload(path)
        """,
    )

    records = _payload(sample, tmp_path)

    assert len(records) == 1
    record = records[0]
    assert record["blocking_call"]["category"] == "BLOCKING_FILE_IO"
    assert record["location"]["function"] == "load_payload"
    assert record["event_loop_exposure"] == "ASYNC_REACHABLE_SAME_FILE"
    assert record["blocking_call"]["symbol"] == "path.read_bytes"
|
|
|
|
|
|
def test_scan_file_omits_sync_only_blocking_calls_from_default_results(tmp_path: Path) -> None:
    """A blocking call in a sync function with no async caller yields no findings."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        from pathlib import Path

        def load_payload(path: Path) -> str:
            return path.read_text()
        """,
    )

    results = detector.scan_file(sample, repo_root=tmp_path)

    assert results == []
|
|
|
|
|
|
def test_scan_file_detects_self_helper_reached_from_async_method(tmp_path: Path) -> None:
    """A sync method invoked via ``self`` from an async method is reported under its qualified name."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        class ArtifactRouter:
            def read_payload(self, path):
                return path.read_text(encoding="utf-8")

            async def get(self, path):
                return self.read_payload(path)
        """,
    )

    records = _payload(sample, tmp_path)

    assert len(records) == 1
    record = records[0]
    assert record["location"]["function"] == "ArtifactRouter.read_payload"
    assert record["event_loop_exposure"] == "ASYNC_REACHABLE_SAME_FILE"
|
|
|
|
|
|
def test_json_output_uses_concise_review_record_schema(tmp_path: Path, capsys) -> None:
    """``--format json`` prints exactly the expected record shape and omits legacy keys."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import subprocess

        async def handler():
            subprocess.run(["echo", "ok"])
        """,
    )
    expected_record = {
        "priority": "HIGH",
        "location": {
            "path": str(sample),
            "line": 4,
            "column": 5,
            "function": "handler",
        },
        "blocking_call": {
            "category": "BLOCKING_SUBPROCESS",
            "operation": "SUBPROCESS",
            "symbol": "subprocess.run",
        },
        "event_loop_exposure": "DIRECT_ASYNC",
        "reason": "SUBPROCESS is called directly inside an async function.",
        "code": 'subprocess.run(["echo", "ok"])',
    }

    status = detector.main(["--format", "json", str(sample)])

    assert status == 0
    payload = json.loads(capsys.readouterr().out)
    assert payload == [expected_record]
    # These keys must not appear in the emitted record.
    for absent_key in ("confidence", "severity", "event_loop_risk"):
        assert absent_key not in payload[0]
|
|
|
|
|
|
def test_summary_output_writes_json_report(tmp_path: Path, capsys) -> None:
    """``--output`` prints a text summary and writes the findings to the given JSON file."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import subprocess

        async def handler():
            subprocess.run(["echo", "ok"])
        """,
    )
    report_path = tmp_path / "reports" / "blocking-io.json"

    status = detector.main(["--output", str(report_path), str(sample)])

    assert status == 0
    printed = capsys.readouterr().out
    for fragment in (
        "Static blocking IO event-loop risk findings: 1",
        "By category:",
        "BLOCKING_SUBPROCESS",
        "Full JSON report:",
    ):
        assert fragment in printed
    report = json.loads(report_path.read_text(encoding="utf-8"))
    categories = [record["blocking_call"]["category"] for record in report]
    assert categories == ["BLOCKING_SUBPROCESS"]
|
|
|
|
|
|
def test_json_output_ranks_operations_without_confidence_noise(tmp_path: Path, capsys) -> None:
    """Each symbol gets the expected operation and priority, and no record has a ``confidence`` key."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import shutil

        async def handler(path):
            path.exists()
            path.read_text()
            shutil.rmtree(path)
        """,
    )
    # symbol -> (operation, priority)
    expected = {
        "path.exists": ("FILE_METADATA", "LOW"),
        "path.read_text": ("FILE_READ", "MEDIUM"),
        "shutil.rmtree": ("FILE_TREE_DELETE", "HIGH"),
    }

    status = detector.main(["--format", "json", str(sample)])

    assert status == 0
    payload = json.loads(capsys.readouterr().out)
    by_symbol = {record["blocking_call"]["symbol"]: record for record in payload}
    for symbol, (operation, priority) in expected.items():
        record = by_symbol[symbol]
        assert record["blocking_call"]["operation"] == operation
        assert record["priority"] == priority
    assert {record["event_loop_exposure"] for record in payload} == {"DIRECT_ASYNC"}
    assert not any("confidence" in record for record in payload)
|
|
|
|
|
|
def test_path_receiver_detection_uses_path_annotations(tmp_path: Path) -> None:
    """``read_text`` is detected on a ``Path``-annotated parameter and on a constructed ``Path``."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        from pathlib import Path

        async def typed(path: Path):
            return path.read_text()

        async def constructed():
            return Path("payload.txt").read_text()
        """,
    )

    records = _payload(sample, tmp_path)
    symbols = {record["blocking_call"]["symbol"] for record in records}
    priorities = {record["priority"] for record in records}

    assert symbols == {"path.read_text", "pathlib.Path.read_text"}
    assert priorities == {"MEDIUM"}
|
|
|
|
|
|
def test_summary_groups_findings_by_priority_and_operation(tmp_path: Path, capsys) -> None:
    """The default text summary contains priority, operation, and event-loop exposure groupings."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import os
        from pathlib import Path

        def load_payload(path: Path) -> str:
            return path.read_text()

        async def handler(path: Path) -> str:
            path.exists()
            list(os.walk(path))
            return load_payload(path)
        """,
    )

    status = detector.main([str(sample)])

    assert status == 0
    printed = capsys.readouterr().out
    expected_fragments = (
        "By priority:",
        "HIGH",
        "MEDIUM",
        "By operation:",
        "FILE_ENUMERATION",
        "FILE_METADATA",
        "FILE_READ",
        "By event-loop exposure:",
        "DIRECT_ASYNC",
        "ASYNC_REACHABLE_SAME_FILE",
    )
    for fragment in expected_fragments:
        assert fragment in printed
|
|
|
|
|
|
def test_source_code_snippet_is_truncated_for_json_output(tmp_path: Path) -> None:
    """A very long source line yields a ``code`` field of at most 203 characters ending in ``...``."""
    # Eighty concatenated string literals make the statement long.
    long_suffix = " + ".join(['"chunk"'] * 80)
    sample = _write_python(
        tmp_path / "sample.py",
        f"""
        async def handler(path):
            return path.read_text() + {long_suffix}
        """,
    )

    records = _payload(sample, tmp_path)

    assert len(records) == 1
    snippet = records[0]["code"]
    assert len(snippet) <= 203
    assert snippet.endswith("...")
|
|
|
|
|
|
def test_cli_default_filters_sync_only_inventory_items(tmp_path: Path, capsys) -> None:
    """The CLI JSON output is an empty list when the only blocking call is in sync-only code."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        from pathlib import Path

        def load_payload(path: Path) -> str:
            return path.read_text()
        """,
    )

    status = detector.main(["--format", "json", str(sample)])

    assert status == 0
    printed = capsys.readouterr().out
    assert json.loads(printed) == []
|
|
|
|
|
|
def test_sync_only_agent_middleware_hook_gets_event_loop_exposure(tmp_path: Path) -> None:
    """A helper reached from a sync-only ``AgentMiddleware`` hook is reported as SYNC_AGENT_MIDDLEWARE_HOOK."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        from langchain.agents.middleware import AgentMiddleware
        from pathlib import Path

        class UploadsMiddleware(AgentMiddleware):
            def before_agent(self, state, runtime):
                return self._load(Path("uploads"))

            def _load(self, path: Path) -> str:
                return path.read_text()
        """,
    )

    records = _payload(sample, tmp_path)

    assert len(records) == 1
    record = records[0]
    assert record["location"]["function"] == "UploadsMiddleware._load"
    assert record["event_loop_exposure"] == "SYNC_AGENT_MIDDLEWARE_HOOK"
    assert "statically reachable from a sync AgentMiddleware hook" in record["reason"]
|
|
|
|
|
|
def test_sync_agent_middleware_hook_with_async_counterpart_is_not_reported(tmp_path: Path) -> None:
    """A sync hook's blocking call yields no findings when the class also defines the ``a``-prefixed async hook."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        from langchain.agents.middleware import AgentMiddleware
        from pathlib import Path

        class UploadsMiddleware(AgentMiddleware):
            def before_agent(self, state, runtime):
                return Path("uploads").read_text()

            async def abefore_agent(self, state, runtime):
                return None
        """,
    )

    results = detector.scan_file(sample, repo_root=tmp_path)

    assert results == []
|
|
|
|
|
|
def test_scan_file_detects_sync_httpx_client_methods_in_async_code(tmp_path: Path) -> None:
    """A ``post`` call on an ``httpx.Client`` bound via ``with`` is reported as ``httpx.Client.post``."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import httpx

        async def search() -> str:
            with httpx.Client(timeout=30) as client:
                return client.post("https://example.invalid").text
        """,
    )

    records = _payload(sample, tmp_path)

    assert len(records) == 1
    record = records[0]
    assert record["blocking_call"]["category"] == "BLOCKING_HTTP_IO"
    assert record["location"]["function"] == "search"
    assert record["event_loop_exposure"] == "DIRECT_ASYNC"
    assert record["blocking_call"]["symbol"] == "httpx.Client.post"
|
|
|
|
|
|
def test_scan_file_detects_chained_sync_http_client_methods_in_async_code(tmp_path: Path) -> None:
    """HTTP methods chained directly onto a freshly constructed sync client or session are detected."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import httpx
        import requests

        async def fetch() -> tuple[object, object]:
            return (
                httpx.Client().get("https://example.invalid"),
                requests.Session().post("https://example.invalid"),
            )
        """,
    )

    records = _payload(sample, tmp_path)
    calls = [record["blocking_call"] for record in records]

    assert {call["symbol"] for call in calls} == {"httpx.Client.get", "requests.Session.post"}
    assert {call["category"] for call in calls} == {"BLOCKING_HTTP_IO"}
|
|
|
|
|
|
def test_scan_file_detects_os_walk_and_path_resolve_in_async_code(tmp_path: Path) -> None:
    """``Path.resolve`` and ``os.walk`` inside an async function are both reported as file IO."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        import os
        from pathlib import Path

        async def inspect_tree(path: Path) -> list[str]:
            root = path.resolve()
            return [name for _, _, names in os.walk(root) for name in names]
        """,
    )

    records = _payload(sample, tmp_path)
    calls = [record["blocking_call"] for record in records]

    assert {call["symbol"] for call in calls} == {"path.resolve", "os.walk"}
    assert {call["category"] for call in calls} == {"BLOCKING_FILE_IO"}
|
|
|
|
|
|
def test_scan_file_does_not_treat_string_replace_as_file_io(tmp_path: Path) -> None:
    """``.replace`` on ``str``-annotated values produces no findings."""
    sample = _write_python(
        tmp_path / "sample.py",
        """
        def _path_variants(path: str) -> set[str]:
            return {path, path.replace("\\\\", "/"), path.replace("/", "\\\\")}

        async def normalize(text: str) -> str:
            return text.replace("a", "b")
        """,
    )

    results = detector.scan_file(sample, repo_root=tmp_path)

    assert results == []
|
|
|
|
|
|
def test_parse_errors_are_reported_as_findings(tmp_path: Path) -> None:
    """A file with a syntax error yields one MEDIUM ``PARSE_ERROR`` finding located at 1:18."""
    broken = _write_python(
        tmp_path / "broken.py",
        """
        async def broken(:
            pass
        """,
    )

    records = _payload(broken, tmp_path)

    assert len(records) == 1
    record = records[0]
    assert record["blocking_call"]["category"] == "PARSE_ERROR"
    assert record["priority"] == "MEDIUM"
    # The text report must point at the offending ":" (line 1, column 18).
    rendered = detector.format_text(detector.scan_file(broken, repo_root=tmp_path))
    assert f"{broken.name}:1:18" in rendered
|