fix(middleware): add per-tool-type frequency detection to LoopDetectionMiddleware (#1988)

* fix(middleware): add per-tool-type frequency detection to LoopDetectionMiddleware

The existing hash-based loop detection only catches identical tool call
sets. When the agent calls the same tool type (e.g. read_file) on many
different files, each call produces a unique hash and bypasses detection.
This causes the agent to exhaust recursion_limit, consuming 150K-225K
tokens per failed run.

Add a second detection layer that tracks cumulative call counts per tool
type per thread. Warns at 30 calls (configurable) and forces stop at 50.
The hard stop message now uses the actual returned message instead of a
hardcoded constant, so both hash-based and frequency-based stops produce
accurate diagnostics.

Also fix _apply() to use the warning message returned by
_track_and_check() for hard stops, instead of always using _HARD_STOP_MSG.

Closes #1987

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix(lint): remove unused imports and fix line length

- Remove unused _TOOL_FREQ_HARD_STOP_MSG and _TOOL_FREQ_WARNING_MSG
  imports from test file (F401)
- Break long _TOOL_FREQ_WARNING_MSG string to fit within 240 char limit (E501)

* style: apply ruff format

* test: add LRU eviction and per-thread reset coverage for frequency state

Address review feedback from @WillemJiang:
- Verify _tool_freq and _tool_freq_warned are cleaned on LRU eviction
- Add test for reset(thread_id=...) clearing only the target thread's
  frequency state while leaving others intact

* fix(makefile): route Windows shell-script targets through Git Bash (#2060)

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Asish Kumar <87874775+officialasishkumar@users.noreply.github.com>
This commit is contained in:
ZHANG Ning
2026-04-11 17:33:27 +08:00
committed by GitHub
parent 02569136df
commit 5b633449f8
2 changed files with 256 additions and 3 deletions
@@ -280,6 +280,8 @@ class TestLoopDetection:
mw._apply(_make_state(tool_calls=call), runtime_new)
assert "thread-0" not in mw._history
assert "thread-0" not in mw._tool_freq
assert "thread-0" not in mw._tool_freq_warned
assert "thread-new" in mw._history
assert len(mw._history) == 3
@@ -410,3 +412,188 @@ class TestHardStopWithListContent:
assert isinstance(msg.content, str)
assert msg.content.startswith("thinking...")
assert _HARD_STOP_MSG in msg.content
class TestToolFrequencyDetection:
"""Tests for per-tool-type frequency detection (Layer 2).
This catches the case where an agent calls the same tool type many times
with *different* arguments (e.g. read_file on 40 different files), which
bypasses hash-based detection.
"""
def _read_call(self, path):
return {"name": "read_file", "id": f"call_read_{path}", "args": {"path": path}}
def test_below_freq_warn_returns_none(self):
mw = LoopDetectionMiddleware(tool_freq_warn=5, tool_freq_hard_limit=10)
runtime = _make_runtime()
for i in range(4):
result = mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
assert result is None
def test_freq_warn_at_threshold(self):
mw = LoopDetectionMiddleware(tool_freq_warn=5, tool_freq_hard_limit=10)
runtime = _make_runtime()
for i in range(4):
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
# 5th call to read_file (different file each time) triggers freq warning
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_4.py")]), runtime)
assert result is not None
msg = result["messages"][0]
assert isinstance(msg, HumanMessage)
assert "read_file" in msg.content
assert "LOOP DETECTED" in msg.content
def test_freq_warn_only_injected_once(self):
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
runtime = _make_runtime()
for i in range(2):
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
# 3rd triggers warning
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_2.py")]), runtime)
assert result is not None
assert "LOOP DETECTED" in result["messages"][0].content
# 4th should not re-warn (already warned for read_file)
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_3.py")]), runtime)
assert result is None
def test_freq_hard_stop_at_limit(self):
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=6)
runtime = _make_runtime()
for i in range(5):
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
# 6th call triggers hard stop
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_5.py")]), runtime)
assert result is not None
msg = result["messages"][0]
assert isinstance(msg, AIMessage)
assert msg.tool_calls == []
assert "FORCED STOP" in msg.content
assert "read_file" in msg.content
def test_different_tools_tracked_independently(self):
"""read_file and bash should have independent frequency counters."""
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
runtime = _make_runtime()
# 2 read_file calls
for i in range(2):
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
# 2 bash calls — should not trigger (bash count = 2, read_file count = 2)
for i in range(2):
result = mw._apply(_make_state(tool_calls=[_bash_call(f"cmd_{i}")]), runtime)
assert result is None
# 3rd read_file triggers (read_file count = 3)
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_2.py")]), runtime)
assert result is not None
assert "read_file" in result["messages"][0].content
def test_freq_reset_clears_state(self):
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
runtime = _make_runtime()
for i in range(2):
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
mw.reset()
# After reset, count restarts — should not trigger
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_new.py")]), runtime)
assert result is None
def test_freq_reset_per_thread_clears_only_target(self):
"""reset(thread_id=...) should clear frequency state for that thread only."""
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
runtime_a = _make_runtime("thread-A")
runtime_b = _make_runtime("thread-B")
# 2 calls on each thread
for i in range(2):
mw._apply(_make_state(tool_calls=[self._read_call(f"/a_{i}.py")]), runtime_a)
mw._apply(_make_state(tool_calls=[self._read_call(f"/b_{i}.py")]), runtime_b)
# Reset only thread-A
mw.reset(thread_id="thread-A")
assert "thread-A" not in mw._tool_freq
assert "thread-A" not in mw._tool_freq_warned
# thread-B state should still be intact — 3rd call triggers warn
result = mw._apply(_make_state(tool_calls=[self._read_call("/b_2.py")]), runtime_b)
assert result is not None
assert "LOOP DETECTED" in result["messages"][0].content
# thread-A restarted from 0 — should not trigger
result = mw._apply(_make_state(tool_calls=[self._read_call("/a_new.py")]), runtime_a)
assert result is None
def test_freq_per_thread_isolation(self):
"""Frequency counts should be independent per thread."""
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
runtime_a = _make_runtime("thread-A")
runtime_b = _make_runtime("thread-B")
# 2 calls on thread A
for i in range(2):
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime_a)
# 2 calls on thread B — should NOT push thread A over threshold
for i in range(2):
mw._apply(_make_state(tool_calls=[self._read_call(f"/other_{i}.py")]), runtime_b)
# 3rd call on thread A — triggers (count=3 for thread A only)
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_2.py")]), runtime_a)
assert result is not None
assert "LOOP DETECTED" in result["messages"][0].content
def test_multi_tool_single_response_counted(self):
"""When a single response has multiple tool calls, each is counted."""
mw = LoopDetectionMiddleware(tool_freq_warn=5, tool_freq_hard_limit=10)
runtime = _make_runtime()
# Response 1: 2 read_file calls → count = 2
call = [self._read_call("/a.py"), self._read_call("/b.py")]
result = mw._apply(_make_state(tool_calls=call), runtime)
assert result is None
# Response 2: 2 more → count = 4
call = [self._read_call("/c.py"), self._read_call("/d.py")]
result = mw._apply(_make_state(tool_calls=call), runtime)
assert result is None
# Response 3: 1 more → count = 5 → triggers warn
result = mw._apply(_make_state(tool_calls=[self._read_call("/e.py")]), runtime)
assert result is not None
assert "read_file" in result["messages"][0].content
def test_hash_detection_takes_priority(self):
"""Hash-based hard stop fires before frequency check for identical calls."""
mw = LoopDetectionMiddleware(
warn_threshold=2,
hard_limit=3,
tool_freq_warn=100,
tool_freq_hard_limit=200,
)
runtime = _make_runtime()
call = [self._read_call("/same_file.py")]
for _ in range(2):
mw._apply(_make_state(tool_calls=call), runtime)
# 3rd identical call → hash hard_limit=3 fires (not freq)
result = mw._apply(_make_state(tool_calls=call), runtime)
assert result is not None
msg = result["messages"][0]
assert isinstance(msg, AIMessage)
assert _HARD_STOP_MSG in msg.content