mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-12 02:15:58 +00:00
feat(community): add SearXNG and Browserless web search/fetch tools (#3451)
* feat(community): add SearXNG and Browserless web search/fetch tools
  - SearXNG web_search: privacy-focused meta search engine integration with configurable base_url via config.yaml tool settings
  - Browserless web_fetch: headless browser page fetching with readability article extraction
  - Both tools are fully configurable through tool config section
  - No external API keys required for basic operation
* fix: address PR review feedback and add unit tests
  - Guard config.model_extra against None values (review #1, #2)
  - Coerce max_results to int when reading from config (review #2)
  - Fix web_fetch_tool to use direct HTTP fetch instead of reusing the web_search client config (review #3)
  - Fix misleading docstring for SearxngClient.fetch (review #4)
  - Remove unused target_url variable to pass Ruff lint (review #5)
  - Normalize bool config values with _normalize_bool helper to handle env-resolved string values correctly (review #6)
  - Add unit tests for both SearXNG and Browserless client classes and their tool functions with mocked httpx (review #7, #8)
* fix: convert to async httpx to avoid blocking I/O on event loop
  - Replace httpx.Client with httpx.AsyncClient in both client classes
  - Convert tool functions to async def
  - Wrap readability_extractor calls in asyncio.to_thread()
  - Update all tests to use pytest.mark.asyncio and async mocks
  - Fix import sorting to pass Ruff lint
* fix(browserless): replace deprecated waitUntil with waitForEvent
  The Browserless API has deprecated the waitUntil parameter. Replace with waitForEvent which accepts values like 'networkidle'. Default is empty (no wait), configurable via config.yaml.
* fix(browserless): remove deprecated gotoTimeout and bestAttempt params
  The Browserless /content API does not accept gotoTimeout or bestAttempt as top-level payload keys. These were being sent unconditionally, causing 400 Bad Request errors on current Browserless versions.
  Changes:
  - Remove goto_timeout_ms parameter and 'gotoTimeout' from payload
  - Remove best_attempt parameter and 'bestAttempt' from payload
  - Remove _normalize_bool helper (no longer needed)
  - Remove goto_timeout_ms and best_attempt config reading in tools.py
  - Add tests for waitForSelector and reject params
  - Verify no deprecated params are sent in test_fetch_html_success
* refactor(searxng): remove web_fetch_tool, decouple from web_search config
  SearXNG is a search engine — it should only provide web_search_tool. The web_fetch responsibility belongs to Browserless (headless Chrome) or Jina AI, not SearXNG.
  Changes:
  - Remove web_fetch_tool from SearXNG tools.py and __init__.py
  - Remove SearxngClient.fetch() method (no longer needed)
  - Remove unused asyncio/readability imports from SearXNG tools.py
  - Add test for max_results string-to-int coercion from config
  - Add test for search with categories parameter
  - Add test for httpx.RequestError handling
  - Apply ruff format fixes to browserless_client.py and test files
This commit is contained in:
@@ -0,0 +1,187 @@
|
||||
"""Tests for Browserless community tools."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deerflow.community.browserless import tools
|
||||
from deerflow.community.browserless.browserless_client import BrowserlessClient
|
||||
|
||||
|
||||
class AsyncMock(MagicMock):
    """``MagicMock`` variant whose call has to be awaited.

    Calling an instance produces a coroutine. Awaiting that coroutine runs
    the regular ``MagicMock`` call, so ``return_value``, ``side_effect`` and
    ``call_args`` behave as they do on a plain ``MagicMock``.
    """

    async def __call__(self, *positional, **keyword):
        # Delegate to MagicMock so the call is recorded and the configured
        # return value / side effect is applied at await time.
        outcome = super().__call__(*positional, **keyword)
        return outcome
|
||||
|
||||
|
||||
@pytest.mark.asyncio
class TestBrowserlessClient:
    """Tests for the BrowserlessClient class.

    Each test patches ``httpx.AsyncClient`` as reached through the
    ``browserless_client`` module and inspects either the string returned by
    ``fetch_html`` or the JSON payload passed to the mocked ``post``.
    """

    # Dotted path handed to ``patch`` by every test in this class.
    _ASYNC_CLIENT_TARGET = "deerflow.community.browserless.browserless_client.httpx.AsyncClient"
    # Base URL used to construct the client under test.
    _BASE_URL = "http://browserless:3000"

    @staticmethod
    def _response(status_code, text):
        """Build a mocked HTTP response with the given status and body text."""
        mock_resp = MagicMock()
        mock_resp.status_code = status_code
        mock_resp.text = text
        mock_resp.headers = {}
        return mock_resp

    @staticmethod
    def _wire_post(mock_cls, **post_kwargs):
        """Install a mocked async-context client on *mock_cls* and return it.

        ``post_kwargs`` are forwarded to ``AsyncMock`` (for example
        ``return_value=...`` or ``side_effect=...``) to configure ``post``.
        """
        mock_ctx = MagicMock()
        # ``async with httpx.AsyncClient(...) as c`` yields ``mock_ctx``.
        mock_cls.return_value.__aenter__.return_value = mock_ctx
        mock_ctx.post = AsyncMock(**post_kwargs)
        return mock_ctx

    async def test_fetch_html_success(self):
        """fetch_html returns HTML content on success."""
        html = "<html><body>Page content</body></html>"
        with patch(self._ASYNC_CLIENT_TARGET) as mock_cls:
            mock_ctx = self._wire_post(mock_cls, return_value=self._response(200, html))

            client = BrowserlessClient(base_url=self._BASE_URL)
            result = await client.fetch_html("https://example.com")

            assert result == html
            payload = mock_ctx.post.call_args.kwargs["json"]
            assert payload["url"] == "https://example.com"
            # Deprecated Browserless parameters must not appear in the payload.
            for deprecated_key in ("waitUntil", "gotoTimeout", "bestAttempt"):
                assert deprecated_key not in payload

    async def test_fetch_html_empty_response(self):
        """fetch_html returns error for empty response."""
        with patch(self._ASYNC_CLIENT_TARGET) as mock_cls:
            # Whitespace-only body on an otherwise successful response.
            self._wire_post(mock_cls, return_value=self._response(200, " "))

            client = BrowserlessClient(base_url=self._BASE_URL)
            result = await client.fetch_html("https://example.com")

            assert result == "Error: Browserless returned empty response"

    async def test_fetch_html_http_error(self):
        """fetch_html returns error for non-200 status."""
        with patch(self._ASYNC_CLIENT_TARGET) as mock_cls:
            self._wire_post(mock_cls, return_value=self._response(500, "Internal error"))

            client = BrowserlessClient(base_url=self._BASE_URL)
            result = await client.fetch_html("https://example.com")

            assert "Error: Browserless HTTP 500" in result

    async def test_fetch_html_timeout(self):
        """fetch_html returns timeout error."""
        # Function-local import: only this test needs httpx directly.
        import httpx

        with patch(self._ASYNC_CLIENT_TARGET) as mock_cls:
            self._wire_post(mock_cls, side_effect=httpx.TimeoutException("Timed out"))

            client = BrowserlessClient(base_url=self._BASE_URL, timeout_s=10)
            result = await client.fetch_html("https://example.com")

            # Accept either wording of the timeout message, case-insensitively.
            message = result.lower()
            assert "timed out" in message or "timeout" in message

    async def test_fetch_html_with_token(self):
        """fetch_html includes token in payload when set."""
        with patch(self._ASYNC_CLIENT_TARGET) as mock_cls:
            mock_ctx = self._wire_post(mock_cls, return_value=self._response(200, "<html>OK</html>"))

            client = BrowserlessClient(base_url=self._BASE_URL, token="my-token")
            await client.fetch_html("https://example.com")

            payload = mock_ctx.post.call_args.kwargs["json"]
            assert payload["token"] == "my-token"

    async def test_fetch_html_with_wait_for_selector(self):
        """fetch_html sends waitForSelector when selector is set."""
        with patch(self._ASYNC_CLIENT_TARGET) as mock_cls:
            mock_ctx = self._wire_post(mock_cls, return_value=self._response(200, "<html>OK</html>"))

            client = BrowserlessClient(base_url=self._BASE_URL)
            await client.fetch_html("https://example.com", wait_for_selector="article")

            payload = mock_ctx.post.call_args.kwargs["json"]
            assert payload["waitForSelector"]["selector"] == "article"

    async def test_fetch_html_with_reject_params(self):
        """fetch_html sends reject params when set."""
        with patch(self._ASYNC_CLIENT_TARGET) as mock_cls:
            mock_ctx = self._wire_post(mock_cls, return_value=self._response(200, "<html>OK</html>"))

            client = BrowserlessClient(base_url=self._BASE_URL)
            await client.fetch_html(
                "https://example.com",
                reject_resource_types=["image"],
                reject_request_pattern=[r"\.css$"],
            )

            payload = mock_ctx.post.call_args.kwargs["json"]
            assert payload["rejectResourceTypes"] == ["image"]
            assert payload["rejectRequestPattern"] == [r"\.css$"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
class TestBrowserlessTools:
    """Exercise ``web_fetch_tool`` against a stubbed Browserless client.

    ``_get_browserless_client`` is patched to return a stub whose
    ``fetch_html`` is an awaitable mock, and ``_get_tool_config`` is patched
    to return ``None`` for the duration of each invocation.
    """

    @patch("deerflow.community.browserless.tools._get_browserless_client")
    async def test_web_fetch_tool_success(self, mock_get_client):
        """A fetched HTML page yields a result without an ``Error:`` marker."""
        page = "<html><body><article><h1>Title</h1><p>Content</p></article></body></html>"
        stub_client = MagicMock()
        stub_client.fetch_html = AsyncMock(return_value=page)
        mock_get_client.return_value = stub_client

        config_patch = patch("deerflow.community.browserless.tools._get_tool_config", return_value=None)
        with config_patch:
            outcome = await tools.web_fetch_tool.ainvoke("https://example.com/article")

        assert "Error:" not in outcome

    @patch("deerflow.community.browserless.tools._get_browserless_client")
    async def test_web_fetch_tool_error(self, mock_get_client):
        """An error string returned by ``fetch_html`` surfaces as an ``Error:`` result."""
        failure = "Error: Browserless returned empty response"
        stub_client = MagicMock()
        stub_client.fetch_html = AsyncMock(return_value=failure)
        mock_get_client.return_value = stub_client

        config_patch = patch("deerflow.community.browserless.tools._get_tool_config", return_value=None)
        with config_patch:
            outcome = await tools.web_fetch_tool.ainvoke("https://example.com")

        assert outcome.startswith("Error:")

    @patch("deerflow.community.browserless.tools._get_browserless_client")
    async def test_web_fetch_tool_exception(self, mock_get_client):
        """An exception raised by ``fetch_html`` is reported as an ``Error:`` result."""
        stub_client = MagicMock()
        stub_client.fetch_html = AsyncMock(side_effect=Exception("Unexpected error"))
        mock_get_client.return_value = stub_client

        config_patch = patch("deerflow.community.browserless.tools._get_tool_config", return_value=None)
        with config_patch:
            outcome = await tools.web_fetch_tool.ainvoke("https://example.com")

        assert outcome.startswith("Error:")
|
||||
Reference in New Issue
Block a user