mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-12 10:25:58 +00:00
feat(community): add SearXNG and Browserless web search/fetch tools (#3451)
* feat(community): add SearXNG and Browserless web search/fetch tools
  - SearXNG web_search: privacy-focused meta search engine integration with configurable base_url via config.yaml tool settings
  - Browserless web_fetch: headless browser page fetching with readability article extraction
  - Both tools are fully configurable through the tool config section
  - No external API keys required for basic operation

* fix: address PR review feedback and add unit tests
  - Guard config.model_extra against None values (review #1, #2)
  - Coerce max_results to int when reading from config (review #2)
  - Fix web_fetch_tool to use direct HTTP fetch instead of reusing the web_search client config (review #3)
  - Fix misleading docstring for SearxngClient.fetch (review #4)
  - Remove unused target_url variable to pass Ruff lint (review #5)
  - Normalize bool config values with _normalize_bool helper to handle env-resolved string values correctly (review #6)
  - Add unit tests for both SearXNG and Browserless client classes and their tool functions with mocked httpx (review #7, #8)

* fix: convert to async httpx to avoid blocking I/O on event loop
  - Replace httpx.Client with httpx.AsyncClient in both client classes
  - Convert tool functions to async def
  - Wrap readability_extractor calls in asyncio.to_thread()
  - Update all tests to use pytest.mark.asyncio and async mocks
  - Fix import sorting to pass Ruff lint

* fix(browserless): replace deprecated waitUntil with waitForEvent
  The Browserless API has deprecated the waitUntil parameter. Replace it with waitForEvent, which accepts values like 'networkidle'. Default is empty (no wait), configurable via config.yaml.

* fix(browserless): remove deprecated gotoTimeout and bestAttempt params
  The Browserless /content API does not accept gotoTimeout or bestAttempt as top-level payload keys. These were being sent unconditionally, causing 400 Bad Request errors on current Browserless versions.
  Changes:
  - Remove goto_timeout_ms parameter and 'gotoTimeout' from payload
  - Remove best_attempt parameter and 'bestAttempt' from payload
  - Remove _normalize_bool helper (no longer needed)
  - Remove goto_timeout_ms and best_attempt config reading in tools.py
  - Add tests for waitForSelector and reject params
  - Verify no deprecated params are sent in test_fetch_html_success

* refactor(searxng): remove web_fetch_tool, decouple from web_search config
  SearXNG is a search engine — it should only provide web_search_tool. The web_fetch responsibility belongs to Browserless (headless Chrome) or Jina AI, not SearXNG.
  Changes:
  - Remove web_fetch_tool from SearXNG tools.py and __init__.py
  - Remove SearxngClient.fetch() method (no longer needed)
  - Remove unused asyncio/readability imports from SearXNG tools.py
  - Add test for max_results string-to-int coercion from config
  - Add test for search with categories parameter
  - Add test for httpx.RequestError handling
  - Apply ruff format fixes to browserless_client.py and test files
This commit is contained in:
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
import logging
|
||||
|
||||
from langchain.tools import tool
|
||||
|
||||
from deerflow.config import get_app_config
|
||||
|
||||
from .searxng_client import SearxngClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_tool_config(tool_name: str) -> dict | None:
    """Return the ``model_extra`` settings of a tool's config entry.

    Args:
        tool_name: Name of the tool whose configuration entry is looked up
            through ``get_app_config().get_tool_config``.

    Returns:
        ``None`` when the application config has no entry for ``tool_name``.
        Otherwise the entry's ``model_extra`` mapping, or an empty dict when
        ``model_extra`` is itself ``None``.
    """
    tool_config = get_app_config().get_tool_config(tool_name)
    if tool_config is None:
        return None

    extra_settings = tool_config.model_extra
    if extra_settings is None:
        # The tool is configured but carries no extras: report "empty",
        # which callers can tell apart from "not configured" (None).
        return {}
    return extra_settings
|
||||
|
||||
|
||||
def _get_searxng_client() -> SearxngClient:
    """Build a ``SearxngClient`` for the configured SearXNG instance.

    The base URL is read from the ``base_url`` extra of the ``web_search``
    tool config. ``http://localhost:8088`` is used when the tool is not
    configured, when the key is absent, or when the configured value is
    empty (``None`` or ``""``).

    Returns:
        A new ``SearxngClient`` constructed with the resolved base URL.
    """
    default_base_url = "http://localhost:8088"

    cfg = _get_tool_config("web_search")
    if cfg is None:
        return SearxngClient(base_url=default_base_url)

    # Use `or` instead of a .get() default: a key that is present but holds
    # None / "" must fall back too, rather than reach the client as-is.
    base_url = cfg.get("base_url") or default_base_url
    return SearxngClient(base_url=base_url)
|
||||
|
||||
|
||||
# NOTE: the docstring of this function is runtime text. With
# parse_docstring=True it is handed to the `tool` decorator, so it is kept
# minimal and unchanged; implementation notes live in comments instead.
#
# Behavior: runs `query` against SearXNG and returns a JSON string. On success
# the JSON is a list of {"title", "url", "snippet"} objects; on any failure it
# is a single {"error", "query"} object, so this function never raises.
@tool("web_search", parse_docstring=True)
async def web_search_tool(query: str) -> str:
    """Search the web using SearXNG.

    Args:
        query: The query to search for.
    """
    try:
        cfg = _get_tool_config("web_search")

        max_results = 5
        if cfg is not None:
            raw = cfg.get("max_results")
            # A key that is present but None keeps the default instead of
            # failing in int(None); other values (e.g. "10") are coerced.
            if raw is not None:
                max_results = int(raw)

        client = _get_searxng_client()
        results = await client.search(query, max_results=max_results)

        # Expose only these three fields; the result's "content" field is
        # returned under the key "snippet".
        normalized = [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "snippet": r.get("content", ""),
            }
            for r in results
        ]
        return json.dumps(normalized, indent=2, ensure_ascii=False)
    except Exception as e:
        # Tool boundary: report the failure as JSON rather than raising.
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception("Error in web_search_tool: %s", e)
        return json.dumps({"error": str(e), "query": query}, ensure_ascii=False)
|
||||
Reference in New Issue
Block a user