mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-12 02:15:58 +00:00
330a2ff8c5
* feat(community): add SearXNG and Browserless web search/fetch tools - SearXNG web_search: privacy-focused meta search engine integration with configurable base_url via config.yaml tool settings - Browserless web_fetch: headless browser page fetching with readability article extraction - Both tools are fully configurable through tool config section - No external API keys required for basic operation * fix: address PR review feedback and add unit tests - Guard config.model_extra against None values (review #1, #2) - Coerce max_results to int when reading from config (review #2) - Fix web_fetch_tool to use direct HTTP fetch instead of reusing the web_search client config (review #3) - Fix misleading docstring for SearxngClient.fetch (review #4) - Remove unused target_url variable to pass Ruff lint (review #5) - Normalize bool config values with _normalize_bool helper to handle env-resolved string values correctly (review #6) - Add unit tests for both SearXNG and Browserless client classes and their tool functions with mocked httpx (review #7, #8) * fix: convert to async httpx to avoid blocking I/O on event loop - Replace httpx.Client with httpx.AsyncClient in both client classes - Convert tool functions to async def - Wrap readability_extractor calls in asyncio.to_thread() - Update all tests to use pytest.mark.asyncio and async mocks - Fix import sorting to pass Ruff lint * fix(browserless): replace deprecated waitUntil with waitForEvent The Browserless API has deprecated the waitUntil parameter. Replace with waitForEvent which accepts values like 'networkidle'. Default is empty (no wait), configurable via config.yaml. * fix(browserless): remove deprecated gotoTimeout and bestAttempt params The Browserless /content API does not accept gotoTimeout or bestAttempt as top-level payload keys. These were being sent unconditionally, causing 400 Bad Request errors on current Browserless versions. 
Changes: - Remove goto_timeout_ms parameter and 'gotoTimeout' from payload - Remove best_attempt parameter and 'bestAttempt' from payload - Remove _normalize_bool helper (no longer needed) - Remove goto_timeout_ms and best_attempt config reading in tools.py - Add tests for waitForSelector and reject params - Verify no deprecated params are sent in test_fetch_html_success * refactor(searxng): remove web_fetch_tool, decouple from web_search config SearXNG is a search engine — it should only provide web_search_tool. The web_fetch responsibility belongs to Browserless (headless Chrome) or Jina AI, not SearXNG. Changes: - Remove web_fetch_tool from SearXNG tools.py and __init__.py - Remove SearxngClient.fetch() method (no longer needed) - Remove unused asyncio/readability imports from SearXNG tools.py - Add test for max_results string-to-int coercion from config - Add test for search with categories parameter - Add test for httpx.RequestError handling - Apply ruff format fixes to browserless_client.py and test files
66 lines
2.0 KiB
Python
66 lines
2.0 KiB
Python
import logging
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SearxngClient:
|
|
"""Client for SearXNG meta search engine API."""
|
|
|
|
def __init__(self, base_url: str) -> None:
|
|
self.base_url = base_url.rstrip("/")
|
|
|
|
async def search(
|
|
self,
|
|
query: str,
|
|
max_results: int = 5,
|
|
categories: list[str] | None = None,
|
|
) -> list[dict[str, Any]]:
|
|
"""Search the web using SearXNG.
|
|
|
|
Args:
|
|
query: The search query.
|
|
max_results: Maximum number of results to return.
|
|
categories: Search categories to use.
|
|
|
|
Returns:
|
|
List of search result dictionaries.
|
|
"""
|
|
params: dict[str, Any] = {
|
|
"q": query,
|
|
"format": "json",
|
|
"language": "auto",
|
|
"pageno": 1,
|
|
}
|
|
if max_results:
|
|
params["limit"] = max_results
|
|
if categories:
|
|
params["categories"] = ",".join(categories)
|
|
|
|
logger.debug(f"Searching SearXNG at {self.base_url} with query: {query}")
|
|
try:
|
|
async with httpx.AsyncClient(timeout=30) as client:
|
|
resp = await client.get(
|
|
f"{self.base_url}/search",
|
|
params=params,
|
|
headers={
|
|
"User-Agent": "Mozilla/5.0 (compatible; DeerFlow/1.0)",
|
|
"Accept": "application/json",
|
|
},
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
results = data.get("results", [])
|
|
return results[:max_results] if max_results else results
|
|
except httpx.HTTPStatusError as e:
|
|
logger.error(f"SearXNG search returned error status: {e}")
|
|
raise
|
|
except httpx.RequestError as e:
|
|
logger.error(f"SearXNG search request failed: {e}")
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"An unexpected error occurred during SearXNG search: {e}")
|
|
raise
|