diff --git a/backend/packages/harness/deerflow/community/browserless/__init__.py b/backend/packages/harness/deerflow/community/browserless/__init__.py new file mode 100644 index 000000000..d03906c20 --- /dev/null +++ b/backend/packages/harness/deerflow/community/browserless/__init__.py @@ -0,0 +1,4 @@ +from .browserless_client import BrowserlessClient +from .tools import web_fetch_tool + +__all__ = ["BrowserlessClient", "web_fetch_tool"] diff --git a/backend/packages/harness/deerflow/community/browserless/browserless_client.py b/backend/packages/harness/deerflow/community/browserless/browserless_client.py new file mode 100644 index 000000000..c5fa2cbf8 --- /dev/null +++ b/backend/packages/harness/deerflow/community/browserless/browserless_client.py @@ -0,0 +1,98 @@ +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + + +class BrowserlessClient: + """Client for Browserless headless Chrome API.""" + + def __init__(self, base_url: str, token: str = "", timeout_s: float = 30) -> None: + self.base_url = base_url.rstrip("/") + self.token = token + self.timeout_s = timeout_s + + async def fetch_html( + self, + url: str, + wait_for_event: str = "", + wait_for_timeout_ms: int = 0, + wait_for_selector: str = "", + wait_for_selector_timeout_ms: int = 5000, + reject_resource_types: list[str] | None = None, + reject_request_pattern: list[str] | None = None, + ) -> str: + """Fetch the rendered HTML of a page using Browserless. + + Only sends accepted parameters for the current Browserless API version. + Sets a default navigation timeout (30s) via query param. + + Args: + url: The URL to fetch. + wait_for_event: Wait for a page event (e.g. "networkidle", "load"). + wait_for_timeout_ms: Extra wait after page load. + wait_for_selector: CSS selector to wait for. + wait_for_selector_timeout_ms: Timeout for selector wait. + reject_resource_types: Resource types to block (e.g. ["image"]). + reject_request_pattern: URL patterns to block. 
+ + Returns: + Rendered HTML content. + """ + payload: dict[str, Any] = { + "url": url, + } + + if self.token: + payload["token"] = self.token + if wait_for_event: + payload["waitForEvent"] = wait_for_event + if wait_for_timeout_ms > 0: + payload["waitForTimeout"] = wait_for_timeout_ms + if wait_for_selector: + payload["waitForSelector"] = { + "selector": wait_for_selector, + "timeout": wait_for_selector_timeout_ms, + } + if reject_resource_types: + payload["rejectResourceTypes"] = reject_resource_types + if reject_request_pattern: + payload["rejectRequestPattern"] = reject_request_pattern + + logger.debug(f"Fetching URL via Browserless: {url}") + try: + async with httpx.AsyncClient(timeout=self.timeout_s) as client: + resp = await client.post( + f"{self.base_url}/content", + json=payload, + headers={ + "Content-Type": "application/json", + "Cache-Control": "no-cache", + }, + ) + + code = resp.status_code + target_code = resp.headers.get("X-Response-Code", "") + target_status = resp.headers.get("X-Response-Status", "") + + logger.debug(f"Browserless response: code={code}, target_code={target_code}, target_status={target_status}") + + if code != 200: + return f"Error: Browserless HTTP {code}: {resp.text[:200]}" + + html = resp.text + if not html or not html.strip(): + return "Error: Browserless returned empty response" + + return html + + except httpx.TimeoutException: + return f"Error: Browserless request timed out after {self.timeout_s}s" + except httpx.RequestError as e: + logger.error(f"Browserless request failed: {e}") + return f"Error: Browserless request failed: {e!s}" + except Exception as e: + logger.error(f"Browserless fetch failed: {e}") + return f"Error: Browserless fetch failed: {e!s}" diff --git a/backend/packages/harness/deerflow/community/browserless/tools.py b/backend/packages/harness/deerflow/community/browserless/tools.py new file mode 100644 index 000000000..649b6da6a --- /dev/null +++ 
def _get_tool_config(tool_name: str) -> dict | None:
    """Return the extra settings configured for *tool_name*.

    Returns:
        ``None`` when the tool has no configuration entry at all, otherwise the
        entry's ``model_extra`` mapping (an empty dict when that is ``None``).
    """
    config = get_app_config().get_tool_config(tool_name)
    if config is None:
        return None
    extras = config.model_extra
    return extras if extras is not None else {}


def _as_str_list(value) -> list[str] | None:
    """Normalise a configured list option: a bare string becomes a one-item list, empty becomes ``None``."""
    if not value:
        return None
    if isinstance(value, str):
        return [value]
    return [str(item) for item in value]


def _get_browserless_client() -> BrowserlessClient:
    """Build a ``BrowserlessClient`` from the ``web_fetch`` tool configuration.

    Falls back to ``http://localhost:3032``, no token and a 30 second timeout
    for any setting that is not configured.
    """
    cfg = _get_tool_config("web_fetch") or {}
    return BrowserlessClient(
        base_url=cfg.get("base_url", "http://localhost:3032"),
        token=cfg.get("token", ""),
        # Config values may arrive as strings or ints; coerce once here.
        timeout_s=float(cfg.get("timeout_s", 30.0)),
    )


@tool("web_fetch", parse_docstring=True)
async def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL using Browserless (headless Chrome).
    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    # The docstring above is consumed by @tool(parse_docstring=True); treat it
    # as runtime text and keep maintainer notes in comments like this one.
    #
    # Returns the extracted article as Markdown, truncated to 4096 characters,
    # or a string starting with "Error:" on any failure.
    try:
        url = url.strip()
        # Enforce the rule the docstring states, and keep non-web schemes
        # (file:, chrome:, data:, ...) away from the headless browser.
        if not url.lower().startswith(("http://", "https://")):
            return "Error: URL must start with http:// or https://"

        cfg = _get_tool_config("web_fetch") or {}

        wait_for_event = cfg.get("wait_for_event", "")
        wait_for_timeout_ms = int(cfg.get("wait_for_timeout_ms", 0))
        wait_for_selector = cfg.get("wait_for_selector", "")
        # These three options used to be pinned to their defaults; honour the
        # configuration so they can actually be set.
        wait_for_selector_timeout_ms = int(cfg.get("wait_for_selector_timeout_ms", 5000))
        reject_resource_types = _as_str_list(cfg.get("reject_resource_types"))
        reject_request_pattern = _as_str_list(cfg.get("reject_request_pattern"))

        client = _get_browserless_client()
        html = await client.fetch_html(
            url=url,
            wait_for_event=wait_for_event,
            wait_for_timeout_ms=wait_for_timeout_ms,
            wait_for_selector=wait_for_selector,
            wait_for_selector_timeout_ms=wait_for_selector_timeout_ms,
            reject_resource_types=reject_resource_types,
            reject_request_pattern=reject_request_pattern,
        )

        # The client reports failures as "Error: ..." strings; pass them through.
        if html.startswith("Error:"):
            return html

        # Extraction is run in a worker thread so it does not block the event loop.
        article = await asyncio.to_thread(_readability_extractor.extract_article, html)
        return article.to_markdown()[:4096]

    except Exception as e:
        # Tool boundary: keep the traceback in the log, hand the model a string.
        logger.exception("Error in web_fetch_tool: %s", e)
        return f"Error: {str(e)}"
"""Client for SearXNG meta search engine API.""" + + def __init__(self, base_url: str) -> None: + self.base_url = base_url.rstrip("/") + + async def search( + self, + query: str, + max_results: int = 5, + categories: list[str] | None = None, + ) -> list[dict[str, Any]]: + """Search the web using SearXNG. + + Args: + query: The search query. + max_results: Maximum number of results to return. + categories: Search categories to use. + + Returns: + List of search result dictionaries. + """ + params: dict[str, Any] = { + "q": query, + "format": "json", + "language": "auto", + "pageno": 1, + } + if max_results: + params["limit"] = max_results + if categories: + params["categories"] = ",".join(categories) + + logger.debug(f"Searching SearXNG at {self.base_url} with query: {query}") + try: + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.get( + f"{self.base_url}/search", + params=params, + headers={ + "User-Agent": "Mozilla/5.0 (compatible; DeerFlow/1.0)", + "Accept": "application/json", + }, + ) + resp.raise_for_status() + data = resp.json() + results = data.get("results", []) + return results[:max_results] if max_results else results + except httpx.HTTPStatusError as e: + logger.error(f"SearXNG search returned error status: {e}") + raise + except httpx.RequestError as e: + logger.error(f"SearXNG search request failed: {e}") + raise + except Exception as e: + logger.error(f"An unexpected error occurred during SearXNG search: {e}") + raise diff --git a/backend/packages/harness/deerflow/community/searxng/tools.py b/backend/packages/harness/deerflow/community/searxng/tools.py new file mode 100644 index 000000000..407d58efb --- /dev/null +++ b/backend/packages/harness/deerflow/community/searxng/tools.py @@ -0,0 +1,58 @@ +import json +import logging + +from langchain.tools import tool + +from deerflow.config import get_app_config + +from .searxng_client import SearxngClient + +logger = logging.getLogger(__name__) + + +def _get_tool_config(tool_name: 
def _get_tool_config(tool_name: str) -> dict | None:
    """Look up the extra settings for *tool_name*.

    Yields ``None`` when the tool has no configuration entry, the entry's
    ``model_extra`` mapping otherwise, or ``{}`` when that mapping is ``None``.
    """
    tool_cfg = get_app_config().get_tool_config(tool_name)
    if tool_cfg is None:
        return None
    extras = tool_cfg.model_extra
    if extras is None:
        return {}
    return extras


def _get_searxng_client() -> SearxngClient:
    """Create a ``SearxngClient`` for the configured ``base_url`` (default ``http://localhost:8088``)."""
    endpoint = "http://localhost:8088"
    settings = _get_tool_config("web_search")
    if settings is not None:
        endpoint = settings.get("base_url", endpoint)
    return SearxngClient(base_url=endpoint)


@tool("web_search", parse_docstring=True)
async def web_search_tool(query: str) -> str:
    """Search the web using SearXNG.

    Args:
        query: The query to search for.
    """
    # The docstring above is consumed by @tool(parse_docstring=True), so it is
    # left untouched. Output is a JSON array of {title, url, snippet} objects,
    # or a JSON object with "error" and "query" keys when anything fails.
    try:
        limit = 5
        settings = _get_tool_config("web_search")
        if settings is not None:
            configured = settings.get("max_results", limit)
            # Config values may be strings; anything that is not already an int is coerced.
            limit = configured if isinstance(configured, int) else int(configured)

        searcher = _get_searxng_client()
        hits = await searcher.search(query, max_results=limit)

        normalized = []
        for hit in hits:
            entry = {
                "title": hit.get("title", ""),
                "url": hit.get("url", ""),
                # SearXNG's "content" field is exposed to the model as "snippet".
                "snippet": hit.get("content", ""),
            }
            normalized.append(entry)
        return json.dumps(normalized, indent=2, ensure_ascii=False)
    except Exception as e:
        logger.error(f"Error in web_search_tool: {e}")
        failure = {"error": str(e), "query": query}
        return json.dumps(failure, ensure_ascii=False)
class AsyncMock(MagicMock):
    """``MagicMock`` subclass whose call is a coroutine, so ``await mock(...)`` works.

    Local helper; it shares its name with ``unittest.mock.AsyncMock`` but is
    not that class.
    """

    async def __call__(self, *args, **kwargs):
        # Delegate to MagicMock so return_value, side_effect and call_args behave as usual.
        return super().__call__(*args, **kwargs)


_ASYNC_CLIENT_PATH = "deerflow.community.browserless.browserless_client.httpx.AsyncClient"


def _fake_response(status_code=200, text="OK"):
    """Build a stand-in response carrying the attributes ``fetch_html`` reads."""
    resp = MagicMock()
    resp.status_code = status_code
    resp.text = text
    resp.headers = {}
    return resp


def _attach_session(mock_cls, post):
    """Make ``async with httpx.AsyncClient(...)`` yield a session whose ``post`` is *post*."""
    session = MagicMock()
    mock_cls.return_value.__aenter__.return_value = session
    session.post = post
    return session


@pytest.mark.asyncio
class TestBrowserlessClient:
    """Tests for the BrowserlessClient class."""

    async def test_fetch_html_success(self):
        """fetch_html returns HTML content on success."""
        with patch(_ASYNC_CLIENT_PATH) as mock_cls:
            session = _attach_session(mock_cls, AsyncMock(return_value=_fake_response(text="Page content")))

            client = BrowserlessClient(base_url="http://browserless:3000")
            result = await client.fetch_html("https://example.com")

            assert result == "Page content"
            call_kwargs = session.post.call_args.kwargs
            assert call_kwargs["json"]["url"] == "https://example.com"
            # Options the client must not send.
            assert "waitUntil" not in call_kwargs["json"]
            assert "gotoTimeout" not in call_kwargs["json"]
            assert "bestAttempt" not in call_kwargs["json"]

    async def test_fetch_html_empty_response(self):
        """fetch_html returns error for empty response."""
        with patch(_ASYNC_CLIENT_PATH) as mock_cls:
            _attach_session(mock_cls, AsyncMock(return_value=_fake_response(text=" ")))

            client = BrowserlessClient(base_url="http://browserless:3000")
            result = await client.fetch_html("https://example.com")
            assert result == "Error: Browserless returned empty response"

    async def test_fetch_html_http_error(self):
        """fetch_html returns error for non-200 status."""
        with patch(_ASYNC_CLIENT_PATH) as mock_cls:
            _attach_session(mock_cls, AsyncMock(return_value=_fake_response(status_code=500, text="Internal error")))

            client = BrowserlessClient(base_url="http://browserless:3000")
            result = await client.fetch_html("https://example.com")
            assert "Error: Browserless HTTP 500" in result

    async def test_fetch_html_timeout(self):
        """fetch_html returns timeout error."""
        import httpx

        with patch(_ASYNC_CLIENT_PATH) as mock_cls:
            _attach_session(mock_cls, AsyncMock(side_effect=httpx.TimeoutException("Timed out")))

            client = BrowserlessClient(base_url="http://browserless:3000", timeout_s=10)
            result = await client.fetch_html("https://example.com")
            assert "timed out" in result.lower() or "timeout" in result.lower()

    async def test_fetch_html_with_token(self):
        """fetch_html includes token in payload when set."""
        with patch(_ASYNC_CLIENT_PATH) as mock_cls:
            session = _attach_session(mock_cls, AsyncMock(return_value=_fake_response()))

            client = BrowserlessClient(base_url="http://browserless:3000", token="my-token")
            await client.fetch_html("https://example.com")

            payload = session.post.call_args.kwargs["json"]
            assert payload["token"] == "my-token"

    async def test_fetch_html_with_wait_for_selector(self):
        """fetch_html sends waitForSelector when selector is set."""
        with patch(_ASYNC_CLIENT_PATH) as mock_cls:
            session = _attach_session(mock_cls, AsyncMock(return_value=_fake_response()))

            client = BrowserlessClient(base_url="http://browserless:3000")
            await client.fetch_html("https://example.com", wait_for_selector="article")

            payload = session.post.call_args.kwargs["json"]
            assert payload["waitForSelector"]["selector"] == "article"

    async def test_fetch_html_with_reject_params(self):
        """fetch_html sends reject params when set."""
        with patch(_ASYNC_CLIENT_PATH) as mock_cls:
            session = _attach_session(mock_cls, AsyncMock(return_value=_fake_response()))

            client = BrowserlessClient(base_url="http://browserless:3000")
            await client.fetch_html(
                "https://example.com",
                reject_resource_types=["image"],
                reject_request_pattern=[r"\.css$"],
            )

            payload = session.post.call_args.kwargs["json"]
            assert payload["rejectResourceTypes"] == ["image"]
            assert payload["rejectRequestPattern"] == [r"\.css$"]


@pytest.mark.asyncio
class TestBrowserlessTools:
    """Tests for the Browserless tool functions."""

    @patch("deerflow.community.browserless.tools._get_browserless_client")
    async def test_web_fetch_tool_success(self, mock_get_client):
        """web_fetch_tool successfully fetches and extracts content."""
        mock_client = MagicMock()
        # NOTE(review): the fixture's markup was lost in extraction (only the
        # words "Title" and "Content" survived); this minimal document is a
        # reconstruction - confirm against the original test.
        mock_client.fetch_html = AsyncMock(return_value="<html><body><h1>Title</h1><p>Content</p></body></html>")
        mock_get_client.return_value = mock_client

        with patch("deerflow.community.browserless.tools._get_tool_config", return_value=None):
            result = await tools.web_fetch_tool.ainvoke("https://example.com/article")

        assert "Error:" not in result

    @patch("deerflow.community.browserless.tools._get_browserless_client")
    async def test_web_fetch_tool_error(self, mock_get_client):
        """web_fetch_tool returns error when fetch fails."""
        mock_client = MagicMock()
        mock_client.fetch_html = AsyncMock(return_value="Error: Browserless returned empty response")
        mock_get_client.return_value = mock_client

        with patch("deerflow.community.browserless.tools._get_tool_config", return_value=None):
            result = await tools.web_fetch_tool.ainvoke("https://example.com")

        assert result.startswith("Error:")

    @patch("deerflow.community.browserless.tools._get_browserless_client")
    async def test_web_fetch_tool_exception(self, mock_get_client):
        """web_fetch_tool returns error when client raises exception."""
        mock_client = MagicMock()
        mock_client.fetch_html = AsyncMock(side_effect=Exception("Unexpected error"))
        mock_get_client.return_value = mock_client

        with patch("deerflow.community.browserless.tools._get_tool_config", return_value=None):
            result = await tools.web_fetch_tool.ainvoke("https://example.com")

        assert result.startswith("Error:")
class AsyncMock(MagicMock):
    """``MagicMock`` subclass whose call is a coroutine, so ``await mock(...)`` works."""

    async def __call__(self, *args, **kwargs):
        # Delegate to MagicMock so return_value, side_effect and call_args behave as usual.
        return super().__call__(*args, **kwargs)


_SEARXNG_ASYNC_CLIENT = "deerflow.community.searxng.searxng_client.httpx.AsyncClient"


def _json_reply(body):
    """Build a stand-in 200 response whose ``json()`` yields *body* and whose status check passes."""
    reply = MagicMock()
    reply.status_code = 200
    reply.json.return_value = body
    reply.raise_for_status.return_value = None
    return reply


def _wire_session(mock_cls, get):
    """Make ``async with httpx.AsyncClient(...)`` yield a session whose ``get`` is *get*."""
    session = MagicMock()
    mock_cls.return_value.__aenter__.return_value = session
    session.get = get
    return session


@pytest.mark.asyncio
class TestSearxngClient:
    """Tests for the SearxngClient class."""

    async def test_search_success(self):
        """Search returns normalized results."""
        results_data = {
            "results": [
                {"title": "Page 1", "url": "https://example.com/1", "content": "Snippet 1"},
                {"title": "Page 2", "url": "https://example.com/2", "content": "Snippet 2"},
            ]
        }

        with patch(_SEARXNG_ASYNC_CLIENT) as mock_cls:
            _wire_session(mock_cls, AsyncMock(return_value=_json_reply(results_data)))

            searcher = SearxngClient(base_url="http://searxng:8080")
            found = await searcher.search("test query", max_results=5)

            assert len(found) == 2
            assert found[0]["title"] == "Page 1"
            assert found[1]["url"] == "https://example.com/2"

    async def test_search_empty_results(self):
        """Search returns empty list when no results."""
        with patch(_SEARXNG_ASYNC_CLIENT) as mock_cls:
            _wire_session(mock_cls, AsyncMock(return_value=_json_reply({"results": []})))

            searcher = SearxngClient(base_url="http://searxng:8080")
            found = await searcher.search("empty query")
            assert found == []

    async def test_search_http_error(self):
        """Search raises on HTTP error."""
        import httpx

        with patch(_SEARXNG_ASYNC_CLIENT) as mock_cls:
            # The response itself is returned normally; the failure comes from its status check.
            failing_reply = MagicMock()
            failing_reply.raise_for_status.side_effect = httpx.HTTPStatusError("403 Forbidden", request=MagicMock(), response=MagicMock())
            _wire_session(mock_cls, AsyncMock(return_value=failing_reply))

            searcher = SearxngClient(base_url="http://searxng:8080")
            with pytest.raises(httpx.HTTPStatusError):
                await searcher.search("blocked query")

    async def test_search_request_error(self):
        """Search raises on request error."""
        import httpx

        with patch(_SEARXNG_ASYNC_CLIENT) as mock_cls:
            _wire_session(mock_cls, AsyncMock(side_effect=httpx.RequestError("Connection refused")))

            searcher = SearxngClient(base_url="http://searxng:8080")
            with pytest.raises(httpx.RequestError):
                await searcher.search("unreachable query")

    async def test_search_with_categories(self):
        """Search passes categories parameter."""
        with patch(_SEARXNG_ASYNC_CLIENT) as mock_cls:
            session = _wire_session(mock_cls, AsyncMock(return_value=_json_reply({"results": []})))

            searcher = SearxngClient(base_url="http://searxng:8080")
            await searcher.search("test", categories=["news", "science"])

            sent = session.get.call_args.kwargs
            assert sent["params"]["categories"] == "news,science"


@pytest.mark.asyncio
class TestSearxngTools:
    """Tests for the SearXNG tool functions."""

    @patch("deerflow.community.searxng.tools._get_searxng_client")
    async def test_web_search_tool_success(self, mock_get_client):
        """web_search_tool returns JSON results."""
        fake_client = MagicMock()
        fake_client.search = AsyncMock(
            return_value=[
                {"title": "Result 1", "url": "https://example.com/1", "content": "Desc 1"},
            ]
        )
        mock_get_client.return_value = fake_client

        with patch("deerflow.community.searxng.tools._get_tool_config", return_value=None):
            raw = await tools.web_search_tool.ainvoke("test query")

        parsed = json.loads(raw)
        assert len(parsed) == 1
        assert parsed[0]["title"] == "Result 1"

    @patch("deerflow.community.searxng.tools._get_searxng_client")
    async def test_web_search_tool_error(self, mock_get_client):
        """web_search_tool handles errors gracefully."""
        fake_client = MagicMock()
        fake_client.search = AsyncMock(side_effect=Exception("API error"))
        mock_get_client.return_value = fake_client

        with patch("deerflow.community.searxng.tools._get_tool_config", return_value=None):
            raw = await tools.web_search_tool.ainvoke("test query")

        parsed = json.loads(raw)
        assert "error" in parsed

    @patch("deerflow.community.searxng.tools._get_searxng_client")
    async def test_web_search_tool_with_max_results(self, mock_get_client):
        """web_search_tool respects max_results config."""
        fake_client = MagicMock()
        # The mocked client hands back 10 results regardless of the cap; this
        # test only checks the max_results value the tool forwards to it.
        ten_hits = [{"title": f"Result {i}", "url": f"https://example.com/{i}", "content": f"Desc {i}"} for i in range(10)]
        fake_client.search = AsyncMock(return_value=ten_hits)
        mock_get_client.return_value = fake_client

        with patch("deerflow.community.searxng.tools._get_tool_config", return_value={"max_results": "3"}):
            await tools.web_search_tool.ainvoke("test query")

        # The configured string "3" must reach the client as the integer 3.
        fake_client.search.assert_called_once()
        forwarded = fake_client.search.call_args.kwargs
        assert forwarded["max_results"] == 3