Files
deer-flow/backend/packages/harness/deerflow/community/jina_ai/tools.py
T
Ryker_Feng f92a26d56f fix(web_fetch): support proxy for Jina reader in restricted networks (#3418) (#3430)
* fix(web_fetch): support proxy for Jina reader in restricted networks

The web_fetch tool built a bare httpx.AsyncClient() with no proxy
awareness, so users behind a corporate proxy / in Docker / WSL could
not reach https://r.jina.ai and web_fetch timed out.

- Add optional `proxy` / `trust_env` params to JinaClient.crawl and
  wire them from the `web_fetch` tool config (with type coercion for
  YAML string values).
- Pass internal service hostnames through NO_PROXY in both compose
  files so proxy env inherited via env_file does not break in-cluster
  calls (gateway/provisioner/etc).
- Load proxy vars from .env into the shell in scripts/docker.sh so the
  NO_PROXY interpolation can merge user-provided values on `make` path.
- Document proxy/trust_env options in config.example.yaml.

Closes #3418

* Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
2026-06-08 23:25:29 +08:00

69 lines
2.4 KiB
Python

import asyncio
from langchain.tools import tool
from deerflow.community.jina_ai.jina_client import JinaClient
from deerflow.config import get_app_config
from deerflow.utils.readability import ReadabilityExtractor
readability_extractor = ReadabilityExtractor()
def _coerce_bool(value: object, default: bool) -> bool:
if isinstance(value, bool):
return value
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in {"1", "true", "yes", "on"}:
return True
if normalized in {"0", "false", "no", "off"}:
return False
return default
def _coerce_timeout(value: object, default: int) -> int:
if isinstance(value, bool):
return default
if isinstance(value, int):
return value
if isinstance(value, str):
try:
return int(value)
except ValueError:
return default
return default
def _coerce_proxy(value: object) -> str | None:
if not isinstance(value, str):
return None
proxy = value.strip()
return proxy or None
@tool("web_fetch", parse_docstring=True)
async def web_fetch_tool(url: str) -> str:
"""Fetch the contents of a web page at a given URL.
Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
Do NOT add www. to URLs that do NOT have them.
URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
Args:
url: The URL to fetch the contents of.
"""
jina_client = JinaClient()
timeout = 10
proxy = None
trust_env = True
config = get_app_config().get_tool_config("web_fetch")
if config is not None:
timeout = _coerce_timeout(config.model_extra.get("timeout"), timeout)
proxy = _coerce_proxy(config.model_extra.get("proxy"))
trust_env = _coerce_bool(config.model_extra.get("trust_env"), trust_env)
html_content = await jina_client.crawl(url, return_format="html", timeout=timeout, proxy=proxy, trust_env=trust_env)
if isinstance(html_content, str) and html_content.startswith("Error:"):
return html_content
article = await asyncio.to_thread(readability_extractor.extract_article, html_content)
return article.to_markdown()[:4096]