# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

import re
from urllib.parse import urljoin

from markdownify import markdownify as md

# Matches a Markdown image ``![alt](target)`` or ``![alt](target "title")``.
# The single capturing group is the link target only; the optional quoted
# title is matched but deliberately not captured so it never leaks into the
# URL. Exactly one capturing group is required so that ``re.split`` yields a
# strictly alternating text / target sequence.
_IMAGE_PATTERN = re.compile(r'!\[.*?\]\((.*?)(?:\s+"(?:[^"\\]|\\.)*")?\s*\)')

# Text emitted when an article yields nothing usable.
_NO_CONTENT_TEXT = "No content available"


class Article:
    """An extracted web article that can be rendered as Markdown or as
    multimodal chat-message content.

    Attributes:
        title: Article title, rendered as a level-1 heading.
        html_content: Raw HTML body of the article.
        url: Base URL used to resolve relative image links. Defaults to an
            empty string, in which case image targets are left unchanged.
    """

    url: str

    def __init__(self, title: str, html_content: str, url: str = ""):
        self.title = title
        self.html_content = html_content
        # Always define ``url`` so ``to_message`` works even when the caller
        # never assigns it; callers may still overwrite it after construction.
        self.url = url

    def to_markdown(self, including_title: bool = True) -> str:
        """Render the article as Markdown.

        Args:
            including_title: When true, prefix the output with the title as
                a ``#`` heading followed by a blank line.

        Returns:
            The Markdown text. If ``html_content`` is ``None`` or blank, a
            ``*No content available*`` placeholder line is used as the body.
        """
        markdown = ""
        if including_title:
            markdown += f"# {self.title}\n\n"

        if self.html_content is None or not str(self.html_content).strip():
            markdown += "*No content available*\n"
        else:
            markdown += md(self.html_content)

        return markdown

    def to_message(self) -> list[dict]:
        """Split the article's Markdown into text and image message parts.

        Returns:
            A list of dicts in document order. Text segments are
            ``{"type": "text", "text": ...}``; images are
            ``{"type": "image_url", "image_url": {"url": ...}}`` with the
            target resolved against ``self.url``. If nothing usable is
            found, a single ``"No content available"`` text part is
            returned.
        """
        markdown = self.to_markdown()
        if not markdown or not markdown.strip():
            return [{"type": "text", "text": _NO_CONTENT_TEXT}]

        content: list[dict] = []
        # With one capturing group, ``split`` alternates: even indexes are the
        # text between images, odd indexes are the captured image targets.
        for index, part in enumerate(_IMAGE_PATTERN.split(markdown)):
            stripped = part.strip()
            if not stripped:
                # Skip blank text and images without a target; joining an
                # empty target would otherwise yield the page URL itself.
                continue
            if index % 2 == 1:
                image_url = urljoin(self.url, stripped)
                content.append({"type": "image_url", "image_url": {"url": image_url}})
            else:
                content.append({"type": "text", "text": stripped})

        # If after processing all parts content is still empty, fall back.
        if not content:
            content = [{"type": "text", "text": _NO_CONTENT_TEXT}]

        return content