feat(eval): add report quality evaluation module and UI integration (#776)

* feat(eval): add report quality evaluation module

  Addresses issue #773 - How to evaluate generated report quality objectively.

  This module provides two evaluation approaches:

  1. Automated metrics (no LLM required):
     - Citation count and source diversity
     - Word count compliance per report style
     - Section structure validation
     - Image inclusion tracking
  2. LLM-as-Judge evaluation:
     - Factual accuracy scoring
     - Completeness assessment
     - Coherence evaluation
     - Relevance and citation quality checks

  The combined evaluator provides a final score (1-10) and letter grade (A+ to F).

  Files added:
  - src/eval/__init__.py
  - src/eval/metrics.py
  - src/eval/llm_judge.py
  - src/eval/evaluator.py
  - tests/unit/eval/test_metrics.py
  - tests/unit/eval/test_evaluator.py

* feat(eval): integrate report evaluation with web UI

  This commit adds the web UI integration for the evaluation module:

  Backend:
  - Add EvaluateReportRequest/Response models in src/server/eval_request.py
  - Add /api/report/evaluate endpoint to src/server/app.py

  Frontend:
  - Add evaluateReport API function in web/src/core/api/evaluate.ts
  - Create EvaluationDialog component with grade badge, metrics display, and optional LLM deep evaluation
  - Add evaluation button (graduation cap icon) to research-block.tsx toolbar
  - Add i18n translations for English and Chinese

  The evaluation UI allows users to:
  1. View quick metrics-only evaluation (instant)
  2. Optionally run deep LLM-based evaluation for detailed analysis
  3. See grade (A+ to F), score (1-10), and metric breakdown

* feat(eval): improve evaluation reliability and add LLM judge tests

  - Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability
  - Add comprehensive unit tests for LLMJudge class (parse_response, calculate_weighted_score, evaluate with mocked LLM)
  - Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria
  - Add researchQueries store map to reliably associate queries with research
  - Add getResearchQuery helper to retrieve query by researchId
  - Remove unused imports in test_metrics.py

* fix(eval): use resolveServiceURL for evaluate API endpoint

  The evaluateReport function was using a relative URL '/api/report/evaluate', which sent requests to the Next.js server instead of the FastAPI backend. Changed to use resolveServiceURL(), consistent with other API functions.

* fix: improve type accuracy and React hooks in evaluation components

  - Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback
  - Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations
  - Add aria-label to GradeBadge for screen reader accessibility
2026-05-21 15:36:48 +00:00 · 2025-12-25 21:55:48 +08:00
parent 84a7f7815c
commit 8d9d767051
17 changed files with 2103 additions and 2 deletions
@@ -0,0 +1,229 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+
+"""
+Automated metrics for report quality evaluation.
+
+These metrics can be computed without LLM calls, providing fast and
+deterministic quality assessment.
+"""
+
+import re
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+from urllib.parse import urlparse
+
+
+@dataclass
+class ReportMetrics:
+    """Plain record of the automated measurements taken from one report.
+
+    Every field has a zero-like default, so ``ReportMetrics()`` is a valid
+    "nothing measured yet" value.
+    """
+
+    # Size and sourcing counters.
+    word_count: int = 0
+    citation_count: int = 0
+    unique_sources: int = 0
+    image_count: int = 0
+    # Section structure: number of sections found, their names, the names
+    # of the ones not found, and the coverage score.
+    section_count: int = 0
+    sections_found: List[str] = field(default_factory=list)
+    sections_missing: List[str] = field(default_factory=list)
+    section_coverage_score: float = 0.0
+    # Per-section presence flags.
+    has_title: bool = False
+    has_key_points: bool = False
+    has_overview: bool = False
+    has_citations_section: bool = False
+
+    def to_dict(self) -> Dict:
+        """Return the metrics as a plain dict keyed by field name.
+
+        List-valued fields are placed in the result as-is (not copied).
+        """
+        exported_keys = (
+            "word_count",
+            "citation_count",
+            "unique_sources",
+            "image_count",
+            "section_count",
+            "sections_found",
+            "sections_missing",
+            "section_coverage_score",
+            "has_title",
+            "has_key_points",
+            "has_overview",
+            "has_citations_section",
+        )
+        return {key: getattr(self, key) for key in exported_keys}
+
+
+# Sections every report style starts with, in order.
+_LEADING_SECTIONS = ("title", "key_points", "overview")
+
+# Required sections per report style. Each value is an independent list so
+# that mutating one style's list cannot affect another.
+REPORT_STYLE_SECTIONS = {
+    "default": [*_LEADING_SECTIONS, "detailed_analysis", "key_citations"],
+    "academic": [
+        *_LEADING_SECTIONS,
+        "detailed_analysis",
+        "literature_review",
+        "methodology",
+        "key_citations",
+    ],
+    "news": [*_LEADING_SECTIONS, "detailed_analysis", "key_citations"],
+    "popular_science": [*_LEADING_SECTIONS, "detailed_analysis", "key_citations"],
+    # The only style without a detailed-analysis section.
+    "social_media": [*_LEADING_SECTIONS, "key_citations"],
+    "strategic_investment": [
+        *_LEADING_SECTIONS,
+        "detailed_analysis",
+        "executive_summary",
+        "market_analysis",
+        "technology_analysis",
+        "investment_recommendations",
+        "key_citations",
+    ],
+}
+
+# Regex source strings used to detect each section, keyed by the section
+# identifiers that appear in REPORT_STYLE_SECTIONS. Alternatives cover both
+# English and Chinese wording.
+#
+# How detect_sections applies them:
+#   - "title" is searched against the original text with re.MULTILINE, so
+#     its "^" anchors at the start of any line (a level-1 markdown heading).
+#   - every other pattern is searched against the lower-cased text with
+#     re.IGNORECASE | re.MULTILINE, and may match anywhere in the report,
+#     not only on heading lines.
+#
+# NOTE(review): "投资建议" appears in both the executive_summary and the
+# investment_recommendations patterns, so one occurrence satisfies both
+# sections -- confirm this overlap is intended.
+# NOTE(review): the English alternative for technology_analysis is the bare
+# word "technology", and detailed_analysis accepts a bare "分析"; both are
+# very permissive -- confirm.
+SECTION_PATTERNS = {
+    "title": r"^#\s+.+",
+    "key_points": r"(?:key\s*points|要点|关键发现|核心观点)",
+    "overview": r"(?:overview|概述|简介|背景)",
+    "detailed_analysis": r"(?:detailed\s*analysis|详细分析|深度分析|分析)",
+    "key_citations": r"(?:key\s*citations|references|参考文献|引用|来源)",
+    "literature_review": r"(?:literature\s*review|文献综述|研究回顾)",
+    "methodology": r"(?:methodology|方法论|研究方法)",
+    "executive_summary": r"(?:executive\s*summary|执行摘要|投资建议)",
+    "market_analysis": r"(?:market\s*analysis|市场分析|产业分析)",
+    "technology_analysis": r"(?:technology|技术.*(?:分析|解析|深度))",
+    "investment_recommendations": r"(?:investment.*recommend|投资建议|投资评级)",
+}
+
+
+def count_words(text: str) -> int:
+    """Count words in text, handling both English and Chinese.
+
+    The result is the number of ASCII-letter words plus the number of CJK
+    unified ideographs (U+4E00..U+9FFF), each ideograph counting as one
+    word.
+
+    A run of ASCII letters counts as a word only when it is not directly
+    adjacent to an ASCII digit or underscore (so ``word2vec`` contributes
+    nothing). Adjacency to non-ASCII characters does not matter, so English
+    words embedded in Chinese text without spaces are counted.
+
+    Args:
+        text: The text to measure.
+
+    Returns:
+        English word count plus Chinese character count.
+    """
+    # re.ASCII makes \b consider only ASCII word characters. Without it a
+    # CJK character is itself a "word" character, so there is no boundary
+    # between "Python" and "进" in "使用Python进行" and the English word
+    # would be silently dropped.
+    english_words = len(re.findall(r"\b[a-zA-Z]+\b", text, flags=re.ASCII))
+    chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
+    return english_words + chinese_chars
+
+
+def count_citations(text: str) -> int:
+    """Count markdown-style citations ``[text](url)``.
+
+    Only links whose target starts with ``http://`` or ``https://`` are
+    counted. Image embeds (``![alt](url)``) are not citations and are
+    excluded; they are counted separately by ``count_images``.
+
+    Args:
+        text: Markdown text to scan.
+
+    Returns:
+        Number of inline http(s) links found.
+    """
+    # (?<!!) rejects a "[" that is immediately preceded by "!", i.e. the
+    # start of an image embed, which would otherwise inflate the count.
+    pattern = r"(?<!!)\[.+?\]\(https?://[^\s\)]+\)"
+    return len(re.findall(pattern, text))
+
+
+def extract_domains(text: str) -> List[str]:
+    """Extract unique domains from URLs in the text.
+
+    Each ``http://`` / ``https://`` URL is reduced to its lower-cased host
+    name: userinfo and port are dropped, a leading ``www.`` is removed, and
+    sentence punctuation captured after a bare URL is trimmed. URLs that
+    cannot be parsed are skipped.
+
+    Args:
+        text: Text (typically markdown) to scan for URLs.
+
+    Returns:
+        The distinct host names, sorted so the result is deterministic.
+    """
+    url_pattern = r"https?://[^\s\)\]]+"
+    domains = set()
+    for url in re.findall(url_pattern, text):
+        try:
+            # .hostname is already lower-cased and excludes port/userinfo.
+            host = urlparse(url).hostname
+        except ValueError:
+            # e.g. a truncated bracketed IPv6 literal.
+            continue
+        if not host:
+            continue
+        # A URL ending a sentence drags the punctuation into the match.
+        host = host.rstrip(".,;:!?'\">")
+        # Strip only a leading "www."; replacing it anywhere would corrupt
+        # hosts such as "awww.example.com".
+        if host.startswith("www."):
+            host = host[len("www."):]
+        if host:
+            domains.add(host)
+    return sorted(domains)
+
+
+def count_images(text: str) -> int:
+    """Return how many markdown image embeds (``![alt](target)``) appear.
+
+    The alt text may be empty; the target must be non-empty.
+    """
+    image_embeds = re.finditer(r"!\[.*?\]\(.+?\)", text)
+    return sum(1 for _ in image_embeds)
+
+
+def detect_sections(text: str, report_style: str = "default") -> Dict[str, bool]:
+    """Report, for each section the style requires, whether it is present.
+
+    Args:
+        text: The report in markdown format.
+        report_style: Key into REPORT_STYLE_SECTIONS; an unknown style falls
+            back to the "default" section list.
+
+    Returns:
+        Mapping of section name to True/False, in the style's section order.
+    """
+    wanted = REPORT_STYLE_SECTIONS.get(report_style, REPORT_STYLE_SECTIONS["default"])
+    lowered = text.lower()
+    body_flags = re.IGNORECASE | re.MULTILINE
+
+    presence = {}
+    for name in wanted:
+        # Sections without a dedicated pattern are matched by their own
+        # name, with underscores allowing optional whitespace.
+        regex = SECTION_PATTERNS.get(name, name.replace("_", r"\s*"))
+        if name == "title":
+            # The title is searched in the original text, line-anchored.
+            hit = re.search(regex, text, re.MULTILINE)
+        else:
+            hit = re.search(regex, lowered, body_flags)
+        presence[name] = hit is not None
+    return presence
+
+
+def compute_metrics(
+    report: str, report_style: str = "default", target_word_count: Optional[int] = None
+) -> ReportMetrics:
+    """
+    Run every automated measurement on a report and bundle the results.
+
+    Args:
+        report: The report text in markdown format
+        report_style: The style of report (academic, news, etc.); selects
+            which sections are required
+        target_word_count: Accepted for API compatibility; this function
+            does not currently read it
+
+    Returns:
+        ReportMetrics populated with word, citation, source, image and
+        section measurements
+    """
+    presence = detect_sections(report, report_style)
+    found = [name for name, hit in presence.items() if hit]
+    missing = [name for name, hit in presence.items() if not hit]
+
+    # With no required sections the coverage stays at its 0.0 default.
+    required_total = len(presence)
+    coverage = len(found) / required_total if required_total > 0 else 0.0
+
+    return ReportMetrics(
+        word_count=count_words(report),
+        citation_count=count_citations(report),
+        unique_sources=len(extract_domains(report)),
+        image_count=count_images(report),
+        section_count=len(found),
+        sections_found=found,
+        sections_missing=missing,
+        section_coverage_score=coverage,
+        has_title=presence.get("title", False),
+        has_key_points=presence.get("key_points", False),
+        has_overview=presence.get("overview", False),
+        has_citations_section=presence.get("key_citations", False),
+    )
+
+
+def get_word_count_target(report_style: str) -> Dict[str, int]:
+    """Return the ``{"min": ..., "max": ...}`` word-count range for a style.
+
+    Unknown styles get the "default" range. A fresh dict is returned on
+    every call.
+    """
+    bounds_by_style = {
+        "strategic_investment": (10000, 15000),
+        "academic": (3000, 8000),
+        "news": (800, 2000),
+        "popular_science": (1500, 4000),
+        "social_media": (500, 1500),
+        "default": (1000, 5000),
+    }
+    if report_style in bounds_by_style:
+        lower, upper = bounds_by_style[report_style]
+    else:
+        lower, upper = bounds_by_style["default"]
+    return {"min": lower, "max": upper}