mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-22 16:06:50 +00:00
feat(uploads): inject document outline into agent context for converted files (#1738)
* feat(uploads): inject document outline into agent context for converted files
Extract headings from converted .md files and inject them into the
<uploaded_files> context block so the agent can navigate large documents
by line number before reading.
- Add `extract_outline()` to `file_conversion.py`: recognises standard
Markdown headings (#/##/###) and SEC-style bold structural headings
(**ITEM N. BUSINESS**, **PART II**); caps at 50 entries; excludes
cover-page boilerplate (WASHINGTON DC, CURRENT REPORT, SIGNATURES)
- Add `_extract_outline_for_file()` helper in `uploads_middleware.py`:
looks for a sibling `.md` file produced by the conversion pipeline
- Update `UploadsMiddleware._create_files_message()` to render the outline
under each file entry with `L{line}: {title}` format and a `read_file`
prompt for range-based reading
- Tests: 10 new tests for `extract_outline()`, 4 new tests for outline
injection in `UploadsMiddleware`; existing test updated for new `outline`
field in `uploaded_files` state
Partially addresses #1647 (agent ignores uploaded files).
* fix(uploads): stream outline file reads and strip inline bold from heading titles
- Switch extract_outline() from read_text().splitlines() to open()+line iteration
so large converted documents are not loaded into memory on every agent turn;
exits as soon as MAX_OUTLINE_ENTRIES is reached (Copilot suggestion)
- Strip **...** wrapper from standard Markdown heading titles before appending
to outline so agent context stays clean (e.g. "## **Overview**" → "Overview")
(Copilot suggestion)
- Remove unused pathlib.Path import and fix import sort order in test_file_conversion.py
to satisfy ruff CI lint
* fix(uploads): show truncation hint when outline exceeds MAX_OUTLINE_ENTRIES
When extract_outline() hits the cap it now appends a sentinel entry
{"truncated": True} instead of silently dropping the rest of the headings.
UploadsMiddleware reads the sentinel and renders a hint line:
... (showing first 50 headings; use `read_file` to explore further)
Without this the agent had no way to know the outline was incomplete and
would treat the first 50 headings as the full document structure.
* fix(uploads): fall back to configurable.thread_id when runtime.context lacks thread_id
runtime.context does not always carry thread_id (depends on LangGraph
invocation path). ThreadDataMiddleware already falls back to
get_config().configurable.thread_id — apply the same pattern so
UploadsMiddleware can resolve the uploads directory and attach outlines
in all invocation paths.
* style: apply ruff format
---------
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
@@ -5,6 +5,7 @@ No FastAPI or HTTP dependencies — pure utility functions.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -45,3 +46,90 @@ async def convert_file_to_markdown(file_path: Path) -> Path | None:
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to convert {file_path.name} to markdown: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# Regex for bold-only lines that look like section headings.
|
||||
# Targets SEC filing structural headings that pymupdf4llm renders as **bold**
|
||||
# rather than # Markdown headings (because they use same font size as body text,
|
||||
# distinguished only by bold+caps formatting).
|
||||
#
|
||||
# Pattern requires ALL of:
|
||||
# 1. Entire line is a single **...** block (no surrounding prose)
|
||||
# 2. Starts with a recognised structural keyword:
|
||||
# - ITEM / PART / SECTION (with optional number/letter after)
|
||||
# - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER
|
||||
# All-caps addresses, boilerplate ("CURRENT REPORT", "SIGNATURES",
|
||||
# "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded.
|
||||
#
|
||||
# Chinese headings (第三节...) are already captured as standard # headings
|
||||
# by pymupdf4llm, so they don't need this pattern.
|
||||
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")

# Maximum number of outline entries injected into the agent context.
# Keeps prompt size bounded even for very long documents.
MAX_OUTLINE_ENTRIES = 50

# A heading title that consists of one **...** wrapper and nothing else.
_BOLD_WRAPPED_TITLE_RE = re.compile(r"\*\*(.+?)\*\*")


def _heading_title(stripped: str) -> str:
    """Return the heading title carried by a whitespace-stripped line, or ""."""
    # Style 1: standard Markdown heading ("#", "##", ...).
    if stripped.startswith("#"):
        title = stripped.lstrip("#").strip()
        wrapped = _BOLD_WRAPPED_TITLE_RE.fullmatch(title)
        # Unwrap "## **Overview**" -> "Overview", but leave titles with
        # several bold runs ("**A** and **B**") untouched: fullmatch would
        # otherwise stretch across them and yield the mangled "A** and **B".
        if wrapped and "**" not in wrapped.group(1):
            title = wrapped.group(1).strip()
        return title

    # Style 2: bold-only line starting with a structural keyword.
    bold = _BOLD_HEADING_RE.match(stripped)
    if bold:
        return bold.group(1).strip()

    return ""


def extract_outline(md_path: Path) -> list[dict]:
    """Extract document outline (headings) from a Markdown file.

    Recognises two heading styles produced by pymupdf4llm:
    1. Standard Markdown headings: lines starting with one or more '#'
    2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc.
       (SEC filings use bold+caps for section headings with the same font size
       as body text, so pymupdf4llm cannot promote them to # headings)

    The file is read line by line, so large documents are never loaded into
    memory as a whole. Headings whose title is empty are skipped.

    Args:
        md_path: Path to the .md file.

    Returns:
        List of dicts with keys: title (str), line (int, 1-based).
        When the file contains more than MAX_OUTLINE_ENTRIES headings, only the
        first MAX_OUTLINE_ENTRIES are returned and a sentinel entry
        ``{"truncated": True}`` is appended as the last element so callers can
        render a "showing first N headings" hint without re-scanning the file.
        A file with exactly MAX_OUTLINE_ENTRIES headings gets no sentinel.
        Returns an empty list if the file cannot be read or has no headings.
    """
    outline: list[dict] = []
    try:
        with md_path.open(encoding="utf-8") as f:
            for lineno, line in enumerate(f, 1):
                title = _heading_title(line.strip())
                if not title:
                    continue

                # Only flag truncation once a heading beyond the cap is
                # actually seen; hitting the cap exactly loses nothing.
                if len(outline) >= MAX_OUTLINE_ENTRIES:
                    outline.append({"truncated": True})
                    break

                outline.append({"title": title, "line": lineno})
    except Exception:
        # Best-effort: the outline is optional context, so never propagate.
        logging.getLogger(__name__).debug("Could not extract outline from %s", md_path, exc_info=True)
        return []

    return outline
|
||||
|
||||
|
||||
def _get_pdf_converter() -> str:
|
||||
"""Read pdf_converter setting from app config, defaulting to 'auto'."""
|
||||
try:
|
||||
from deerflow.config.app_config import get_app_config
|
||||
|
||||
cfg = get_app_config()
|
||||
uploads_cfg = getattr(cfg, "uploads", None)
|
||||
if uploads_cfg is not None:
|
||||
return str(getattr(uploads_cfg, "pdf_converter", "auto"))
|
||||
except Exception:
|
||||
pass
|
||||
return "auto"
|
||||
|
||||
Reference in New Issue
Block a user