feat(uploads): inject document outline into agent context for converted files (#1738)

* feat(uploads): inject document outline into agent context for converted files

Extract headings from converted .md files and inject them into the
<uploaded_files> context block so the agent can navigate large documents
by line number before reading.

- Add `extract_outline()` to `file_conversion.py`: recognises standard
  Markdown headings (#/##/###) and SEC-style bold structural headings
  (**ITEM N. BUSINESS**, **PART II**); caps at 50 entries; excludes
  cover-page boilerplate (WASHINGTON DC, CURRENT REPORT, SIGNATURES)
- Add `_extract_outline_for_file()` helper in `uploads_middleware.py`:
  looks for a sibling `.md` file produced by the conversion pipeline
- Update `UploadsMiddleware._create_files_message()` to render the outline
  under each file entry with `L{line}: {title}` format and a `read_file`
  prompt for range-based reading
- Tests: 10 new tests for `extract_outline()`, 4 new tests for outline
  injection in `UploadsMiddleware`; existing test updated for new `outline`
  field in `uploaded_files` state

Partially addresses #1647 (agent ignores uploaded files).

* fix(uploads): stream outline file reads and strip inline bold from heading titles

- Switch extract_outline() from read_text().splitlines() to open()+line iteration
  so large converted documents are not loaded into memory on every agent turn;
  exits as soon as MAX_OUTLINE_ENTRIES is reached (Copilot suggestion)
- Strip **...** wrapper from standard Markdown heading titles before appending
  to outline so agent context stays clean (e.g. "## **Overview**" → "Overview")
  (Copilot suggestion)
- Remove unused pathlib.Path import and fix import sort order in test_file_conversion.py
  to satisfy ruff CI lint

* fix(uploads): show truncation hint when outline exceeds MAX_OUTLINE_ENTRIES

When extract_outline() hits the cap it now appends a sentinel entry
{"truncated": True} instead of silently dropping the rest of the headings.
UploadsMiddleware reads the sentinel and renders a hint line:

  ... (showing first 50 headings; use `read_file` to explore further)

Without this the agent had no way to know the outline was incomplete and
would treat the first 50 headings as the full document structure.

* fix(uploads): fall back to configurable.thread_id when runtime.context lacks thread_id

runtime.context does not always carry thread_id (depends on LangGraph
invocation path). ThreadDataMiddleware already falls back to
get_config().configurable.thread_id — apply the same pattern so
UploadsMiddleware can resolve the uploads directory and attach outlines
in all invocation paths.

* style: apply ruff format

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
SHIYAO ZHANG
2026-04-03 20:52:47 +08:00
committed by GitHub
parent 46d0c329c1
commit 5ff230eafd
4 changed files with 354 additions and 10 deletions
@@ -10,10 +10,27 @@ from langchain_core.messages import HumanMessage
from langgraph.runtime import Runtime
from deerflow.config.paths import Paths, get_paths
from deerflow.utils.file_conversion import extract_outline
logger = logging.getLogger(__name__)
def _extract_outline_for_file(file_path: Path) -> list[dict]:
"""Return the document outline for *file_path* if a converted .md exists.
Looks for a sibling ``<stem>.md`` file produced by the upload conversion
pipeline. Returns an empty list when the file is not a converted document
or when no headings are found.
"""
md_path = file_path.with_suffix(".md")
if not md_path.is_file():
return []
outline = extract_outline(md_path)
if outline:
logger.debug("Extracted %d outline entries from %s", len(outline), file_path.name)
return outline
class UploadsMiddlewareState(AgentState):
"""State schema for uploads middleware."""
@@ -39,12 +56,31 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
super().__init__()
self._paths = Paths(base_dir) if base_dir else get_paths()
def _format_file_entry(self, file: dict, lines: list[str]) -> None:
"""Append a single file entry (name, size, path, optional outline) to lines."""
size_kb = file["size"] / 1024
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
lines.append(f"- {file['filename']} ({size_str})")
lines.append(f" Path: {file['path']}")
outline = file.get("outline") or []
if outline:
truncated = outline[-1].get("truncated", False) if outline else False
visible = [e for e in outline if not e.get("truncated")]
lines.append(" Document outline (use `read_file` with line ranges to read sections):")
for entry in visible:
lines.append(f" L{entry['line']}: {entry['title']}")
if truncated:
lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)")
lines.append("")
def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
"""Create a formatted message listing uploaded files.
Args:
new_files: Files uploaded in the current message.
historical_files: Files uploaded in previous messages.
Each file dict may contain an optional ``outline`` key — a list of
``{title, line}`` dicts extracted from the converted Markdown file.
Returns:
Formatted string inside <uploaded_files> tags.
@@ -55,23 +91,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
lines.append("")
if new_files:
for file in new_files:
size_kb = file["size"] / 1024
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
lines.append(f"- {file['filename']} ({size_str})")
lines.append(f" Path: {file['path']}")
lines.append("")
self._format_file_entry(file, lines)
else:
lines.append("(empty)")
lines.append("")
if historical_files:
lines.append("The following files were uploaded in previous messages and are still available:")
lines.append("")
for file in historical_files:
size_kb = file["size"] / 1024
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
lines.append(f"- {file['filename']} ({size_str})")
lines.append(f" Path: {file['path']}")
lines.append("")
self._format_file_entry(file, lines)
lines.append("You can read these files using the `read_file` tool with the paths shown above.")
lines.append("</uploaded_files>")
@@ -172,9 +201,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
"size": stat.st_size,
"path": f"/mnt/user-data/uploads/{file_path.name}",
"extension": file_path.suffix,
"outline": _extract_outline_for_file(file_path),
}
)
# Attach outlines to new files as well
if uploads_dir:
for file in new_files:
phys_path = uploads_dir / file["filename"]
file["outline"] = _extract_outline_for_file(phys_path)
if not new_files and not historical_files:
return None
@@ -5,6 +5,7 @@ No FastAPI or HTTP dependencies — pure utility functions.
"""
import logging
import re
from pathlib import Path
logger = logging.getLogger(__name__)
@@ -45,3 +46,90 @@ async def convert_file_to_markdown(file_path: Path) -> Path | None:
except Exception as e:
logger.error(f"Failed to convert {file_path.name} to markdown: {e}")
return None
# Regex for bold-only lines that look like section headings.
# Targets SEC filing structural headings that pymupdf4llm renders as **bold**
# rather than # Markdown headings (because they use same font size as body text,
# distinguished only by bold+caps formatting).
#
# Pattern requires ALL of:
# 1. Entire line is a single **...** block (no surrounding prose)
# 2. Starts with a recognised structural keyword:
# - ITEM / PART / SECTION (with optional number/letter after)
# - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER
# All-caps addresses, boilerplate ("CURRENT REPORT", "SIGNATURES",
# "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded.
#
# Chinese headings (第三节...) are already captured as standard # headings
# by pymupdf4llm, so they don't need this pattern.
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")
# Maximum number of outline entries injected into the agent context.
# Keeps prompt size bounded even for very long documents.
MAX_OUTLINE_ENTRIES = 50
def extract_outline(md_path: Path) -> list[dict]:
"""Extract document outline (headings) from a Markdown file.
Recognises two heading styles produced by pymupdf4llm:
1. Standard Markdown headings: lines starting with one or more '#'
2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc.
(SEC filings use bold+caps for section headings with the same font size
as body text, so pymupdf4llm cannot promote them to # headings)
Args:
md_path: Path to the .md file.
Returns:
List of dicts with keys: title (str), line (int, 1-based).
When the outline is truncated at MAX_OUTLINE_ENTRIES, a sentinel entry
``{"truncated": True}`` is appended as the last element so callers can
render a "showing first N headings" hint without re-scanning the file.
Returns an empty list if the file cannot be read or has no headings.
"""
outline: list[dict] = []
try:
with md_path.open(encoding="utf-8") as f:
for lineno, line in enumerate(f, 1):
stripped = line.strip()
if not stripped:
continue
# Style 1: standard Markdown heading
if stripped.startswith("#"):
title = stripped.lstrip("#").strip()
# Strip any inline **...** wrapping (e.g. "## **Overview**" → "Overview")
if title:
if m2 := re.fullmatch(r"\*\*(.+?)\*\*", title):
title = m2.group(1).strip()
outline.append({"title": title, "line": lineno})
# Style 2: bold-only line (entire line is **...**)
elif m := _BOLD_HEADING_RE.match(stripped):
title = m.group(1).strip()
if title:
outline.append({"title": title, "line": lineno})
if len(outline) >= MAX_OUTLINE_ENTRIES:
outline.append({"truncated": True})
break
except Exception:
return []
return outline
def _get_pdf_converter() -> str:
"""Read pdf_converter setting from app config, defaulting to 'auto'."""
try:
from deerflow.config.app_config import get_app_config
cfg = get_app_config()
uploads_cfg = getattr(cfg, "uploads", None)
if uploads_cfg is not None:
return str(getattr(uploads_cfg, "pdf_converter", "auto"))
except Exception:
pass
return "auto"