feat(uploads): inject document outline into agent context for converted files (#1738)

* feat(uploads): inject document outline into agent context for converted files Extract headings from converted .md files and inject them into the <uploaded_files> context block so the agent can navigate large documents by line number before reading. - Add `extract_outline()` to `file_conversion.py`: recognises standard Markdown headings (#/##/###) and SEC-style bold structural headings (**ITEM N. BUSINESS**, **PART II**); caps at 50 entries; excludes cover-page boilerplate (WASHINGTON DC, CURRENT REPORT, SIGNATURES) - Add `_extract_outline_for_file()` helper in `uploads_middleware.py`: looks for a sibling `.md` file produced by the conversion pipeline - Update `UploadsMiddleware._create_files_message()` to render the outline under each file entry with `L{line}: {title}` format and a `read_file` prompt for range-based reading - Tests: 10 new tests for `extract_outline()`, 4 new tests for outline injection in `UploadsMiddleware`; existing test updated for new `outline` field in `uploaded_files` state Partially addresses #1647 (agent ignores uploaded files). * fix(uploads): stream outline file reads and strip inline bold from heading titles - Switch extract_outline() from read_text().splitlines() to open()+line iteration so large converted documents are not loaded into memory on every agent turn; exits as soon as MAX_OUTLINE_ENTRIES is reached (Copilot suggestion) - Strip **...** wrapper from standard Markdown heading titles before appending to outline so agent context stays clean (e.g. "## **Overview**" → "Overview") (Copilot suggestion) - Remove unused pathlib.Path import and fix import sort order in test_file_conversion.py to satisfy ruff CI lint * fix(uploads): show truncation hint when outline exceeds MAX_OUTLINE_ENTRIES When extract_outline() hits the cap it now appends a sentinel entry {"truncated": True} instead of silently dropping the rest of the headings. UploadsMiddleware reads the sentinel and renders a hint line: ... (showing first 50 headings; use `read_file` to explore further) Without this the agent had no way to know the outline was incomplete and would treat the first 50 headings as the full document structure. * fix(uploads): fall back to configurable.thread_id when runtime.context lacks thread_id runtime.context does not always carry thread_id (depends on LangGraph invocation path). ThreadDataMiddleware already falls back to get_config().configurable.thread_id — apply the same pattern so UploadsMiddleware can resolve the uploads directory and attach outlines in all invocation paths. * style: apply ruff format --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
2026-05-23 00:16:48 +00:00 · 2026-04-03 20:52:47 +08:00
parent 46d0c329c1
commit 5ff230eafd
4 changed files with 354 additions and 10 deletions
@@ -10,10 +10,27 @@ from langchain_core.messages import HumanMessage
 from langgraph.runtime import Runtime

 from deerflow.config.paths import Paths, get_paths
+from deerflow.utils.file_conversion import extract_outline

 logger = logging.getLogger(__name__)


+def _extract_outline_for_file(file_path: Path) -> list[dict]:
+    """Return the document outline for *file_path* if a converted .md exists.
+
+    Looks for a sibling ``<stem>.md`` file produced by the upload conversion
+    pipeline.  Returns an empty list when the file is not a converted document
+    or when no headings are found.
+    """
+    md_path = file_path.with_suffix(".md")
+    if not md_path.is_file():
+        return []
+    outline = extract_outline(md_path)
+    if outline:
+        logger.debug("Extracted %d outline entries from %s", len(outline), file_path.name)
+    return outline
+
+
 class UploadsMiddlewareState(AgentState):
    """State schema for uploads middleware."""

@@ -39,12 +56,31 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
        super().__init__()
        self._paths = Paths(base_dir) if base_dir else get_paths()

+    def _format_file_entry(self, file: dict, lines: list[str]) -> None:
+        """Append a single file entry (name, size, path, optional outline) to lines."""
+        size_kb = file["size"] / 1024
+        size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
+        lines.append(f"- {file['filename']} ({size_str})")
+        lines.append(f"  Path: {file['path']}")
+        outline = file.get("outline") or []
+        if outline:
+            truncated = outline[-1].get("truncated", False) if outline else False
+            visible = [e for e in outline if not e.get("truncated")]
+            lines.append("  Document outline (use `read_file` with line ranges to read sections):")
+            for entry in visible:
+                lines.append(f"    L{entry['line']}: {entry['title']}")
+            if truncated:
+                lines.append(f"    ... (showing first {len(visible)} headings; use `read_file` to explore further)")
+        lines.append("")
+
    def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
        """Create a formatted message listing uploaded files.

        Args:
            new_files: Files uploaded in the current message.
            historical_files: Files uploaded in previous messages.
+                Each file dict may contain an optional ``outline`` key — a list of
+                ``{title, line}`` dicts extracted from the converted Markdown file.

        Returns:
            Formatted string inside <uploaded_files> tags.
@@ -55,23 +91,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
        lines.append("")
        if new_files:
            for file in new_files:
-                size_kb = file["size"] / 1024
-                size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
-                lines.append(f"- {file['filename']} ({size_str})")
-                lines.append(f"  Path: {file['path']}")
-                lines.append("")
+                self._format_file_entry(file, lines)
        else:
            lines.append("(empty)")
+            lines.append("")

        if historical_files:
            lines.append("The following files were uploaded in previous messages and are still available:")
            lines.append("")
            for file in historical_files:
-                size_kb = file["size"] / 1024
-                size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
-                lines.append(f"- {file['filename']} ({size_str})")
-                lines.append(f"  Path: {file['path']}")
-                lines.append("")
+                self._format_file_entry(file, lines)

        lines.append("You can read these files using the `read_file` tool with the paths shown above.")
        lines.append("</uploaded_files>")
@@ -172,9 +201,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
                            "size": stat.st_size,
                            "path": f"/mnt/user-data/uploads/{file_path.name}",
                            "extension": file_path.suffix,
+                            "outline": _extract_outline_for_file(file_path),
                        }
                    )

+        # Attach outlines to new files as well
+        if uploads_dir:
+            for file in new_files:
+                phys_path = uploads_dir / file["filename"]
+                file["outline"] = _extract_outline_for_file(phys_path)
+
        if not new_files and not historical_files:
            return None