mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-22 16:06:50 +00:00
fix(uploads): handle split-bold headings and ** ** artefacts in extract_outline (#1838)
* feat(uploads): guide agent to use grep/glob/read_file for uploaded documents Add workflow guidance to the <uploaded_files> context block so the agent knows to use grep and glob (added in #1784) alongside read_file when working with uploaded documents, rather than falling back to web search. This is the final piece of the three-PR PDF agentic search pipeline: - PR1 (#1727): pymupdf4llm converter produces structured Markdown with headings - PR2 (#1738): document outline injected into agent context with line numbers - PR3 (this): agent guided to use outline + grep + read_file workflow * feat(uploads): add file-first priority and fallback guidance to uploaded_files context * fix(uploads): handle split-bold headings and ** ** artefacts in extract_outline - Add _clean_bold_title() to merge adjacent bold spans (** **) produced by pymupdf4llm when bold text crosses span boundaries - Add _SPLIT_BOLD_HEADING_RE (Style 3) to recognise **<num>** **<title>** headings common in academic papers; excludes pure-number table headers and rows with more than 4 bold blocks - When outline is empty, read first 5 non-empty lines of the .md as a content preview and surface a grep hint in the agent context - Update _format_file_entry to render the preview + grep hint instead of silently omitting the outline section - Add 3 new extract_outline tests and 2 new middleware tests (65 total) * fix(uploads): address Copilot review comments on extract_outline regex - Replace ASCII [A-Za-z] guard with negative lookahead to support non-ASCII titles (e.g. **1** **概述**); pure-numeric/punctuation blocks still excluded - Replace .+ with [^*]+ and cap repetition at {0,2} (four blocks total) to keep _SPLIT_BOLD_HEADING_RE linear and avoid ReDoS on malformed input - Remove now-redundant len(blocks) <= 4 code-level check (enforced by regex) - Log debug message with exc_info when preview extraction fails
This commit is contained in:
@@ -420,3 +420,40 @@ class TestExtractOutline:
|
||||
)
|
||||
outline = extract_outline(md)
|
||||
assert outline == []
|
||||
|
||||
def test_split_bold_heading_academic_paper(self, tmp_path):
|
||||
"""**<num>** **<title>** lines from academic papers are recognised (Style 3)."""
|
||||
md = tmp_path / "paper.md"
|
||||
md.write_text(
|
||||
"## **Attention Is All You Need**\n\n**1** **Introduction**\n\nBody text.\n\n**2** **Background**\n\nMore text.\n\n**3.1** **Encoder and Decoder Stacks**\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
outline = extract_outline(md)
|
||||
titles = [e["title"] for e in outline]
|
||||
assert "1 Introduction" in titles
|
||||
assert "2 Background" in titles
|
||||
assert "3.1 Encoder and Decoder Stacks" in titles
|
||||
|
||||
def test_split_bold_year_columns_excluded(self, tmp_path):
|
||||
"""Financial table headers like **2023** **2022** **2021** are NOT headings."""
|
||||
md = tmp_path / "annual.md"
|
||||
md.write_text(
|
||||
"# Financial Summary\n\n**2023** **2022** **2021**\n\nRevenue 100 90 80\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
outline = extract_outline(md)
|
||||
titles = [e["title"] for e in outline]
|
||||
# Only the # heading should appear, not the year-column row
|
||||
assert titles == ["Financial Summary"]
|
||||
|
||||
def test_adjacent_bold_spans_merged_in_markdown_heading(self, tmp_path):
|
||||
"""** ** artefacts inside a # heading are merged into clean plain text."""
|
||||
md = tmp_path / "sec.md"
|
||||
md.write_text(
|
||||
"## **UNITED STATES** **SECURITIES AND EXCHANGE COMMISSION**\n\nBody text.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
outline = extract_outline(md)
|
||||
assert len(outline) == 1
|
||||
# Title must be clean — no ** ** artefacts
|
||||
assert outline[0]["title"] == "UNITED STATES SECURITIES AND EXCHANGE COMMISSION"
|
||||
|
||||
@@ -290,6 +290,7 @@ class TestBeforeAgent:
|
||||
"path": "/mnt/user-data/uploads/notes.txt",
|
||||
"extension": ".txt",
|
||||
"outline": [],
|
||||
"outline_preview": [],
|
||||
}
|
||||
]
|
||||
|
||||
@@ -429,3 +430,41 @@ class TestBeforeAgent:
|
||||
content = result["messages"][-1].content
|
||||
assert "Chapter 1" in content
|
||||
assert "Chapter 2" in content
|
||||
|
||||
def test_fallback_preview_shown_when_outline_empty(self, tmp_path):
|
||||
"""When .md exists but has no headings, first lines are shown as a preview."""
|
||||
mw = _middleware(tmp_path)
|
||||
uploads_dir = _uploads_dir(tmp_path)
|
||||
(uploads_dir / "report.pdf").write_bytes(b"%PDF fake")
|
||||
# .md with no # headings — plain prose only
|
||||
(uploads_dir / "report.md").write_text(
|
||||
"Annual Financial Report 2024\n\nThis document summarises key findings.\n\nRevenue grew by 12%.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
msg = _human("analyse", files=[{"filename": "report.pdf", "size": 9, "path": "/mnt/user-data/uploads/report.pdf"}])
|
||||
result = mw.before_agent(self._state(msg), _runtime())
|
||||
|
||||
assert result is not None
|
||||
content = result["messages"][-1].content
|
||||
# Outline section must NOT appear
|
||||
assert "Document outline" not in content
|
||||
# Preview lines must appear
|
||||
assert "Annual Financial Report 2024" in content
|
||||
assert "No structural headings detected" in content
|
||||
# grep hint must appear
|
||||
assert "grep" in content
|
||||
|
||||
def test_fallback_grep_hint_shown_when_no_md_file(self, tmp_path):
|
||||
"""Files with no sibling .md still get the grep hint (outline is empty)."""
|
||||
mw = _middleware(tmp_path)
|
||||
uploads_dir = _uploads_dir(tmp_path)
|
||||
(uploads_dir / "data.csv").write_bytes(b"a,b,c\n1,2,3\n")
|
||||
|
||||
msg = _human("analyse", files=[{"filename": "data.csv", "size": 12, "path": "/mnt/user-data/uploads/data.csv"}])
|
||||
result = mw.before_agent(self._state(msg), _runtime())
|
||||
|
||||
assert result is not None
|
||||
content = result["messages"][-1].content
|
||||
assert "Document outline" not in content
|
||||
assert "grep" in content
|
||||
|
||||
Reference in New Issue
Block a user