fix(uploads): handle split-bold headings and ** ** artefacts in extract_outline (#1838)

* feat(uploads): guide agent to use grep/glob/read_file for uploaded documents Add workflow guidance to the <uploaded_files> context block so the agent knows to use grep and glob (added in #1784) alongside read_file when working with uploaded documents, rather than falling back to web search. This is the final piece of the three-PR PDF agentic search pipeline: - PR1 (#1727): pymupdf4llm converter produces structured Markdown with headings - PR2 (#1738): document outline injected into agent context with line numbers - PR3 (this): agent guided to use outline + grep + read_file workflow * feat(uploads): add file-first priority and fallback guidance to uploaded_files context * fix(uploads): handle split-bold headings and ** ** artefacts in extract_outline - Add _clean_bold_title() to merge adjacent bold spans (** **) produced by pymupdf4llm when bold text crosses span boundaries - Add _SPLIT_BOLD_HEADING_RE (Style 3) to recognise **<num>** **<title>** headings common in academic papers; excludes pure-number table headers and rows with more than 4 bold blocks - When outline is empty, read first 5 non-empty lines of the .md as a content preview and surface a grep hint in the agent context - Update _format_file_entry to render the preview + grep hint instead of silently omitting the outline section - Add 3 new extract_outline tests and 2 new middleware tests (65 total) * fix(uploads): address Copilot review comments on extract_outline regex - Replace ASCII [A-Za-z] guard with negative lookahead to support non-ASCII titles (e.g. **1** **概述**); pure-numeric/punctuation blocks still excluded - Replace .+ with [^*]+ and cap repetition at {0,2} (four blocks total) to keep _SPLIT_BOLD_HEADING_RE linear and avoid ReDoS on malformed input - Remove now-redundant len(blocks) <= 4 code-level check (enforced by regex) - Log debug message with exc_info when preview extraction fails
2026-05-24 00:45:57 +00:00 · 2026-04-04 14:25:08 +08:00
parent 19809800f1
commit 163121d327
4 changed files with 177 additions and 19 deletions
@@ -182,6 +182,19 @@ async def convert_file_to_markdown(file_path: Path) -> Path | None:
 # by pymupdf4llm, so they don't need this pattern.
 _BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")

+# Regex for split-bold headings produced by pymupdf4llm when a heading spans
+# multiple text spans in the PDF (e.g. section number and title are separate spans).
+# Matches lines like:  **1** **Introduction**  or  **3.2** **Multi-Head Attention**
+# Requirements:
+#   1. Entire line consists only of **...** blocks separated by whitespace (no prose)
+#   2. First block is a section number: one digit or uppercase letter, then digits and dots (e.g. "1", "3.2", "A.1")
+#   3. Second block must not be number-like (a leading digit followed only by digits, whitespace
+#      or punctuation) — excludes financial table headers like **2023** **2022** **2021** while
+#      allowing non-ASCII titles such as **1** **概述** (negative lookahead instead of [A-Za-z])
+#   4. At most two additional blocks (four total) with [^*]+ (no * inside) to keep
+#      the regex linear and avoid ReDoS on attacker-controlled content
+_SPLIT_BOLD_HEADING_RE = re.compile(r"^\*\*[\dA-Z][\d\.]*\*\*\s+\*\*(?!\d[\d\s.,\-–—/:()%]*\*\*)[^*]+\*\*(?:\s+\*\*[^*]+\*\*){0,2}\s*$")
+
 # Maximum number of outline entries injected into the agent context.
 # Keeps prompt size bounded even for very long documents.
 MAX_OUTLINE_ENTRIES = 50
@@ -189,14 +202,43 @@ MAX_OUTLINE_ENTRIES = 50
 _ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}


+def _clean_bold_title(raw: str) -> str:
+    """Normalise a title string that may contain pymupdf4llm bold artefacts.
+
+    Adjacent bold spans (``**A** **B**``) are merged into one (``**A B**``).
+    The ``**...**`` wrapper is then removed only when it is a single bold block
+    spanning the whole string, so partially bold titles keep balanced markers.
+
+    Examples::
+
+        "**Overview**"                       → "Overview"
+        "**UNITED STATES** **SECURITIES**"   → "UNITED STATES SECURITIES"
+        "**Part A** and **Part B**"          → unchanged (two separate blocks)
+        "plain text"                         → "plain text"  (unchanged)
+    """
+    merged = re.sub(r"\*\*\s*\*\*", " ", raw).strip()  # "** **" → " "
+    # An inner "**" means several separate spans, not one wrapper: leave as-is.
+    if (m := re.fullmatch(r"\*\*(.+?)\*\*", merged, re.DOTALL)) and "**" not in m.group(1):
+        return m.group(1).strip()
+    return merged
+
+
 def extract_outline(md_path: Path) -> list[dict]:
    """Extract document outline (headings) from a Markdown file.

-    Recognises two heading styles produced by pymupdf4llm:
-    1. Standard Markdown headings: lines starting with one or more '#'
-    2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc.
-       (SEC filings use bold+caps for section headings with the same font size
-       as body text, so pymupdf4llm cannot promote them to # headings)
+    Recognises three heading styles produced by pymupdf4llm:
+
+    1. Standard Markdown headings: lines starting with one or more '#'.
+       Inline ``**...**`` wrappers and adjacent bold spans (``** **``) are
+       cleaned so the title is plain text.
+
+    2. Bold-only structural headings: ``**ITEM 1. BUSINESS**``, ``**PART II**``,
+       etc.  SEC filings use bold+caps for section headings with the same font
+       size as body text, so pymupdf4llm cannot promote them to # headings.
+
+    3. Split-bold headings: ``**1** **Introduction**``, ``**3.2** **Attention**``.
+       pymupdf4llm emits these when the section number and title text are
+       separate spans in the underlying PDF (common in academic papers).

    Args:
        md_path: Path to the .md file.
@@ -218,19 +260,23 @@ def extract_outline(md_path: Path) -> list[dict]:

                # Style 1: standard Markdown heading
                if stripped.startswith("#"):
-                    title = stripped.lstrip("#").strip()
-                    # Strip any inline **...** wrapping (e.g. "## **Overview**" → "Overview")
+                    title = _clean_bold_title(stripped.lstrip("#").strip())
                    if title:
-                        if m2 := re.fullmatch(r"\*\*(.+?)\*\*", title):
-                            title = m2.group(1).strip()
                        outline.append({"title": title, "line": lineno})

-                # Style 2: bold-only line (entire line is **...**)
+                # Style 2: single bold block with SEC structural keyword
                elif m := _BOLD_HEADING_RE.match(stripped):
                    title = m.group(1).strip()
                    if title:
                        outline.append({"title": title, "line": lineno})

+                # Style 3: split-bold heading — **<num>** **<title>**
+                # Regex already enforces max 4 blocks and non-numeric second block.
+                elif _SPLIT_BOLD_HEADING_RE.match(stripped):
+                    title = " ".join(re.findall(r"\*\*([^*]+)\*\*", stripped))
+                    if title:
+                        outline.append({"title": title, "line": lineno})
+
                if len(outline) >= MAX_OUTLINE_ENTRIES:
                    outline.append({"truncated": True})
                    break