fix(frontend): sanitize JSON export content via the Markdown content path

Address @ShenAC-SAC's BUG-006 review and the Copilot inline comment on #3131. The previous cut filtered hidden/tool messages out of the JSON export but still serialised `msg.content` verbatim, so: - inline `<think>…</think>` wrappers stayed in the exported `content` even with `includeReasoning: false`, - content-array thinking blocks leaked the `thinking` field, - `<uploaded_files>…</uploaded_files>` markers leaked the workspace paths a user uploaded files to. JSON now goes through the same sanitiser the Markdown path uses (`extractContentFromMessage` + `stripUploadedFilesTag`). Reasoning and tool_calls remain gated behind their `ExportOptions` flags. AI / human rows that sanitise to empty content with no opted-in reasoning or tool calls are dropped so the JSON matches the Markdown path's `continue` on empty assistant fragments. New regression tests cover the three leak shapes the reviewer called out plus the empty-content-drop case. Refs: bytedance/deer-flow#3107 (BUG-006), bytedance/deer-flow#3131 review
2026-05-22 16:06:50 +00:00 · 2026-05-21 18:52:39 +08:00
parent 50e2c257bf
commit 258ca800fe
2 changed files with 96 additions and 10 deletions
@@ -122,6 +122,51 @@ export function formatThreadAsMarkdown(
  return lines.join("\n").trimEnd() + "\n";
 }

+interface JSONExportMessage {
+  type: Message["type"];
+  id: string | undefined;
+  content: string;
+  reasoning?: string;
+  tool_calls?: unknown;
+}
+
+function buildJSONMessage(
+  msg: Message,
+  options: ExportOptions,
+): JSONExportMessage | null {
+  // Run the same sanitiser the Markdown path uses so the JSON `content`
+  // field never carries inline `<think>...</think>` wrappers, content-array
+  // thinking blocks, `<uploaded_files>` markers, or other internal payloads.
+  const content = formatMessageContent(msg);
+  const reasoning =
+    options.includeReasoning && msg.type === "ai"
+      ? (extractReasoningContentFromMessage(msg) ?? undefined)
+      : undefined;
+  const toolCalls =
+    options.includeToolCalls &&
+    msg.type === "ai" &&
+    "tool_calls" in msg &&
+    msg.tool_calls?.length
+      ? msg.tool_calls
+      : undefined;
+
+  // Drop rows with no exportable payload (empty content + no opted-in
+  // reasoning / tool_calls). This matches the Markdown path's `continue`
+  // on `!content && !toolCalls && !reasoning` so the two formats agree on
+  // which AI fragments are visible to the user.
+  if (!content && reasoning === undefined && toolCalls === undefined) {
+    return null;
+  }
+
+  return {
+    type: msg.type,
+    id: msg.id,
+    content,
+    ...(reasoning !== undefined ? { reasoning } : {}),
+    ...(toolCalls !== undefined ? { tool_calls: toolCalls } : {}),
+  };
+}
+
 export function formatThreadAsJSON(
  thread: AgentThread,
  messages: Message[],
@@ -132,16 +177,9 @@ export function formatThreadAsJSON(
    thread_id: thread.thread_id,
    created_at: thread.created_at,
    exported_at: new Date().toISOString(),
-    messages: visibleMessages(messages, options).map((msg) => ({
-      type: msg.type,
-      id: msg.id,
-      content: msg.content,
-      ...(options.includeToolCalls &&
-      msg.type === "ai" &&
-      msg.tool_calls?.length
-        ? { tool_calls: msg.tool_calls }
-        : {}),
-    })),
+    messages: visibleMessages(messages, options)
+      .map((msg) => buildJSONMessage(msg, options))
+      .filter((m): m is JSONExportMessage => m !== null),
  };
  return JSON.stringify(exportData, null, 2);
 }
@@ -191,4 +191,52 @@ describe("formatThreadAsJSON", () => {
    expect(raw).not.toContain("internal trace");
    expect(raw).not.toContain("tool_calls");
  });
+
+  it("strips inline <think>...</think> wrappers from content", () => {
+    // bytedance/deer-flow#3131 review: JSON export must run the same
+    // sanitiser the Markdown path uses so inline reasoning never leaks
+    // even when `includeReasoning` is left at its default false.
+    const message = ai("<think>internal monologue</think>visible answer", {
+      id: "ai-1",
+    } as Partial<Message>);
+    const raw = formatThreadAsJSON(makeThread(), [message]);
+    expect(raw).not.toContain("internal monologue");
+    expect(raw).not.toContain("<think>");
+    expect(raw).toContain("visible answer");
+  });
+
+  it("strips content-array thinking blocks from content", () => {
+    const message = ai("placeholder", {
+      id: "ai-2",
+      content: [
+        { type: "thinking", thinking: "hidden reasoning step" },
+        { type: "text", text: "final visible text" },
+      ],
+    } as unknown as Partial<Message>);
+    const raw = formatThreadAsJSON(makeThread(), [message]);
+    expect(raw).not.toContain("hidden reasoning step");
+    expect(raw).toContain("final visible text");
+  });
+
+  it("strips <uploaded_files> markers from content", () => {
+    const message = human(
+      "real prompt\n<uploaded_files>\n/mnt/user-data/uploads/secret.pdf\n</uploaded_files>",
+      { id: "h-clean" } as Partial<Message>,
+    );
+    const raw = formatThreadAsJSON(makeThread(), [message]);
+    expect(raw).not.toContain("<uploaded_files>");
+    expect(raw).not.toContain("secret.pdf");
+    expect(raw).toContain("real prompt");
+  });
+
+  it("drops AI messages that sanitise to empty content", () => {
+    // Pure-reasoning AI fragments (no visible text, no tool calls) should
+    // not survive as `{content: ""}` rows in the export.
+    const message = ai("<think>only thinking, no answer</think>", {
+      id: "ai-3",
+    } as Partial<Message>);
+    const raw = formatThreadAsJSON(makeThread(), [message]);
+    const parsed = JSON.parse(raw) as { messages: unknown[] };
+    expect(parsed.messages).toHaveLength(0);
+  });
 });