mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-10 01:15:58 +00:00
799bef6d9d
* fix(replay-e2e): match by conversation, not the living system prompt The model-replay match key hashed the full input including the lead-agent system prompt. That prompt is edited frequently (e.g. #3195 added a "File Editing Workflow" section), so the committed fixture went stale the moment the prompt changed on main — turning the Layer-2 render gate RED on every unrelated PR (#3430, #3432, ...). This was a self-inflicted false positive. Root-cause fix: - replay_provider._canonical_messages now EXCLUDES the system message from the hash. The conversation (human/ai/tool) is the stable contract that identifies a recorded turn; the system prompt is an internal detail not part of the front-back contract under test. (Mirrors how open-design keys its mock picker on the user prompt, not the system internals.) Proven robust: injecting a prompt edit no longer causes a replay miss. - Layer-1 golden was BLIND to replay misses: the gateway swallows a miss into an assistant error message, so the shape-only golden stayed green on a stale fixture. It now inspects replay_provider.replay_misses() and fails loud. (Layer-2 already fails on a miss.) - Re-recorded write_read_file.ultra fixture + regenerated golden under the new conversation-only hash. - Layer-2 render spec: assert the in-graph auto-title (deterministic); the follow-up suggestion is fired async and depends on a clean JSON model output, so assert it only when the fixture captured one — never gate on its absence (recording flakiness must not block CI). - docs: REPLAY_E2E.md updated. Verified: Layer-1 golden green (no miss), Layer-2 both specs green, CI=true make test 4033 passed / 0 failed, frontend pnpm check clean. * test(replay-e2e): restore suggestions coverage with a reliable capture Addresses review feedback (the suggestion path was dropped from Layer-2): - record spec now waits for the `/suggestions` response before checking capture stability, so the recorded fixture reliably includes the frontend-fired suggestions turn (previously the stability window could return before suggestions fired, yielding a fixture without it). - Re-recorded write_read_file.ultra: 5 turns (write_file, auto-title, read_file, answer, suggestions). Golden unchanged — suggestions is a separate /suggestions call, not part of the /runs/stream SSE sequence. - Layer-2 spec: restore the hard `EXPECTED_SUGGESTION` assertion. With the record spec now waiting for /suggestions, a fixture missing the suggestion turn means a broken recording and must fail loud, not pass silently. Verified: Layer-1 golden green (no miss), Layer-2 both specs green (auto-title + suggestion render), frontend pnpm check clean. * ci: re-trigger (flaky Docker Hub image pull in sandbox e2e, unrelated) backend-unit-tests failed only in test_sandbox_orphan_reconciliation_e2e.py with 'docker pull busybox:latest ... context deadline exceeded' — a CI-runner network flake reaching Docker Hub, not related to this docs/tests-only change. Empty commit to re-run CI. --------- Co-authored-by: DanielWalnut <45447813+hetaoBackend@users.noreply.github.com>
233 lines
7.3 KiB
JSON
233 lines
7.3 KiB
JSON
{
|
|
"scenario": "write_read_file",
|
|
"mode": "ultra",
|
|
"model": "sre/gpt-5",
|
|
"prompt": "Using your own file tools directly, create the file /mnt/user-data/outputs/note.txt with exactly this content: hi from replay. Then read that same file back and reply with its exact contents. Do NOT delegate to a subagent and do NOT use the task tool — do it yourself. Do not ask any clarifying questions.",
|
|
"context": {
|
|
"is_bootstrap": false,
|
|
"mode": "ultra",
|
|
"thinking_enabled": true,
|
|
"is_plan_mode": true,
|
|
"subagent_enabled": true
|
|
},
|
|
"turns": [
|
|
{
|
|
"input_hash": "9c50eda6ab7e8593dabccbdeadc70a4a7bf778b2c0c3f275f1f96cf2c8ab58db",
|
|
"output": {
|
|
"type": "ai",
|
|
"data": {
|
|
"content": "",
|
|
"additional_kwargs": {},
|
|
"response_metadata": {
|
|
"finish_reason": "tool_calls",
|
|
"model_name": "sre/gpt-5",
|
|
"model_provider": "openai"
|
|
},
|
|
"type": "ai",
|
|
"name": null,
|
|
"id": "lc_run--019ea641-acda-7423-9a9f-79725057bc20",
|
|
"tool_calls": [
|
|
{
|
|
"name": "write_file",
|
|
"args": {
|
|
"description": "Create the requested output file with exact content",
|
|
"path": "/mnt/user-data/outputs/note.txt",
|
|
"content": "hi from replay."
|
|
},
|
|
"id": "call_FV7zhKonjx5CAa1RwIcKihpi",
|
|
"type": "tool_call"
|
|
}
|
|
],
|
|
"invalid_tool_calls": [],
|
|
"usage_metadata": {
|
|
"input_tokens": 3664,
|
|
"output_tokens": 434,
|
|
"total_tokens": 4098,
|
|
"input_token_details": {
|
|
"audio": 0,
|
|
"cache_read": 3584
|
|
},
|
|
"output_token_details": {
|
|
"audio": 0,
|
|
"reasoning": 384
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"input_hash": "3598aeb87e221ca8f554e4d61ce6d5e8801754606fa5c95a89c38bd6cb623045",
|
|
"output": {
|
|
"type": "ai",
|
|
"data": {
|
|
"content": "Direct File Creation and Readback",
|
|
"additional_kwargs": {},
|
|
"response_metadata": {
|
|
"finish_reason": "stop",
|
|
"model_name": "sre/gpt-5",
|
|
"model_provider": "openai"
|
|
},
|
|
"type": "ai",
|
|
"name": null,
|
|
"id": "lc_run--019ea641-cf52-7793-900e-15ad4f032c0e",
|
|
"tool_calls": [],
|
|
"invalid_tool_calls": [],
|
|
"usage_metadata": {
|
|
"input_tokens": 104,
|
|
"output_tokens": 656,
|
|
"total_tokens": 760,
|
|
"input_token_details": {
|
|
"audio": 0,
|
|
"cache_read": 0
|
|
},
|
|
"output_token_details": {
|
|
"audio": 0,
|
|
"reasoning": 640
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"input_hash": "6af134379b2a9efa01b4f63032f88211d5f38f459f8bed621eb6c65e8e05c1f9",
|
|
"output": {
|
|
"type": "ai",
|
|
"data": {
|
|
"content": "",
|
|
"additional_kwargs": {},
|
|
"response_metadata": {
|
|
"finish_reason": "tool_calls",
|
|
"model_name": "sre/gpt-5",
|
|
"model_provider": "openai"
|
|
},
|
|
"type": "ai",
|
|
"name": null,
|
|
"id": "lc_run--019ea641-f523-7d60-a416-b051fba469a2",
|
|
"tool_calls": [
|
|
{
|
|
"name": "read_file",
|
|
"args": {
|
|
"description": "Verify contents to echo back exactly",
|
|
"path": "/mnt/user-data/outputs/note.txt"
|
|
},
|
|
"id": "call_YevFCnLcjWfWHaZm8wwMpEk8",
|
|
"type": "tool_call"
|
|
}
|
|
],
|
|
"invalid_tool_calls": [],
|
|
"usage_metadata": {
|
|
"input_tokens": 3719,
|
|
"output_tokens": 35,
|
|
"total_tokens": 3754,
|
|
"input_token_details": {
|
|
"audio": 0,
|
|
"cache_read": 3584
|
|
},
|
|
"output_token_details": {
|
|
"audio": 0,
|
|
"reasoning": 0
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"input_hash": "04751c4f7b0107b78b5c97d417063883fd586f5ebcbc4acf79be6cb3c0cdaec1",
|
|
"output": {
|
|
"type": "ai",
|
|
"data": {
|
|
"content": "hi from replay.",
|
|
"additional_kwargs": {},
|
|
"response_metadata": {
|
|
"finish_reason": "stop",
|
|
"model_name": "sre/gpt-5",
|
|
"model_provider": "openai"
|
|
},
|
|
"type": "ai",
|
|
"name": null,
|
|
"id": "lc_run--019ea641-ff38-7751-9c2b-cc648811883b",
|
|
"tool_calls": [],
|
|
"invalid_tool_calls": [],
|
|
"usage_metadata": {
|
|
"input_tokens": 3768,
|
|
"output_tokens": 8,
|
|
"total_tokens": 3776,
|
|
"input_token_details": {
|
|
"audio": 0,
|
|
"cache_read": 3584
|
|
},
|
|
"output_token_details": {
|
|
"audio": 0,
|
|
"reasoning": 0
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"input_hash": "8b98ebdbb53e88f000556c4753adede8eaa076ff6fd7b8a1285bfd18aee8144d",
|
|
"output": {
|
|
"type": "ai",
|
|
"data": {
|
|
"content": "[\n \"Can you show the file size and last modified time of /mnt/user-data/outputs/note.txt?\",\n \"List the contents of /mnt/user-data/outputs/ to confirm the file exists.\",\n \"Append 'second line' to /mnt/user-data/outputs/note.txt and print its new contents.\"\n]",
|
|
"additional_kwargs": {
|
|
"refusal": null
|
|
},
|
|
"response_metadata": {
|
|
"token_usage": {
|
|
"completion_tokens": 909,
|
|
"prompt_tokens": 224,
|
|
"total_tokens": 1133,
|
|
"completion_tokens_details": {
|
|
"accepted_prediction_tokens": 0,
|
|
"audio_tokens": 0,
|
|
"reasoning_tokens": 832,
|
|
"rejected_prediction_tokens": 0
|
|
},
|
|
"prompt_tokens_details": {
|
|
"audio_tokens": 0,
|
|
"cached_tokens": 0
|
|
},
|
|
"latency_checkpoint": {
|
|
"engine_tbt_ms": 12,
|
|
"engine_ttft_ms": 324,
|
|
"engine_ttlt_ms": 10965,
|
|
"pre_inference_ms": 153,
|
|
"service_tbt_ms": 12,
|
|
"service_ttft_ms": 849,
|
|
"service_ttlt_ms": 11491,
|
|
"total_duration_ms": 11351,
|
|
"user_visible_ttft_ms": 696
|
|
}
|
|
},
|
|
"model_provider": "openai",
|
|
"model_name": "sre/gpt-5",
|
|
"system_fingerprint": null,
|
|
"id": "chatcmpl-DoPFALdwiyEDYOIN7wFYhqBrr6eTA",
|
|
"service_tier": "default",
|
|
"finish_reason": "stop",
|
|
"logprobs": null
|
|
},
|
|
"type": "ai",
|
|
"name": null,
|
|
"id": "lc_run--019ea642-0eac-78f1-a506-931e343184f1-0",
|
|
"tool_calls": [],
|
|
"invalid_tool_calls": [],
|
|
"usage_metadata": {
|
|
"input_tokens": 224,
|
|
"output_tokens": 909,
|
|
"total_tokens": 1133,
|
|
"input_token_details": {
|
|
"audio": 0,
|
|
"cache_read": 0
|
|
},
|
|
"output_token_details": {
|
|
"audio": 0,
|
|
"reasoning": 832
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
]
|
|
} |