fix: improve JSON repair handling for markdown code blocks (#841)

* fix: improve JSON repair handling for markdown code blocks

* unify import path

* compress_crawl_udf

* fix

* reverse
This commit is contained in:
Xun
2026-01-30 08:47:23 +08:00
committed by GitHub
parent 756421c3ac
commit 3adb4e90cb
4 changed files with 394 additions and 6 deletions
+22
View File
@@ -7,6 +7,7 @@ import re
from typing import Any
import json_repair
import re
logger = logging.getLogger(__name__)
@@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str:
if not content:
return content
# Handle markdown code blocks (```json, ```ts, or ```)
# This must be checked first, as content may start with ``` instead of { or [
if "```" in content:
# Remove opening markdown code block markers (```json, ```ts, or ```), allowing
# optional leading spaces and multiple blank lines after the fence.
content = re.sub(
r'^[ \t]*```(?:json|ts)?[ \t]*\n+',
'',
content,
flags=re.IGNORECASE | re.MULTILINE,
)
# Remove closing markdown code block markers (```), allowing optional
# leading newlines and trailing spaces.
content = re.sub(
r'\n*```[ \t]*$',
'',
content,
flags=re.MULTILINE,
)
content = content.strip()
# First attempt: try to extract valid JSON if there are extra tokens
content = _extract_json_from_content(content)