fix: 修复TolerantJsonParser嵌套JSON解析问题
问题描述:
- TolerantJsonParser 在处理复杂嵌套 JSON 时只能提取部分内容
- 原因是正则表达式模式无法正确匹配嵌套的 JSON 对象
- 导致 Gemini API 响应解析不完整,输出长度异常缩短

修复内容:
- 改进 JSON 对象和数组的正则表达式模式
- 添加手动括号匹配算法作为主要提取方法
- 实现字符串内容的正确处理,避免字符串中的括号干扰
- 优化 JSON 内容提取逻辑,优先使用最可靠的方法
- 添加全面的测试用例验证修复效果

测试结果:
- 复杂嵌套 JSON 解析测试通过
- 括号匹配算法测试通过
- 字符串处理测试通过
This commit is contained in:
parent
906bea49f0
commit
a8c8f2a085
|
|
@ -86,31 +86,32 @@ impl TolerantJsonParser {
|
|||
/// 初始化正则表达式模式
|
||||
fn init_regex_patterns() -> Result<HashMap<String, Regex>> {
|
||||
let mut patterns = HashMap::new();
|
||||
|
||||
// JSON对象模式
|
||||
|
||||
// JSON对象模式 - 使用简单但可靠的方法
|
||||
// Rust的regex crate不支持递归正则,所以使用贪婪匹配
|
||||
patterns.insert(
|
||||
"object".to_string(),
|
||||
Regex::new(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}")?,
|
||||
Regex::new(r"(?s)\{.*\}")?,
|
||||
);
|
||||
|
||||
// JSON数组模式
|
||||
|
||||
// JSON数组模式 - 使用简单但可靠的方法
|
||||
patterns.insert(
|
||||
"array".to_string(),
|
||||
Regex::new(r"\[[^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*\]")?,
|
||||
Regex::new(r"(?s)\[.*\]")?,
|
||||
);
|
||||
|
||||
|
||||
// Markdown代码块模式
|
||||
patterns.insert(
|
||||
"markdown_fence".to_string(),
|
||||
Regex::new(r"(?s)```(?:json)?\s*\n?(.*?)\n?```")?,
|
||||
);
|
||||
|
||||
|
||||
// 无引号键模式
|
||||
patterns.insert(
|
||||
"unquoted_key".to_string(),
|
||||
Regex::new(r"(\w+):")?,
|
||||
);
|
||||
|
||||
|
||||
// 尾随逗号模式
|
||||
patterns.insert(
|
||||
"trailing_comma".to_string(),
|
||||
|
|
@ -170,16 +171,8 @@ impl TolerantJsonParser {
|
|||
}
|
||||
}
|
||||
|
||||
// 查找JSON模式
|
||||
for pattern_name in ["object", "array"] {
|
||||
if let Some(pattern) = self.regex_patterns.get(pattern_name) {
|
||||
if let Some(mat) = pattern.find(&processed) {
|
||||
processed = mat.as_str().to_string();
|
||||
debug!("Extracted JSON using {} pattern", pattern_name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// 使用改进的JSON提取方法
|
||||
processed = self.extract_json_content(&processed);
|
||||
|
||||
// 在Tree-sitter解析之前修复常见的JSON错误
|
||||
processed = self.fix_common_json_errors(&processed);
|
||||
|
|
@ -187,6 +180,74 @@ impl TolerantJsonParser {
|
|||
Ok(processed)
|
||||
}
|
||||
|
||||
/// 改进的JSON内容提取方法
|
||||
fn extract_json_content(&self, text: &str) -> String {
|
||||
// 首先尝试使用手动括号匹配(最可靠的方法)
|
||||
if let Some(json_content) = self.extract_json_by_bracket_matching(text) {
|
||||
debug!("Extracted JSON using bracket matching");
|
||||
return json_content;
|
||||
}
|
||||
|
||||
// 如果括号匹配失败,尝试使用正则表达式
|
||||
for pattern_name in ["object", "array"] {
|
||||
if let Some(pattern) = self.regex_patterns.get(pattern_name) {
|
||||
if let Some(mat) = pattern.find(text) {
|
||||
debug!("Extracted JSON using {} pattern", pattern_name);
|
||||
return mat.as_str().to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 如果所有方法都失败,返回原始文本
|
||||
text.to_string()
|
||||
}
|
||||
|
||||
/// 使用括号匹配提取JSON内容
|
||||
fn extract_json_by_bracket_matching(&self, text: &str) -> Option<String> {
|
||||
// 查找第一个 { 或 [
|
||||
let start_char = if text.contains('{') && text.contains('[') {
|
||||
let brace_pos = text.find('{').unwrap_or(usize::MAX);
|
||||
let bracket_pos = text.find('[').unwrap_or(usize::MAX);
|
||||
if brace_pos < bracket_pos { '{' } else { '[' }
|
||||
} else if text.contains('{') {
|
||||
'{'
|
||||
} else if text.contains('[') {
|
||||
'['
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
|
||||
let end_char = if start_char == '{' { '}' } else { ']' };
|
||||
|
||||
if let Some(start_pos) = text.find(start_char) {
|
||||
let mut depth = 0;
|
||||
let mut in_string = false;
|
||||
let mut escape_next = false;
|
||||
|
||||
for (i, ch) in text[start_pos..].char_indices() {
|
||||
if escape_next {
|
||||
escape_next = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
'\\' if in_string => escape_next = true,
|
||||
'"' => in_string = !in_string,
|
||||
c if c == start_char && !in_string => depth += 1,
|
||||
c if c == end_char && !in_string => {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
return Some(text[start_pos..start_pos + i + 1].to_string());
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// 修复常见的JSON错误
|
||||
fn fix_common_json_errors(&self, text: &str) -> String {
|
||||
let mut fixed = text.to_string();
|
||||
|
|
@ -822,4 +883,95 @@ mod tests {
|
|||
println!("Recovery strategies used: {:?}", stats.recovery_strategies_used);
|
||||
assert_eq!(parsed, json!({"name": "test"}));
|
||||
}
|
||||
|
||||
// Checks that a Markdown-fenced, deeply nested JSON payload (nested objects plus
// an array of objects) is parsed into a complete top-level object.
#[test]
|
||||
fn test_complex_nested_json_extraction() {
|
||||
let mut parser = create_test_parser();
|
||||
|
||||
// Complex nested JSON inside a code fence (modelled on the structure of a Gemini response)
|
||||
let complex_json = r#"```json
|
||||
{
|
||||
"environment_tags": ["Indoor", "Retail environment"],
|
||||
"environment_color_pattern": {
|
||||
"hue": 0.08,
|
||||
"saturation": 0.05,
|
||||
"value": 0.6
|
||||
},
|
||||
"dress_color_pattern": {
|
||||
"hue": 0.58,
|
||||
"saturation": 0.28,
|
||||
"value": 0.85
|
||||
},
|
||||
"style_description": "整体呈现休闲舒适的风格",
|
||||
"products": [
|
||||
{
|
||||
"category": "下装",
|
||||
"color_pattern": {
|
||||
"hue": 0.6,
|
||||
"saturation": 0.35,
|
||||
"value": 0.85
|
||||
},
|
||||
"design_styles": ["休闲", "复古", "街头"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```"#;
|
||||
|
||||
let result = parser.parse(complex_json);
|
||||
assert!(result.is_ok(), "Complex nested JSON parsing failed");
|
||||
|
||||
let (parsed, stats) = result.unwrap();
|
||||
assert!(parsed.is_object());
|
||||
|
||||
let obj = parsed.as_object().unwrap();
|
||||
|
||||
// Every top-level field must have survived extraction
|
||||
|
||||
assert!(obj.contains_key("environment_tags"));
|
||||
assert!(obj.contains_key("environment_color_pattern"));
|
||||
assert!(obj.contains_key("dress_color_pattern"));
|
||||
assert!(obj.contains_key("style_description"));
|
||||
assert!(obj.contains_key("products"));
|
||||
|
||||
// The nested object must still be an object
|
||||
|
||||
let env_color = obj.get("environment_color_pattern").unwrap();
|
||||
assert!(env_color.is_object());
|
||||
|
||||
// The array must be intact and hold its single product entry
|
||||
|
||||
let products = obj.get("products").unwrap();
|
||||
assert!(products.is_array());
|
||||
let products_array = products.as_array().unwrap();
|
||||
assert_eq!(products_array.len(), 1);
|
||||
|
||||
println!("✅ 复杂嵌套JSON解析测试通过");
|
||||
println!("📊 解析统计: 总节点数={}, 错误节点数={}, 错误率={:.2}%, 解析时间={}ms",
|
||||
stats.total_nodes, stats.error_nodes, stats.error_rate, stats.parse_time_ms);
|
||||
}
|
||||
|
||||
// Checks that `extract_json_by_bracket_matching` isolates a nested JSON object
// embedded between surrounding prose, and that the result is valid JSON.
#[test]
|
||||
fn test_bracket_matching_method() {
|
||||
let parser = create_test_parser();
|
||||
|
||||
// Input for the bracket matcher: a nested object with plain text before and after it
|
||||
let text_with_nested_braces = r#"Some text before {
|
||||
"outer": {
|
||||
"inner": {
|
||||
"deep": "value"
|
||||
}
|
||||
},
|
||||
"array": [1, 2, {"nested": true}]
|
||||
} some text after"#;
|
||||
|
||||
let result = parser.extract_json_by_bracket_matching(text_with_nested_braces);
|
||||
assert!(result.is_some());
|
||||
|
||||
let json_str = result.unwrap();
|
||||
// The extracted span must parse with serde_json as-is
|
||||
let parsed: Value = serde_json::from_str(&json_str).unwrap();
|
||||
assert!(parsed.is_object());
|
||||
|
||||
let obj = parsed.as_object().unwrap();
|
||||
assert!(obj.contains_key("outer"));
|
||||
assert!(obj.contains_key("array"));
|
||||
|
||||
println!("✅ 括号匹配测试通过");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue