fix: markdown解析bug
This commit is contained in:
parent
725514af2d
commit
dde679fd6e
|
|
@ -63,6 +63,8 @@ pub struct Position {
|
||||||
pub column: usize,
|
pub column: usize,
|
||||||
/// 字符偏移量(从0开始)
|
/// 字符偏移量(从0开始)
|
||||||
pub offset: usize,
|
pub offset: usize,
|
||||||
|
/// 字节偏移量(从0开始)
|
||||||
|
pub byte_offset: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 范围信息
|
/// 范围信息
|
||||||
|
|
@ -154,13 +156,14 @@ impl MarkdownParser {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 计算文本中的位置信息
|
/// 根据字节偏移计算位置信息
|
||||||
fn calculate_position(&self, text: &str, offset: usize) -> Position {
|
fn calculate_position_from_byte_offset(&self, text: &str, byte_offset: usize) -> Position {
|
||||||
let mut line = 0;
|
let mut line = 0;
|
||||||
let mut column = 0;
|
let mut column = 0;
|
||||||
|
let mut char_offset = 0;
|
||||||
|
|
||||||
for (i, ch) in text.char_indices() {
|
for (byte_idx, ch) in text.char_indices() {
|
||||||
if i >= offset {
|
if byte_idx >= byte_offset {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if ch == '\n' {
|
if ch == '\n' {
|
||||||
|
|
@ -169,15 +172,52 @@ impl MarkdownParser {
|
||||||
} else {
|
} else {
|
||||||
column += 1;
|
column += 1;
|
||||||
}
|
}
|
||||||
|
char_offset += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
Position {
|
Position {
|
||||||
line,
|
line,
|
||||||
column,
|
column,
|
||||||
offset,
|
offset: char_offset,
|
||||||
|
byte_offset,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// 根据字符偏移计算位置信息
|
||||||
|
fn calculate_position_from_char_offset(&self, text: &str, char_offset: usize) -> Position {
|
||||||
|
let mut line = 0;
|
||||||
|
let mut column = 0;
|
||||||
|
let mut current_char_offset = 0;
|
||||||
|
let mut byte_offset = 0;
|
||||||
|
|
||||||
|
for (byte_idx, ch) in text.char_indices() {
|
||||||
|
if current_char_offset >= char_offset {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if ch == '\n' {
|
||||||
|
line += 1;
|
||||||
|
column = 0;
|
||||||
|
} else {
|
||||||
|
column += 1;
|
||||||
|
}
|
||||||
|
current_char_offset += 1;
|
||||||
|
byte_offset = byte_idx + ch.len_utf8();
|
||||||
|
}
|
||||||
|
|
||||||
|
Position {
|
||||||
|
line,
|
||||||
|
column,
|
||||||
|
offset: char_offset,
|
||||||
|
byte_offset,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// 计算文本中的位置信息(保持向后兼容)
|
||||||
|
fn calculate_position(&self, text: &str, offset: usize) -> Position {
|
||||||
|
// 为了向后兼容,假设传入的是字节偏移
|
||||||
|
self.calculate_position_from_byte_offset(text, offset)
|
||||||
|
}
|
||||||
|
|
||||||
/// 解析Markdown文本
|
/// 解析Markdown文本
|
||||||
pub fn parse(&mut self, text: &str) -> Result<MarkdownParseResult> {
|
pub fn parse(&mut self, text: &str) -> Result<MarkdownParseResult> {
|
||||||
let start_time = std::time::Instant::now();
|
let start_time = std::time::Instant::now();
|
||||||
|
|
@ -222,8 +262,8 @@ impl MarkdownParser {
|
||||||
node_type: MarkdownNodeType::Document,
|
node_type: MarkdownNodeType::Document,
|
||||||
content: source_text.to_string(),
|
content: source_text.to_string(),
|
||||||
range: Range {
|
range: Range {
|
||||||
start: Position { line: 0, column: 0, offset: 0 },
|
start: self.calculate_position_from_byte_offset(source_text, 0),
|
||||||
end: self.calculate_position(source_text, source_text.len()),
|
end: self.calculate_position_from_byte_offset(source_text, source_text.len()),
|
||||||
},
|
},
|
||||||
children: Vec::new(),
|
children: Vec::new(),
|
||||||
attributes: HashMap::new(),
|
attributes: HashMap::new(),
|
||||||
|
|
|
||||||
|
|
@ -99,13 +99,13 @@ export const EnhancedMarkdownRenderer: React.FC<EnhancedMarkdownRendererProps> =
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算节点在原始文本中的字符偏移位置
|
// 计算节点在原始文本中的字节偏移位置(与grounding数据的字节偏移匹配)
|
||||||
const nodeStartOffset = node.range?.start?.offset || 0;
|
const nodeStartOffset = node.range?.start?.byte_offset || 0;
|
||||||
const nodeEndOffset = node.range?.end?.offset || 0;
|
const nodeEndOffset = node.range?.end?.byte_offset || 0;
|
||||||
|
|
||||||
// 查找与当前节点位置重叠的grounding支持信息
|
// 查找与当前节点位置重叠的grounding支持信息
|
||||||
|
|
||||||
const relatedSupports = groundingMetadata.grounding_supports.filter(support => {
|
const relatedSupports = groundingMetadata.grounding_supports.filter(support => {
|
||||||
// 这里是字符串二进制的offset
|
// grounding数据使用字节偏移
|
||||||
const segmentStart = support.segment.startIndex;
|
const segmentStart = support.segment.startIndex;
|
||||||
const segmentEnd = support.segment.endIndex;
|
const segmentEnd = support.segment.endIndex;
|
||||||
|
|
||||||
|
|
@ -113,7 +113,11 @@ export const EnhancedMarkdownRenderer: React.FC<EnhancedMarkdownRendererProps> =
|
||||||
// 检查节点范围与grounding片段是否有重叠
|
// 检查节点范围与grounding片段是否有重叠
|
||||||
return hasOverlap;
|
return hasOverlap;
|
||||||
});
|
});
|
||||||
|
console.log({
|
||||||
|
relatedSupports,
|
||||||
|
nodeStartOffset,
|
||||||
|
nodeEndOffset
|
||||||
|
})
|
||||||
if (relatedSupports.length > 0) {
|
if (relatedSupports.length > 0) {
|
||||||
// 获取相关的来源信息
|
// 获取相关的来源信息
|
||||||
const relatedSources = relatedSupports.flatMap(support =>
|
const relatedSources = relatedSupports.flatMap(support =>
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,8 @@ export interface Position {
|
||||||
column: number;
|
column: number;
|
||||||
/** 字符偏移量(从0开始) */
|
/** 字符偏移量(从0开始) */
|
||||||
offset: number;
|
offset: number;
|
||||||
|
/** 字节偏移量(从0开始) */
|
||||||
|
byte_offset: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue